diff --git a/docs_v1.0/DESIGN/Audio_Scene_Detection_POC.md b/docs_v1.0/DESIGN/Audio_Scene_Detection_POC.md new file mode 100644 index 0000000..5a3b92d --- /dev/null +++ b/docs_v1.0/DESIGN/Audio_Scene_Detection_POC.md @@ -0,0 +1,162 @@ +--- +title: Audio Scene & Instrument Detection POC Plan +version: 0.1 +date: 2026-07-02 +author: OpenCode +status: planned +--- + +| scope | status | applicable to | +|-------|--------|---------------| +| Audio processing pipeline | planned | Video files with non-speech audio | + +## Goal + +Detect non-speech audio events (instruments, music, environmental sounds) in video files alongside existing ASRX speech recognition. + +## Why + +Current pipeline only detects speech (ASRX → 64 segments + 1554 speaker embeddings). Instrument sounds, background music, and environmental audio are completely ignored. + +## Technical Options + +### Option A: PANNs (Pre-trained Audio Neural Networks) +- **Model**: Cnn14 (~81M params, ~313MB weights) +- **Classes**: 527 AudioSet classes (piano, guitar, drums, speech, etc.) 
+- **Pros**: Production-ready, accurate, PyTorch-based +- **Cons**: Large download, ~200MB RAM per inference +- **Install**: `pip install panns-inference` + +### Option B: YAMNet (Google) +- **Model**: MobileNet-based, 4MB weights +- **Classes**: 521 AudioSet classes +- **Pros**: Lightweight, fast +- **Cons**: Requires TensorFlow (not currently installed) +- **Install**: `pip install tensorflow tensorflow-hub` (YAMNet is loaded from TensorFlow Hub rather than installed as its own pip package) + +### Option C: torchaudio + heuristics (lightweight fallback) +- Use existing PyTorch + torchaudio +- Extract spectral features (MFCC, centroid, energy) +- Simple classification: speech vs music vs silence +- **Pros**: No extra dependencies +- **Cons**: Less accurate, limited classes + +## Recommended: Option A (PANNs) + +## Pipeline Integration + +``` +Video → Audio Extract → ASRX (speech) → Speaker Embeddings (3.4/s) + → Audio Scene (new) → Scene Labels (1/s) +``` + +### New Processor: `audio_scene` + +| Field | Value | +|-------|-------| +| Processor type | `audio_scene` | +| Input | Video file (audio track) | +| Output | `file_uuid.audio_scene.json` | +| Sampling | 1-second segments | +| Qdrant collection | `momentry_{schema}_audio_scene` | + +### Output Format + +```json +{ + "file_uuid": "...", + "segments": [ + { + "start_time": 0.0, + "end_time": 1.0, + "primary_class": "speech", + "confidence": 0.95, + "top_classes": [ + {"class": "speech", "score": 0.95}, + {"class": "music", "score": 0.03}, + {"class": "piano", "score": 0.01} + ] + } + ], + "summary": { + "speech_ratio": 0.72, + "music_ratio": 0.15, + "silence_ratio": 0.08, + "instrument_ratio": 0.05, + "instruments_detected": ["piano", "guitar"] + } +} +``` + +### Qdrant Storage + +| Field | Type | Purpose | +|-------|------|---------| +| `file_uuid` | string | Filter by file | +| `start_time` | float | Segment start | +| `end_time` | float | Segment end | +| `primary_class` | keyword | Filter by class | +| `confidence` | float | Filter by confidence | +| `instrument_name` | keyword | Search by 
instrument | +| `vector` | f32[2048] | Audio embedding for similarity search | + +### Processor Dependencies + +``` +audio_scene → (no dependencies, runs parallel with ASRX) +``` + +## Key AudioSet Instrument Classes + +| Category | Classes | +|----------|---------| +| Piano | Piano, Electric piano, Keyboard | +| Guitar | Guitar, Electric guitar, Acoustic guitar | +| Drums | Drum kit, Snare drum, Cymbal, Hi-hat | +| Strings | Violin, Cello, Harp, Double bass | +| Wind | Flute, Saxophone, Trumpet, Clarinet | +| Voice | Speech, Singing, Chant, Choir | +| Other | Music, Percussion, Organ, Synthesizer | + +## POC Steps + +1. **Install panns-inference** + ```bash + pip install panns-inference + ``` + +2. **Create `scripts/audio_scene_processor.py`** + - Load audio via ffmpeg → numpy array + - Process 1-second segments through Cnn14 + - Save results to JSON + Qdrant + +3. **Add processor type to pipeline** + - Add `AudioScene` to `ProcessorType` enum + - Add to worker's processor dispatch + - Add `AUDIO_SCENE_TIMEOUT` config + +4. **Test with existing video** + - Run on KOBA interview video + - Verify instrument detection accuracy + - Check performance (time, memory) + +5. 
**Integrate with search** + - Add audio_scene to universal_search + - Add filter by audio class (speech/music/instrument) + +## Estimated Effort + +| Step | Time | +|------|------| +| Install + prototype script | 2-3 hours | +| Pipeline integration | 1-2 hours | +| Qdrant + search integration | 1 hour | +| Testing + tuning | 1-2 hours | +| **Total** | **5-8 hours** | + +## Future Enhancements + +- Real-time audio classification during processing +- Audio event timeline visualization +- Combine with TKG for audio-visual relationships +- Background music detection for copyright checks diff --git a/src/api/scan.rs b/src/api/scan.rs index a852c83..ec67d0a 100644 --- a/src/api/scan.rs +++ b/src/api/scan.rs @@ -836,24 +836,31 @@ async fn get_file_stats( let tkg_nodes_table = schema::table_name("tkg_nodes"); let tkg_edges_table = schema::table_name("tkg_edges"); + let tkg_nodes_total: i64 = sqlx::query_scalar::<_, i64>(&format!("SELECT COUNT(*) FROM {} WHERE file_uuid = $1", tkg_nodes_table)) + .bind(&file_uuid).fetch_one(pool).await.unwrap_or(0); + let tkg_edges_total: i64 = sqlx::query_scalar::<_, i64>(&format!("SELECT COUNT(*) FROM {} WHERE file_uuid = $1", tkg_edges_table)) + .bind(&file_uuid).fetch_one(pool).await.unwrap_or(0); + let tkg = TkgFileStats { - face_track_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "face_track").await, - gaze_track_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "gaze_track").await, - lip_track_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "lip_track").await, - text_region_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "text_region").await, - appearance_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "appearance_trace").await, - accessory_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "accessory").await, - object_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "yolo_object").await, - hand_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "hand").await, - 
speaker_nodes: count_by_type(pool, &tkg_nodes_table, &file_uuid, "speaker").await, - co_occurrence_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "CO_OCCURS_WITH").await, - speaker_face_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "SPEAKS_AS").await, - face_face_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "FACE_TO_FACE").await, - mutual_gaze_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "MUTUAL_GAZE").await, - lip_sync_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "LIP_SYNC").await, - has_appearance_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "HAS_APPEARANCE").await, - wears_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "WEARS").await, - hand_object_edges: count_by_type(pool, &tkg_edges_table, &file_uuid, "HAND_OBJECT").await, + total_nodes: tkg_nodes_total, + total_edges: tkg_edges_total, + face_track_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "face_track").await, + gaze_track_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "gaze_track").await, + lip_track_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "lip_track").await, + text_region_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "text_trace").await, + appearance_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "appearance_trace").await, + accessory_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "accessory").await, + object_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "yolo_object").await, + hand_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "hand").await, + speaker_nodes: count_nodes(pool, &tkg_nodes_table, &file_uuid, "speaker").await, + co_occurrence_edges: count_edges(pool, &tkg_edges_table, &file_uuid, "CO_OCCURS_WITH").await, + speaker_face_edges: count_edges(pool, &tkg_edges_table, &file_uuid, "SPEAKS_AS").await, + face_face_edges: count_edges(pool, &tkg_edges_table, &file_uuid, "FACE_TO_FACE").await, + mutual_gaze_edges: count_edges(pool, &tkg_edges_table, 
&file_uuid, "MUTUAL_GAZE").await, + lip_sync_edges: count_edges(pool, &tkg_edges_table, &file_uuid, "LIP_SYNC").await, + has_appearance_edges: count_edges(pool, &tkg_edges_table, &file_uuid, "HAS_APPEARANCE").await, + wears_edges: count_edges(pool, &tkg_edges_table, &file_uuid, "WEARS").await, + hand_object_edges: count_edges(pool, &tkg_edges_table, &file_uuid, "HAND_OBJECT").await, ..Default::default() }; @@ -890,13 +897,25 @@ async fn get_file_stats( })) } -async fn count_by_type(pool: &sqlx::PgPool, table: &str, file_uuid: &str, type_val: &str) -> i64 { +async fn count_nodes(pool: &sqlx::PgPool, table: &str, file_uuid: &str, node_type: &str) -> i64 { sqlx::query_scalar::<_, i64>(&format!( - "SELECT COUNT(*) FROM {} WHERE file_uuid = $1 AND (node_type = $2 OR edge_type = $2)", + "SELECT COUNT(*) FROM {} WHERE file_uuid = $1 AND node_type = $2", table )) .bind(file_uuid) - .bind(type_val) + .bind(node_type) + .fetch_one(pool) + .await + .unwrap_or(0) +} + +async fn count_edges(pool: &sqlx::PgPool, table: &str, file_uuid: &str, edge_type: &str) -> i64 { + sqlx::query_scalar::<_, i64>(&format!( + "SELECT COUNT(*) FROM {} WHERE file_uuid = $1 AND edge_type = $2", + table + )) + .bind(file_uuid) + .bind(edge_type) .fetch_one(pool) .await .unwrap_or(0) diff --git a/src/core/chunk/rule1_ingest.rs b/src/core/chunk/rule1_ingest.rs index 60995d1..a212395 100644 --- a/src/core/chunk/rule1_ingest.rs +++ b/src/core/chunk/rule1_ingest.rs @@ -13,7 +13,7 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result let pool = db.pool(); let pre_chunks_table = schema::table_name("pre_chunks"); - let asr_segments = fetch_asr_segments(pool, file_uuid, &pre_chunks_table).await?; + let asr_segments = fetch_asr_segments(pool, file_uuid, &pre_chunks_table, fps).await?; let ocr_map = fetch_ocr_texts(pool, file_uuid, &pre_chunks_table).await?; let video = db @@ -97,6 +97,7 @@ async fn fetch_asr_segments( pool: &PgPool, file_uuid: &str, table: &str, + fps: f64, ) -> 
Result> { let query = format!( r#" @@ -114,8 +115,6 @@ async fn fetch_asr_segments( let segments: Vec = rows .iter() .map(|row| { - let start_frame: i64 = row.try_get("start_frame").unwrap_or(0); - let end_frame: i64 = row.try_get("end_frame").unwrap_or(0); let start_time: f64 = row.try_get("start_time").unwrap_or(0.0); let end_time_raw: Option = row.try_get("end_time").ok(); let data: Value = row.try_get("data").unwrap_or(Value::Null); @@ -124,6 +123,13 @@ async fn fetch_asr_segments( .or_else(|| data.get("end_time").and_then(|v| v.as_f64())) .unwrap_or(0.0); + let start_frame = (start_time * fps) as i64; + let end_frame = if end_time > 0.0 { + (end_time * fps) as i64 + } else { + start_frame + }; + if end_time <= 0.0 { warn!( "ASR segment end_time is 0.0 for file {} (frame {}..{})",