feat: OCR independent chunks (方案 A) + stats API

Rule 1 now creates OCR chunks separately from ASRX segments:
- Phase 1: ASRX segments (pure speech, NO OCR merge)
- Phase 2: OCR-only chunks (all OCR frames grouped by proximity)

Added OCR statistics to the ingestion status API:
- rule1_ocr: shows the OCR pre_chunks count
- rule1_ocr_chunks: shows the OCR-only chunks count

Example: FilmRiot_test now has 32 ASRX + 3 OCR-only = 35 chunks.
Stats: rule1_sentence: 35, rule1_ocr: 30, rule1_ocr_chunks: 3
2026-07-05 23:28:09 +08:00
parent 5a3f791ecd
commit e91d51cc5e
2 changed files with 64 additions and 57 deletions
@@ -508,6 +508,17 @@ async fn get_ingestion_status(
        "SELECT COUNT(*) FROM {chunk} WHERE file_uuid = '{file_uuid}' AND chunk_type = 'sentence'"
    ));
    let sentence_embedded = count_sql!(&format!("SELECT COUNT(*) FROM {chunk} WHERE file_uuid = '{file_uuid}' AND chunk_type = 'sentence' AND embedding IS NOT NULL"));
+
+    // OCR statistics
+    let pre_chunks_table = schema::table_name("pre_chunks");
+    let ocr_pre_chunks: i64 = count_sql!(&format!(
+        "SELECT COUNT(*) FROM {pre_chunks_table} WHERE file_uuid = '{file_uuid}' AND processor_type = 'ocr'"
+    ));
+    let ocr_only_chunks: i64 = count_sql!(&format!(
+        "SELECT COUNT(*) FROM {chunk} WHERE file_uuid = '{file_uuid}' AND chunk_type = 'sentence' \
+         AND (content->>'text' = '' OR content->>'text' IS NULL) \
+         AND content->>'ocr_text' IS NOT NULL AND content->>'ocr_text' != ''"
+    ));
    let scene_count = count_sql!(&format!(
        "SELECT COUNT(*) FROM {chunk} WHERE file_uuid = '{file_uuid}' AND chunk_type = 'cut'"
    ));
@@ -595,6 +606,16 @@ async fn get_ingestion_status(
            sentence_count > 0,
            Some(format!("{sentence_count} sentence chunks"))
        ),
+        step!(
+            "rule1_ocr",
+            ocr_pre_chunks > 0,
+            Some(format!("{ocr_pre_chunks} OCR frames"))
+        ),
+        step!(
+            "rule1_ocr_chunks",
+            ocr_only_chunks > 0,
+            Some(format!("{ocr_only_chunks} OCR-only chunks"))
+        ),
        step!(
            "auto_vectorize",
            sentence_embedded > 0,
@@ -30,27 +30,10 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result
    let mut count = 0;
    let mut tx = pool.begin().await?;

-    // Track which OCR frames were merged into ASRX segments
-    let mut merged_ocr_frames = std::collections::HashSet::new();
-
-    // Phase 1: Process ASRX segments (merge OCR where overlapping)
+    // Phase 1: ASRX segments (pure speech, NO OCR merge)
    for seg in asr_segments.iter() {
-        let ocr_text = collect_ocr_text(seg.start_frame, seg.end_frame, &ocr_map);
-        let combined_text = if ocr_text.is_empty() {
-            seg.text.clone()
-        } else {
-            format!("{} {}", seg.text, ocr_text)
-        };
-
-        // Track merged OCR frames
-        if !ocr_text.is_empty() {
-            for frame in seg.start_frame..=seg.end_frame {
-                merged_ocr_frames.insert(frame);
-            }
-        }
-
-        // Skip chunks with no text (empty ASRX and no OCR)
-        if combined_text.trim().is_empty() {
+        // Skip chunks with no text
+        if seg.text.trim().is_empty() {
            continue;
        }

@@ -60,7 +43,7 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result

        let content = serde_json::json!({
            "text": seg.text,
-            "ocr_text": ocr_text,
+            "ocr_text": "",
        });

        let chunk = Chunk::from_seconds(
@@ -75,7 +58,7 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result
            content,
        )
        .with_metadata(metadata)
-        .with_text_content(combined_text);
+        .with_text_content(seg.text.clone());

        db.store_chunk_in_tx(&chunk, &mut tx).await?;

@@ -83,28 +66,31 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result

        if count % 100 == 0 {
            info!(
-                "Rule 1: Processed {} segments for video {}",
+                "Rule 1: Processed {} ASRX segments for video {}",
                count, file_uuid
            );
        }
    }

-    // Phase 2: Create chunks for OCR-only text (not overlapping with ASRX)
-    let ocr_only_chunks = collect_ocr_only_chunks(&ocr_map, &merged_ocr_frames, fps);
-    let ocr_only_count = ocr_only_chunks.len();
-    for (frame, ocr_text) in ocr_only_chunks {
+    let asrx_count = count;
+
+    // Phase 2: OCR-only chunks (all OCR frames grouped by proximity)
+    let ocr_chunks = group_ocr_frames(&ocr_map, fps);
+    for (start_frame, end_frame, ocr_text) in ocr_chunks {
        if ocr_text.trim().is_empty() {
            continue;
        }

-        let time = frame as f64 / fps;
+        let start_time = start_frame as f64 / fps;
+        let end_time = (end_frame + 1) as f64 / fps;
+
        let metadata = serde_json::json!({
            "language": "ocr",
        });

        let content = serde_json::json!({
            "text": "",
-            "ocr_text": ocr_text,
+            "ocr_text": ocr_text.clone(),
        });

        let chunk = Chunk::from_seconds(
@@ -113,8 +99,8 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result
            format!("{}", count),
            ChunkType::Sentence,
            ChunkRule::Rule1,
-            time,
-            time + (1.0 / fps),
+            start_time,
+            end_time,
            fps,
            content,
        )
@@ -126,14 +112,13 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result
        count += 1;
    }

+    let ocr_only_count = count - asrx_count;
+
    tx.commit().await?;

    info!(
-        "Rule 1 completed: {} sentence chunks created for video {} ({} ASRX + {} OCR-only)",
-        count,
-        file_uuid,
-        count - ocr_only_count,
-        ocr_only_count
+        "Rule 1 completed: {} sentence chunks for video {} ({} ASRX + {} OCR-only)",
+        count, file_uuid, asrx_count, ocr_only_count
    );

    Ok(count)
@@ -303,33 +288,33 @@ fn collect_ocr_text(
    parts.join(" ")
 }

-/// Collect OCR text that doesn't overlap with any ASRX segment
-/// Returns vec of (frame, combined_ocr_text) for OCR-only chunks
-fn collect_ocr_only_chunks(
+/// Group ALL OCR frames by proximity into chunks
+/// Returns vec of (start_frame, end_frame, combined_ocr_text)
+fn group_ocr_frames(
    ocr_map: &BTreeMap<i64, Vec<String>>,
-    merged_ocr_frames: &std::collections::HashSet<i64>,
    _fps: f64,
-) -> Vec<(i64, String)> {
+) -> Vec<(i64, i64, String)> {
+    const MAX_FRAME_GAP: i64 = 5; // ~0.2s at 24fps
+
    let mut result = Vec::new();
-    let mut current_frame: Option<i64> = None;
+    let mut start_frame: Option<i64> = None;
+    let mut end_frame: Option<i64> = None;
    let mut current_texts: Vec<String> = Vec::new();

    for (frame, texts) in ocr_map.iter() {
-        // Skip frames already merged into ASRX segments
-        if merged_ocr_frames.contains(frame) {
-            continue;
-        }
-
-        // Start a new group or continue existing group
-        if current_frame.is_none() {
-            current_frame = Some(*frame);
+        if start_frame.is_none() {
+            // Start first group
+            start_frame = Some(*frame);
+            end_frame = Some(*frame);
            current_texts = texts.clone();
        } else {
-            // Group consecutive OCR frames (within 5 frames = ~0.2s at 24fps)
-            if *frame - current_frame.unwrap() <= 5 {
+            let gap = *frame - end_frame.unwrap();
+            if gap <= MAX_FRAME_GAP {
+                // Continue current group
+                end_frame = Some(*frame);
                current_texts.extend(texts.clone());
            } else {
-                // Save previous group and start new one
+                // Save current group and start new one
                if !current_texts.is_empty() {
                    let mut seen = std::collections::HashSet::new();
                    let unique: Vec<String> = current_texts
@@ -337,16 +322,17 @@ fn collect_ocr_only_chunks(
                        .filter(|t| seen.insert((*t).clone()))
                        .cloned()
                        .collect();
-                    result.push((current_frame.unwrap(), unique.join(" ")));
+                    result.push((start_frame.unwrap(), end_frame.unwrap(), unique.join(" ")));
                }
-                current_frame = Some(*frame);
+                start_frame = Some(*frame);
+                end_frame = Some(*frame);
                current_texts = texts.clone();
            }
        }
    }

    // Don't forget the last group
-    if let Some(frame) = current_frame {
+    if let (Some(start), Some(end)) = (start_frame, end_frame) {
        if !current_texts.is_empty() {
            let mut seen = std::collections::HashSet::new();
            let unique: Vec<String> = current_texts
@@ -354,7 +340,7 @@ fn collect_ocr_only_chunks(
                .filter(|t| seen.insert((*t).clone()))
                .cloned()
                .collect();
-            result.push((frame, unique.join(" ")));
+            result.push((start, end, unique.join(" ")));
        }
    }