fix: ASRX duplication, TKG edges, trace ingest, and add pipeline progress publishing

- ASRX handler no longer stores duplicate 'asr' pre_chunks
- Pre_chunks storage made idempotent (delete-before-insert)
- Rule 1 + trace_ingest changed to query 'asrx' not 'asr'
- Trace chunks removed (dynamic from TKG/Qdrant)
- TKG scroll_face_points fixed: trace_id >= 1 (not == 1)
- TKG AsrxSegmentEntry: start/end -> start_time/end_time (match ASRX JSON)
- Unregister error handling: log instead of silent discard
- Add publish_pipeline_progress calls at each pipeline stage (processors, rule1, face_trace, identity_agent, TKG, rule2, completion)
2026-07-02 10:43:46 +08:00
parent d791d138f2
commit 3eabd45882
65 changed files with 9477 additions and 3852 deletions
@@ -126,9 +126,17 @@ def _convert_result(result, output_path):
        except Exception:
            pass

+    segment_count = len(result.get("segments", []))
+    if segment_count > 0:
+        status = "has_transcript"
+    else:
+        status = "silent_audio"
+
    output_result = {
+        "status": status,
        "language": result.get("language"),
        "segments": [],
+        "segment_count": segment_count,
        "n_speakers": result.get("n_speakers", 0),
        "speaker_stats": result.get("speaker_stats", {}),
    }
@@ -172,6 +180,37 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
    if publisher:
        publisher.info("asrx", "ASRX_START")

+    # Check for audio stream first
+    tracks = probe_audio_tracks(video_path)
+    if not tracks:
+        if publisher:
+            publisher.info("asrx", "No audio stream detected")
+        output_result = {"status": "no_audio_track", "language": None, "segments": [], "segment_count": 0}
+        _atomic_write(output_path, output_result)
+        if publisher:
+            publisher.complete("asrx", "0 segments (no audio)")
+        print("[ASRX] No audio stream, skipping", file=sys.stderr)
+        return output_result
+
+    # Check if ASR already determined no audio/silent - skip processing
+    asr_path = output_path.replace(".asrx.json", ".asr.json")
+    if os.path.exists(asr_path):
+        try:
+            with open(asr_path) as f:
+                asr_data = json.load(f)
+            asr_status = asr_data.get("status", "")
+            if asr_status in ("no_audio_track", "silent_audio"):
+                if publisher:
+                    publisher.info("asrx", f"ASR status={asr_status}, skipping ASRX processing")
+                output_result = {"status": asr_status, "language": asr_data.get("language"), "segments": [], "segment_count": 0}
+                _atomic_write(output_path, output_result)
+                if publisher:
+                    publisher.complete("asrx", f"0 segments (ASR: {asr_status})")
+                print(f"[ASRX] ASR status={asr_status}, skipping", file=sys.stderr)
+                return output_result
+        except Exception as e:
+            print(f"[ASRX] Failed to read ASR output: {e}", file=sys.stderr)
+
    checkpoint_path = output_path + ".stage1.json"

    # ── Phase 2: Resume from checkpoint (Steps 4-7 only) ──
@@ -189,7 +228,7 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
            if "error" in result:
                if publisher:
                    publisher.error("asrx", result["error"])
-                output_result = {"language": None, "segments": []}
+                output_result = {"status": "silent_audio", "language": None, "segments": [], "segment_count": 0}
                _atomic_write(output_path, output_result)
                if publisher:
                    publisher.complete("asrx", "0 segments")
@@ -225,7 +264,7 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
                publisher.error("asrx", str(e))
            import traceback
            traceback.print_exc()
-            output_result = {"language": None, "segments": []}
+            output_result = {"status": "silent_audio", "language": None, "segments": [], "segment_count": 0}
            _atomic_write(output_path, output_result)
            if publisher:
                publisher.complete("asrx", "0 segments")
@@ -289,7 +328,7 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
        if "error" in result:
            if publisher:
                publisher.error("asrx", result["error"])
-            output_result = {"language": None, "segments": []}
+            output_result = {"status": "silent_audio", "language": None, "segments": [], "segment_count": 0}
            _atomic_write(output_path, output_result)
            if publisher:
                publisher.complete("asrx", "0 segments")
@@ -320,7 +359,7 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
        import traceback
        traceback.print_exc()

-        output_result = {"language": None, "segments": []}
+        output_result = {"status": "silent_audio", "language": None, "segments": [], "segment_count": 0}
        _atomic_write(output_path, output_result)
        if publisher:
            publisher.complete("asrx", "0 segments")