fix: ASRX duplication, TKG edges, trace ingest, and add pipeline progress publishing

- ASRX handler no longer stores duplicate 'asr' pre_chunks
- Pre_chunks storage made idempotent (delete-before-insert)
- Rule 1 and trace_ingest now query 'asrx' instead of 'asr'
- Trace chunks removed (now derived dynamically from TKG/Qdrant)
- TKG scroll_face_points fixed: trace_id >= 1 (not == 1)
- TKG AsrxSegmentEntry: start/end -> start_time/end_time (match ASRX JSON)
- Unregister error handling: errors are now logged instead of being silently discarded
- Add publish_pipeline_progress calls at each pipeline stage
  (processors, rule1, face_trace, identity_agent, TKG, rule2, completion)
This commit is contained in:
Accusys
2026-07-02 10:43:46 +08:00
parent d791d138f2
commit 3eabd45882
65 changed files with 9477 additions and 3852 deletions
+43 -4
View File
@@ -126,9 +126,17 @@ def _convert_result(result, output_path):
except Exception:
pass
segment_count = len(result.get("segments", []))
if segment_count > 0:
status = "has_transcript"
else:
status = "silent_audio"
output_result = {
"status": status,
"language": result.get("language"),
"segments": [],
"segment_count": segment_count,
"n_speakers": result.get("n_speakers", 0),
"speaker_stats": result.get("speaker_stats", {}),
}
@@ -172,6 +180,37 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
if publisher:
publisher.info("asrx", "ASRX_START")
# Check for audio stream first
tracks = probe_audio_tracks(video_path)
if not tracks:
if publisher:
publisher.info("asrx", "No audio stream detected")
output_result = {"status": "no_audio_track", "language": None, "segments": [], "segment_count": 0}
_atomic_write(output_path, output_result)
if publisher:
publisher.complete("asrx", "0 segments (no audio)")
print("[ASRX] No audio stream, skipping", file=sys.stderr)
return output_result
# Check if ASR already determined no audio/silent - skip processing
asr_path = output_path.replace(".asrx.json", ".asr.json")
if os.path.exists(asr_path):
try:
with open(asr_path) as f:
asr_data = json.load(f)
asr_status = asr_data.get("status", "")
if asr_status in ("no_audio_track", "silent_audio"):
if publisher:
publisher.info("asrx", f"ASR status={asr_status}, skipping ASRX processing")
output_result = {"status": asr_status, "language": asr_data.get("language"), "segments": [], "segment_count": 0}
_atomic_write(output_path, output_result)
if publisher:
publisher.complete("asrx", f"0 segments (ASR: {asr_status})")
print(f"[ASRX] ASR status={asr_status}, skipping", file=sys.stderr)
return output_result
except Exception as e:
print(f"[ASRX] Failed to read ASR output: {e}", file=sys.stderr)
checkpoint_path = output_path + ".stage1.json"
# ── Phase 2: Resume from checkpoint (Steps 4-7 only) ──
@@ -189,7 +228,7 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
if "error" in result:
if publisher:
publisher.error("asrx", result["error"])
output_result = {"language": None, "segments": []}
output_result = {"status": "silent_audio", "language": None, "segments": [], "segment_count": 0}
_atomic_write(output_path, output_result)
if publisher:
publisher.complete("asrx", "0 segments")
@@ -225,7 +264,7 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
publisher.error("asrx", str(e))
import traceback
traceback.print_exc()
output_result = {"language": None, "segments": []}
output_result = {"status": "silent_audio", "language": None, "segments": [], "segment_count": 0}
_atomic_write(output_path, output_result)
if publisher:
publisher.complete("asrx", "0 segments")
@@ -289,7 +328,7 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
if "error" in result:
if publisher:
publisher.error("asrx", result["error"])
output_result = {"language": None, "segments": []}
output_result = {"status": "silent_audio", "language": None, "segments": [], "segment_count": 0}
_atomic_write(output_path, output_result)
if publisher:
publisher.complete("asrx", "0 segments")
@@ -320,7 +359,7 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
import traceback
traceback.print_exc()
output_result = {"language": None, "segments": []}
output_result = {"status": "silent_audio", "language": None, "segments": [], "segment_count": 0}
_atomic_write(output_path, output_result)
if publisher:
publisher.complete("asrx", "0 segments")