Files
momentry_core/investigate_segment_diff.py
Warren b54c2def30 feat: add migrations, test scripts, and utility tools
- Add database migrations (006-028) for face recognition, identity, file_uuid
- Add test scripts for ASR, face, search, processing
- Add portal frontend (Tauri)
- Add config, benchmark, and monitoring utilities
- Add model checkpoints and pretrained model references
2026-04-30 15:11:53 +08:00

358 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Investigate segment count differences between direct and chunked transcription.
Analyze timestamps, durations, and text to understand why segment counts differ.
"""
import sys
import os
import json
import tempfile
import subprocess
import shutil
import time
from typing import List, Dict, Any, Tuple
import statistics
# Input video used for every transcription run. The path is relative, so it is
# resolved against the working directory the script is started from.
VIDEO_PATH = "../test_video/BigBuckBunny_320x180.mp4" # 10 min, 62MB
def run_transcription(
    mode_name: str, max_direct: int, chunk_dur: int
) -> Dict[str, Any]:
    """Run one ASR pass over VIDEO_PATH and summarize the segments it produced.

    The processor script is launched as a subprocess with the direct/chunked
    thresholds passed through environment variables, and its JSON output is
    read back from a scratch directory that is always removed afterwards.

    Args:
        mode_name: Label for this run; used in the scratch-directory prefix,
            the ``--uuid`` argument and the returned ``mode_name`` field.
        max_direct: Value exported as ``MOMENTRY_ASR_MAX_DIRECT_DURATION``.
        chunk_dur: Value exported as ``MOMENTRY_ASR_CHUNK_DURATION``.

    Returns:
        A dict with the keys ``mode_name``, ``processing_mode``,
        ``chunk_count``, ``chunk_duration``, ``elapsed``, ``language``,
        ``segment_count``, ``segments``, ``segment_stats``, ``returncode`` and
        ``stderr`` (first 500 characters). When the output file is missing,
        unreadable or not a JSON object, ``processing_mode`` is ``"failed"``,
        ``chunk_count`` is 0 and ``segments`` is empty.
    """
    temp_dir = tempfile.mkdtemp(prefix=f"asr_invest_{mode_name}_")
    try:
        output_path = os.path.join(temp_dir, "output.json")
        audio_path = os.path.join(temp_dir, "audio.wav")

        # Extract audio first
        # NOTE(review): the WAV written here is not handed to the processor
        # below (it receives VIDEO_PATH) and the exit status is not checked --
        # confirm whether this step is still needed.
        extract_cmd = [
            "ffmpeg",
            "-i",
            VIDEO_PATH,
            "-acodec",
            "pcm_s16le",
            "-ar",
            "16000",
            "-ac",
            "1",
            "-y",
            audio_path,
        ]
        subprocess.run(extract_cmd, capture_output=True)

        # Set environment for ASR processor
        env = os.environ.copy()
        env["MOMENTRY_ASR_MAX_DIRECT_DURATION"] = str(max_direct)
        env["MOMENTRY_ASR_CHUNK_DURATION"] = str(chunk_dur)
        env["MOMENTRY_ASR_MODEL_SIZE"] = "tiny"
        env["MOMENTRY_ASR_COMPUTE_TYPE"] = "int8"

        cmd = [
            "/opt/homebrew/bin/python3.11",
            "scripts/asr_processor.py",
            VIDEO_PATH,
            output_path,
            "--uuid",
            f"invest_{mode_name}",
        ]

        # perf_counter is monotonic, so the measurement cannot be skewed by
        # wall-clock adjustments during a long transcription.
        start = time.perf_counter()
        proc = subprocess.run(cmd, capture_output=True, env=env, text=True)
        elapsed = time.perf_counter() - start

        # Load results; a truncated or malformed file is treated like a
        # missing one so a crashed processor is reported as "failed" instead
        # of aborting the whole investigation.
        data = None
        if os.path.exists(output_path):
            try:
                with open(output_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
            except (OSError, ValueError):
                data = None
            if not isinstance(data, dict):
                data = None
    finally:
        # Clean up even when one of the steps above raised.
        shutil.rmtree(temp_dir, ignore_errors=True)

    if data is not None:
        segments = data.get("segments", [])
        language = data.get("language", "")
        mode = data.get("processing_mode", "unknown")
        chunk_count = data.get("chunk_count", 1)
    else:
        segments = []
        language = ""
        mode = "failed"
        chunk_count = 0

    # Calculate segment statistics
    if segments:
        durations = [s["end"] - s["start"] for s in segments]
        stats = {
            "count": len(segments),
            "total_duration": sum(durations),
            "avg_duration": statistics.mean(durations),
            "min_duration": min(durations),
            "max_duration": max(durations),
        }
    else:
        stats = {
            "count": 0,
            "total_duration": 0,
            "avg_duration": 0,
            "min_duration": 0,
            "max_duration": 0,
        }

    return {
        "mode_name": mode_name,
        "processing_mode": mode,
        "chunk_count": chunk_count,
        "chunk_duration": chunk_dur,
        "elapsed": elapsed,
        "language": language,
        "segment_count": len(segments),
        "segments": segments,
        "segment_stats": stats,
        "returncode": proc.returncode,
        "stderr": proc.stderr[:500] if proc.stderr else "",
    }
def analyze_segment_overlap(
    segments1: List[Dict], segments2: List[Dict], tolerance: float = 0.5
) -> Dict[str, Any]:
    """Pair up segments from two transcriptions by how much their times overlap.

    Every segment of ``segments1`` is paired with the ``segments2`` entry it
    shares the most time with (the earliest one wins a tie). The pair counts
    as a match only when that shared time is at least ``tolerance`` seconds.

    Returns:
        A dict holding ``matches`` (each with ``segment1``, ``segment2``,
        ``overlap`` and a ``text_diff`` flag), the unmatched segments as
        ``only_in_1`` / ``only_in_2``, and the counts ``match_count``,
        ``unique_to_1`` and ``unique_to_2``.
    """
    paired = []
    unmatched_first = []

    for first in segments1:
        winner = None
        winner_overlap = 0
        for candidate in segments2:
            shared_from = max(first["start"], candidate["start"])
            shared_to = min(first["end"], candidate["end"])
            if not shared_to > shared_from:
                # The two intervals do not intersect at all.
                continue
            shared = shared_to - shared_from
            if shared > winner_overlap:
                winner, winner_overlap = candidate, shared

        if not (winner and winner_overlap >= tolerance):
            unmatched_first.append(first)
            continue

        paired.append(
            {
                "segment1": first,
                "segment2": winner,
                "overlap": winner_overlap,
                "text_diff": first["text"] != winner["text"],
            }
        )

    # Anything in the second list that never became a partner is unique to it.
    unmatched_second = [
        second
        for second in segments2
        if not any(pair["segment2"] == second for pair in paired)
    ]

    return {
        "matches": paired,
        "only_in_1": unmatched_first,
        "only_in_2": unmatched_second,
        "match_count": len(paired),
        "unique_to_1": len(unmatched_first),
        "unique_to_2": len(unmatched_second),
    }
def analyze_chunk_boundaries(
    chunk_results: Dict[str, Any],
    chunk_duration: float,
    boundary_tolerance: float = 1.0,
) -> Dict[str, Any]:
    """Find segments that start or end close to a chunk boundary.

    Boundaries are placed at every multiple of ``chunk_duration`` between
    consecutive chunks, i.e. ``chunk_count - 1`` of them.

    Args:
        chunk_results: A transcription result; its ``chunk_count`` and
            ``segments`` entries are read.
        chunk_duration: Length of one chunk, in the same unit as the segment
            ``start`` / ``end`` values.
        boundary_tolerance: A segment counts as "near" a boundary when its
            start or its end lies strictly closer than this to the boundary.

    Returns:
        A dict with ``boundaries`` (list of boundary times),
        ``segments_near_boundary`` (list of dicts with ``segment``,
        ``boundary``, ``distance_to_start`` and ``distance_to_end``) and
        ``count_near_boundary``. The same keys, with empty values, are
        returned when there is at most one chunk.
    """
    if chunk_results["chunk_count"] <= 1:
        # Same schema as the multi-chunk result so callers can read
        # "count_near_boundary" unconditionally.
        return {
            "boundaries": [],
            "segments_near_boundary": [],
            "count_near_boundary": 0,
            "boundary_issues": [],  # legacy key, kept for existing callers
        }

    boundaries = [
        (i + 1) * chunk_duration for i in range(chunk_results["chunk_count"] - 1)
    ]

    segments_near_boundary = []
    for segment in chunk_results["segments"]:
        for boundary in boundaries:
            if (
                abs(segment["start"] - boundary) < boundary_tolerance
                or abs(segment["end"] - boundary) < boundary_tolerance
            ):
                segments_near_boundary.append(
                    {
                        "segment": segment,
                        "boundary": boundary,
                        "distance_to_start": segment["start"] - boundary,
                        "distance_to_end": segment["end"] - boundary,
                    }
                )
                # Report each segment once, against the first boundary it is near.
                break

    return {
        "boundaries": boundaries,
        "segments_near_boundary": segments_near_boundary,
        "count_near_boundary": len(segments_near_boundary),
    }
def print_segment_comparison(title: str, segments: List[Dict]):
    """Write a numbered, one-line-per-segment listing to stdout.

    The listing starts with a blank line, a heading containing ``title`` and
    the segment count, and an 80-character rule. Each following row shows the
    segment index, its start and end times, its duration, and the first 60
    characters of its text.
    """
    row_template = "{:3d}: {:7.2f}s - {:7.2f}s (dur:{:5.2f}s): {}"

    print(f"\n{title} ({len(segments)} segments):")
    print("-" * 80)

    for position, entry in enumerate(segments):
        begin = entry["start"]
        finish = entry["end"]
        snippet = entry["text"][:60]
        print(row_template.format(position, begin, finish, finish - begin, snippet))
def _print_unique_segments(label: str, segments: List[Dict], limit: int = 5) -> None:
    """Print up to ``limit`` unmatched segments under ``label``, then how many were omitted."""
    if not segments:
        return
    print(f" Segments only in {label}:")
    for seg in segments[:limit]:
        print(f" {seg['start']:.2f}s-{seg['end']:.2f}s: {seg['text'][:50]}...")
    if len(segments) > limit:
        print(f" ... and {len(segments) - limit} more")


def main():
    """Transcribe the test video in several modes and print a comparison report.

    The video is processed once in direct mode and three times with different
    chunk sizes. The report then covers, in order: per-run summaries, a
    timestamp-overlap comparison of every chunked run against the direct run
    (including segments near chunk boundaries), a full segment listing for
    the direct and 10-minute-chunk runs, per-mode duration statistics, and a
    closing summary.
    """
    print(
        "Investigating segment count differences between direct and chunked transcription"
    )
    print(f"Video: {os.path.basename(VIDEO_PATH)}")
    print("=" * 80)

    # Run different transcription modes: (name, max direct duration, chunk duration)
    modes = [
        ("direct", 1800, 600),  # Direct (30 min max, 10 min chunk size)
        ("chunked_10min", 300, 600),  # 1 chunk (10 min)
        ("chunked_5min", 300, 300),  # 2 chunks (5 min each)
        ("chunked_2min", 300, 120),  # 5 chunks (2 min each)
    ]

    results = {}
    for mode_name, max_direct, chunk_dur in modes:
        print(
            f"\nRunning {mode_name} (max_direct={max_direct}s, chunk={chunk_dur}s)..."
        )
        result = run_transcription(mode_name, max_direct, chunk_dur)
        results[mode_name] = result

        seg_stats = result["segment_stats"]
        print(f" Mode: {result['processing_mode']}, Chunks: {result['chunk_count']}")
        print(f" Segments: {result['segment_count']}, Language: {result['language']}")
        print(f" Time: {result['elapsed']:.1f}s")
        print(
            f" Segment stats: avg={seg_stats['avg_duration']:.2f}s, "
            f"min={seg_stats['min_duration']:.2f}s, "
            f"max={seg_stats['max_duration']:.2f}s"
        )

        # Surface processor failures; otherwise a crashed run is only visible
        # as a zero segment count and skews every comparison below.
        if result["returncode"] != 0 or result["processing_mode"] == "failed":
            print(
                f" WARNING: {mode_name} run failed (exit code {result['returncode']})"
            )
            if result["stderr"]:
                print(f" stderr: {result['stderr']}")

    # Compare direct with each chunked mode
    direct_result = results["direct"]
    direct_segments = direct_result["segments"]
    # Derived from `modes` so the two lists cannot drift apart.
    chunked_mode_names = [name for name, _, _ in modes if name != "direct"]

    print("\n" + "=" * 80)
    print("COMPARISON WITH DIRECT TRANSCRIPTION")
    print("=" * 80)

    for mode_name in chunked_mode_names:
        chunk_result = results[mode_name]
        chunk_segments = chunk_result["segments"]

        print(
            f"\n{direct_result['segment_count']} direct vs {chunk_result['segment_count']} {mode_name} segments"
        )
        print(
            f"Chunk size: {chunk_result['chunk_duration']}s, Chunks: {chunk_result['chunk_count']}"
        )

        # Analyze overlap
        overlap = analyze_segment_overlap(direct_segments, chunk_segments)
        print(
            f" Matches: {overlap['match_count']}, Unique to direct: {overlap['unique_to_1']}, Unique to chunked: {overlap['unique_to_2']}"
        )

        # Print unique segments if any (first 5 of each side)
        _print_unique_segments("direct transcription", overlap["only_in_1"])
        _print_unique_segments(mode_name, overlap["only_in_2"])

        # Analyze chunk boundary issues for chunked modes
        if chunk_result["chunk_count"] > 1:
            boundary_analysis = analyze_chunk_boundaries(
                chunk_result, chunk_result["chunk_duration"]
            )
            if boundary_analysis["count_near_boundary"] > 0:
                print(
                    f" ⚠️ {boundary_analysis['count_near_boundary']} segments near chunk boundaries"
                )
                for item in boundary_analysis["segments_near_boundary"][:3]:
                    seg = item["segment"]
                    print(
                        f" At {item['boundary']:.1f}s: {seg['start']:.2f}s-{seg['end']:.2f}s "
                        f"(dist: {item['distance_to_start']:.2f}s)"
                    )

    # Detailed segment comparison
    print("\n" + "=" * 80)
    print("DETAILED SEGMENT COMPARISON")
    print("=" * 80)
    print_segment_comparison("Direct Transcription", direct_segments)
    print_segment_comparison(
        "Chunked (10min chunks)", results["chunked_10min"]["segments"]
    )

    # Analyze segment duration distribution
    print("\n" + "=" * 80)
    print("SEGMENT DURATION ANALYSIS")
    print("=" * 80)
    for mode_name, result in results.items():
        stats = result["segment_stats"]
        if stats["count"] > 0:
            print(f"\n{mode_name}:")
            print(f" Total segments: {stats['count']}")
            print(f" Avg duration: {stats['avg_duration']:.2f}s")
            print(f" Min duration: {stats['min_duration']:.2f}s")
            print(f" Max duration: {stats['max_duration']:.2f}s")
            print(f" Total speech duration: {stats['total_duration']:.2f}s")

    # Summary of findings
    # NOTE: apart from the per-mode counts, the text below is fixed; it is not
    # derived from the results of this run.
    print("\n" + "=" * 80)
    print("SUMMARY OF FINDINGS")
    print("=" * 80)
    print("\n1. Segment count decreases dramatically with smaller chunks:")
    for mode_name, result in results.items():
        print(f" {mode_name:15s}: {result['segment_count']:3d} segments")
    print("\n2. Potential causes:")
    print(" - Small chunks (2min) may not provide enough context for Whisper")
    print(" - Speech near chunk boundaries may be cut off")
    print(
        " - Whisper's VAD (voice activity detection) may behave differently on short clips"
    )
    print(" - Model initialization/context window effects")
    print("\n3. Recommendations:")
    print(" - Use larger chunk sizes (≥5 minutes) for better accuracy")
    print(" - Consider overlapping chunks to avoid boundary issues")
    print(" - For critical applications, prefer direct transcription when possible")
    print(" - Test with different Whisper model sizes (tiny vs. base vs. small)")
# Run the investigation only when executed as a script, not on import.
if __name__ == "__main__":
    main()