Files
momentry_core/test_chunking_threshold.py
Warren b54c2def30 feat: add migrations, test scripts, and utility tools
- Add database migrations (006-028) for face recognition, identity, file_uuid
- Add test scripts for ASR, face, search, processing
- Add portal frontend (Tauri)
- Add config, benchmark, and monitoring utilities
- Add model checkpoints and pretrained model references
2026-04-30 15:11:53 +08:00

143 lines
4.5 KiB
Python

#!/usr/bin/env python3
"""
Test the ASR chunking threshold (30 minutes / 1800 seconds).
Create silent audio files from 20 to 40 minutes long and check which
transcription mode (direct or chunked) ASR uses for each.
"""
import sys
import os
import subprocess
import tempfile
import time
from pathlib import Path
def create_test_audio(duration_seconds, output_path):
    """Create a silent 16 kHz mono 16-bit PCM audio file using ffmpeg.

    Args:
        duration_seconds: Length of the generated audio, in seconds.
        output_path: Destination file; ffmpeg is run with ``-y`` so an
            existing file is overwritten.

    Returns:
        True if ffmpeg exited with status 0 and ``output_path`` exists,
        False otherwise (including when ffmpeg cannot be launched at all).
    """
    cmd = [
        "ffmpeg",
        "-f",
        "lavfi",
        "-i",
        "anullsrc=r=16000:cl=mono",
        "-t",
        str(duration_seconds),
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        "-y",
        output_path,
    ]
    try:
        result = subprocess.run(cmd, capture_output=True)
    except OSError:
        # ffmpeg is missing or not executable: report it as an ordinary
        # failure so callers relying on the boolean result keep working.
        return False
    return result.returncode == 0 and os.path.exists(output_path)
def test_chunking():
    """Run ASR on synthetic audio around the 1800-second chunking threshold.

    For each test duration a silent audio file is generated, ``run_asr`` from
    the ``asr_processor`` module (looked up in the ``scripts`` directory next
    to this file) is invoked on it, and the ``processing_mode`` field of the
    JSON output is compared with the expected mode: ``"direct"`` for
    durations up to 1800 seconds, ``"chunked"`` above that.

    Returns:
        True only if every case produced output with the expected mode.
        False if ``asr_processor`` cannot be imported, or if any case failed
        to generate audio, failed during ASR, or reported an unexpected mode.
    """
    # Imported locally because only this function needs them.
    import json
    import traceback

    # Durations up to and including this value are expected to be "direct".
    threshold_seconds = 1800

    # Add scripts directory to path so asr_processor can be imported.
    sys.path.insert(0, os.path.join(os.path.dirname(__file__), "scripts"))
    try:
        from asr_processor import run_asr
    except ImportError as e:
        print(f"Failed to import asr_processor: {e}")
        return False

    test_cases = [
        (1200, "20 minutes - should use direct transcription"),
        (1800, "30 minutes - boundary, should use direct"),
        (1810, "30m10s - should use chunked transcription"),
        (2400, "40 minutes - should use chunked transcription"),
    ]

    # Descriptions of every case that did not behave as expected; the return
    # value is derived from this so the caller's exit code reflects reality.
    failed_cases = []

    for duration, description in test_cases:
        print(f"\n{'=' * 60}")
        print(f"Test: {description}")
        print(f"Duration: {duration} seconds ({duration / 60:.1f} minutes)")

        expected_mode = "direct" if duration <= threshold_seconds else "chunked"

        with tempfile.TemporaryDirectory() as temp_dir:
            audio_path = os.path.join(temp_dir, "test_audio.wav")
            output_path = os.path.join(temp_dir, "output.json")

            print("Creating test audio...")
            if not create_test_audio(duration, audio_path):
                print("Failed to create test audio")
                failed_cases.append(description)
                continue

            print("Running ASR...")
            start_time = time.time()
            try:
                success = run_asr(
                    video_path=None,  # Use audio directly
                    audio_path=audio_path,
                    output_path=output_path,
                    model_size="tiny",
                    progress=False,  # Don't use Redis publisher
                )
                elapsed = time.time() - start_time

                if not (success and os.path.exists(output_path)):
                    print("Result: FAILED")
                    print(f"Success flag: {success}")
                    print(f"Output exists: {os.path.exists(output_path)}")
                    failed_cases.append(description)
                    continue

                with open(output_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
            except Exception as e:
                # Top-level boundary of a test script: report the error and
                # move on to the next case, but remember that this one failed.
                print(f"Exception during ASR: {e}")
                traceback.print_exc()
                failed_cases.append(description)
                continue

            processing_mode = data.get("processing_mode", "unknown")
            chunk_count = data.get("chunk_count", 1)
            print("Result: SUCCESS")
            print(f"Processing mode: {processing_mode}")
            print(f"Chunk count: {chunk_count}")
            print(f"Elapsed time: {elapsed:.2f}s")

            # Verify expected behavior
            if processing_mode != expected_mode:
                print(
                    f"WARNING: Expected {expected_mode} transcription "
                    f"but got {processing_mode}"
                )
                failed_cases.append(description)

    if failed_cases:
        print(f"\n{len(failed_cases)} of {len(test_cases)} case(s) failed:")
        for description in failed_cases:
            print(f"  - {description}")

    return not failed_cases
if __name__ == "__main__":
    # Script entry point: check that ffmpeg is available, run the chunking
    # test, and exit with status 0 on success or 1 on failure.
    # shutil is only needed for the ffmpeg lookup below, so import it here.
    import shutil

    print("Testing ASR chunking threshold (30 minutes/1800 seconds)")
    print("This test creates synthetic audio files of various durations")
    print("and verifies the correct transcription mode is used.\n")

    # Check if ffmpeg is available. shutil.which searches PATH in-process,
    # so it also works where the external `which` command does not exist.
    if shutil.which("ffmpeg") is None:
        print("ERROR: ffmpeg not found in PATH")
        sys.exit(1)

    success = test_chunking()
    if success:
        print("\n✅ Chunking threshold test completed")
        sys.exit(0)
    else:
        print("\n❌ Chunking threshold test failed")
        sys.exit(1)