#!/usr/bin/env python3
"""
Test chunking threshold (30 minutes/1800 seconds).

Creates silent audio files with durations on both sides of the threshold,
runs ASR on each one, and checks the processing mode reported in the ASR
output JSON ("direct" at or below the threshold, "chunked" above it).
"""

import json
import os
import shutil
import subprocess
import sys
import tempfile
import time
import traceback
from pathlib import Path

# test_chunking is deliberately left out of __all__: it returns a bool for the
# command-line entry point below, and a star-import into a pytest module would
# otherwise get it collected as a test case. Explicit imports still work.
__all__ = ["create_test_audio", "expected_processing_mode"]

# Audio longer than this many seconds is expected to be transcribed in chunks.
CHUNKING_THRESHOLD_SECONDS = 1800


def expected_processing_mode(
    duration_seconds, threshold_seconds=CHUNKING_THRESHOLD_SECONDS
):
    """Return the processing mode expected for audio of the given duration.

    Args:
        duration_seconds: Length of the audio in seconds.
        threshold_seconds: Longest duration still handled in one pass.

    Returns:
        "direct" when the duration is at or below the threshold (the boundary
        itself counts as direct), otherwise "chunked".
    """
    return "direct" if duration_seconds <= threshold_seconds else "chunked"


def create_test_audio(duration_seconds, output_path):
    """Create a silent audio file of specified duration using ffmpeg.

    Args:
        duration_seconds: Length of the generated audio in seconds.
        output_path: Where ffmpeg writes the 16 kHz mono 16-bit PCM file.

    Returns:
        True if ffmpeg exited successfully and the output file exists,
        False otherwise (including when ffmpeg cannot be started).
    """
    cmd = [
        "ffmpeg",
        "-f",
        "lavfi",
        "-i",
        "anullsrc=r=16000:cl=mono",
        "-t",
        str(duration_seconds),
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        "-y",
        output_path,
    ]
    try:
        result = subprocess.run(cmd, capture_output=True)
    except OSError as e:
        # ffmpeg missing or not executable: report it as a failed creation
        # instead of crashing the whole run.
        print(f"Could not run ffmpeg: {e}")
        return False

    if result.returncode != 0:
        # The output is captured, so surface the tail of stderr; otherwise an
        # ffmpeg failure leaves no trace of its cause.
        stderr_lines = result.stderr.decode(errors="replace").strip().splitlines()
        for line in stderr_lines[-5:]:
            print(f"ffmpeg: {line}")
        return False

    return os.path.exists(output_path)


def _run_case(run_asr, duration):
    """Run ASR on synthetic audio of `duration` seconds and check the mode.

    Args:
        run_asr: The ASR entry point imported from asr_processor.
        duration: Length of the synthetic audio in seconds.

    Returns:
        True if ASR succeeded and reported the expected processing mode,
        False on any failure, exception, or mode mismatch.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        audio_path = os.path.join(temp_dir, "test_audio.wav")
        output_path = os.path.join(temp_dir, "output.json")

        print("Creating test audio...")
        if not create_test_audio(duration, audio_path):
            print("Failed to create test audio")
            return False

        print("Running ASR...")
        start_time = time.perf_counter()

        try:
            success = run_asr(
                video_path=None,  # Use audio directly
                audio_path=audio_path,
                output_path=output_path,
                model_size="tiny",
                progress=False,  # Don't use Redis publisher
            )
            elapsed = time.perf_counter() - start_time

            if not (success and os.path.exists(output_path)):
                print("Result: FAILED")
                print(f"Success flag: {success}")
                print(f"Output exists: {os.path.exists(output_path)}")
                return False

            with open(output_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            processing_mode = data.get("processing_mode", "unknown")
            chunk_count = data.get("chunk_count", 1)
        except Exception as e:
            # Broad on purpose: run_asr is an external entry point and one
            # failing case must not stop the remaining cases from running.
            print(f"Exception during ASR: {e}")
            traceback.print_exc()
            return False

    print("Result: SUCCESS")
    print(f"Processing mode: {processing_mode}")
    print(f"Chunk count: {chunk_count}")
    print(f"Elapsed time: {elapsed:.2f}s")

    expected = expected_processing_mode(duration)
    if processing_mode != expected:
        print(
            f"WARNING: Expected {expected} transcription but got {processing_mode}"
        )
        return False
    return True


def test_chunking():
    """Test ASR chunking with different audio durations.

    Returns:
        True only if asr_processor could be imported and every test case
        produced the expected processing mode; False otherwise. All cases are
        run even after one of them fails.
    """
    # Add scripts directory to path (absolute, and only once, so repeated
    # calls or a later chdir do not affect the import).
    scripts_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "scripts")
    if scripts_dir not in sys.path:
        sys.path.insert(0, scripts_dir)

    # Import after path is set
    try:
        from asr_processor import run_asr
    except ImportError as e:
        print(f"Failed to import asr_processor: {e}")
        return False

    test_cases = [
        (1200, "20 minutes - should use direct transcription"),
        (1800, "30 minutes - boundary, should use direct"),
        (1810, "30m10s - should use chunked transcription"),
        (2400, "40 minutes - should use chunked transcription"),
    ]

    all_passed = True
    for duration, description in test_cases:
        print(f"\n{'=' * 60}")
        print(f"Test: {description}")
        print(f"Duration: {duration} seconds ({duration / 60:.1f} minutes)")

        if not _run_case(run_asr, duration):
            all_passed = False

    return all_passed


def main():
    """Check prerequisites, run the threshold test, and return an exit code."""
    print("Testing ASR chunking threshold (30 minutes/1800 seconds)")
    print("This test creates synthetic audio files of various durations")
    print("and verifies the correct transcription mode is used.\n")

    # Check if ffmpeg is available (shutil.which works without a `which` binary)
    if shutil.which("ffmpeg") is None:
        print("ERROR: ffmpeg not found in PATH")
        return 1

    if test_chunking():
        print("\n✅ Chunking threshold test completed")
        return 0

    print("\n❌ Chunking threshold test failed")
    return 1


if __name__ == "__main__":
    sys.exit(main())