# momentry_core/test_chunking_threshold.py

#!/usr/bin/env python3
"""
Test the ASR chunking threshold (30 minutes / 1800 seconds).
Creates silent audio files from 20 to 40 minutes long and checks which
transcription mode (direct or chunked) ASR uses for each.
"""

import sys
import os
import subprocess
import tempfile
import time
from pathlib import Path


def create_test_audio(duration_seconds, output_path):
    """Create a silent 16 kHz mono PCM audio file using ffmpeg.

    Args:
        duration_seconds: Length of the generated audio, in seconds.
        output_path: Destination file path (str or path-like). An existing
            file at this path is overwritten (``-y``).

    Returns:
        True if ffmpeg exited with status 0 and the output file exists;
        False otherwise, including when ffmpeg could not be launched.
    """
    output_path = os.fspath(output_path)
    cmd = [
        "ffmpeg",
        "-f",
        "lavfi",
        "-i",
        "anullsrc=r=16000:cl=mono",
        "-t",
        str(duration_seconds),
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        "-y",
        output_path,
    ]
    try:
        # Detach stdin so ffmpeg cannot consume or block on the caller's
        # terminal/pipe while it runs.
        result = subprocess.run(
            cmd, capture_output=True, stdin=subprocess.DEVNULL
        )
    except OSError:
        # ffmpeg is missing or not executable: report failure instead of
        # crashing the caller, which only checks the boolean result.
        return False
    return result.returncode == 0 and os.path.exists(output_path)


# Audio longer than this many seconds is expected to be transcribed in chunks.
_CHUNK_THRESHOLD_SECONDS = 1800


def _run_chunking_case(run_asr, duration, description):
    """Generate audio of *duration* seconds, run ASR on it, and check the mode.

    Args:
        run_asr: The ``run_asr`` callable imported from ``asr_processor``.
        duration: Length of the synthetic audio, in seconds.
        description: Human-readable label printed for this case.

    Returns:
        True only if the audio was created, ASR reported success, the output
        JSON could be read, and its ``processing_mode`` matched the mode
        expected for this duration. False in every other situation.
    """
    import json
    import traceback

    print(f"\n{'=' * 60}")
    print(f"Test: {description}")
    print(f"Duration: {duration} seconds ({duration / 60:.1f} minutes)")

    with tempfile.TemporaryDirectory() as temp_dir:
        audio_path = os.path.join(temp_dir, "test_audio.wav")
        output_path = os.path.join(temp_dir, "output.json")

        print("Creating test audio...")
        if not create_test_audio(duration, audio_path):
            print("Failed to create test audio")
            return False

        print("Running ASR...")
        start_time = time.time()

        # Broad catch is deliberate: this is the harness boundary, and one
        # failing case must not stop the remaining cases from running.
        try:
            success = run_asr(
                video_path=None,  # Use audio directly
                audio_path=audio_path,
                output_path=output_path,
                model_size="tiny",
                progress=False,  # Don't use Redis publisher
            )
            elapsed = time.time() - start_time

            if not (success and os.path.exists(output_path)):
                print("Result: FAILED")
                print(f"Success flag: {success}")
                print(f"Output exists: {os.path.exists(output_path)}")
                return False

            with open(output_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            processing_mode = data.get("processing_mode", "unknown")
            chunk_count = data.get("chunk_count", 1)
        except Exception as e:
            print(f"Exception during ASR: {e}")
            traceback.print_exc()
            return False

        print("Result: SUCCESS")
        print(f"Processing mode: {processing_mode}")
        print(f"Chunk count: {chunk_count}")
        print(f"Elapsed time: {elapsed:.2f}s")

        # Verify expected behavior: at or below the threshold is "direct",
        # strictly above it is "chunked".
        if duration > _CHUNK_THRESHOLD_SECONDS:
            expected_mode = "chunked"
        else:
            expected_mode = "direct"
        if processing_mode != expected_mode:
            print(
                f"WARNING: Expected {expected_mode} transcription but got {processing_mode}"
            )
            return False

    return True


def test_chunking():
    """Test ASR chunking with audio durations around the 1800-second threshold.

    Imports ``run_asr`` from the ``scripts`` directory next to this file,
    then runs every duration case even if an earlier one fails.

    Returns:
        True if ``asr_processor`` could be imported and every case passed;
        False if the import failed or any case failed (audio creation, ASR
        run, unreadable output, or unexpected processing mode).
    """
    # Add scripts directory to path (once, even if called repeatedly)
    scripts_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "scripts"
    )
    if scripts_dir not in sys.path:
        sys.path.insert(0, scripts_dir)

    # Import after path is set
    try:
        from asr_processor import run_asr
    except ImportError as e:
        print(f"Failed to import asr_processor: {e}")
        return False

    test_cases = [
        (1200, "20 minutes - should use direct transcription"),
        (1800, "30 minutes - boundary, should use direct"),
        (1810, "30m10s - should use chunked transcription"),
        (2400, "40 minutes - should use chunked transcription"),
    ]

    all_passed = True
    for duration, description in test_cases:
        # Do not short-circuit: report on every case before returning.
        if not _run_chunking_case(run_asr, duration, description):
            all_passed = False

    return all_passed


if __name__ == "__main__":
    # Script entry point: check that ffmpeg is available, run the threshold
    # test, and exit with status 0 on success or 1 on failure.
    import shutil

    print("Testing ASR chunking threshold (30 minutes/1800 seconds)")
    print("This test creates synthetic audio files of various durations")
    print("and verifies the correct transcription mode is used.\n")

    # shutil.which is portable; spawning the Unix-only `which` binary would
    # raise on systems that do not have it.
    if shutil.which("ffmpeg") is None:
        print("ERROR: ffmpeg not found in PATH")
        sys.exit(1)

    success = test_chunking()

    if success:
        print("\n✅ Chunking threshold test completed")
        sys.exit(0)
    else:
        print("\n❌ Chunking threshold test failed")
        sys.exit(1)