feat: add migrations, test scripts, and utility tools

- Add database migrations (006-028) for face recognition, identity, file_uuid
- Add test scripts for ASR, face, search, processing
- Add portal frontend (Tauri)
- Add config, benchmark, and monitoring utilities
- Add model checkpoints and pretrained model references
2026-04-30 15:11:53 +08:00
parent 4d75b2e251
commit b54c2def30
192 changed files with 46721 additions and 0 deletions
--- a/test_chunking_threshold.py
+++ b/test_chunking_threshold.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+"""
+Test the ASR chunking threshold (30 minutes / 1800 seconds).
+Create silent audio files of 20 to 40 minutes and check which transcription
+mode (direct or chunked) the ASR output reports for each.
+"""
+
+import sys
+import os
+import subprocess
+import tempfile
+import time
+from pathlib import Path
+
+
+def create_test_audio(duration_seconds, output_path):
+    """Create a silent audio file of specified duration using ffmpeg."""
+    cmd = [
+        "ffmpeg",
+        "-f",
+        "lavfi",
+        "-i",
+        f"anullsrc=r=16000:cl=mono",
+        "-t",
+        str(duration_seconds),
+        "-acodec",
+        "pcm_s16le",
+        "-ar",
+        "16000",
+        "-ac",
+        "1",
+        "-y",
+        output_path,
+    ]
+    result = subprocess.run(cmd, capture_output=True)
+    return result.returncode == 0 and os.path.exists(output_path)
+
+
+def test_chunking():
+    """Test ASR chunking with different audio durations."""
+
+    # Add scripts directory to path
+    sys.path.insert(0, os.path.join(os.path.dirname(__file__), "scripts"))
+
+    # Import after path is set
+    try:
+        from asr_processor import run_asr
+    except ImportError as e:
+        print(f"Failed to import asr_processor: {e}")
+        return False
+
+    test_cases = [
+        (1200, "20 minutes - should use direct transcription"),
+        (1800, "30 minutes - boundary, should use direct"),
+        (1810, "30m10s - should use chunked transcription"),
+        (2400, "40 minutes - should use chunked transcription"),
+    ]
+
+    for duration, description in test_cases:
+        print(f"\n{'=' * 60}")
+        print(f"Test: {description}")
+        print(f"Duration: {duration} seconds ({duration / 60:.1f} minutes)")
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            audio_path = os.path.join(temp_dir, "test_audio.wav")
+            output_path = os.path.join(temp_dir, "output.json")
+
+            print(f"Creating test audio...")
+            if not create_test_audio(duration, audio_path):
+                print(f"Failed to create test audio")
+                continue
+
+            print(f"Running ASR...")
+            start_time = time.time()
+
+            try:
+                # Run ASR
+                success = run_asr(
+                    video_path=None,  # Use audio directly
+                    audio_path=audio_path,
+                    output_path=output_path,
+                    model_size="tiny",
+                    progress=False,  # Don't use Redis publisher
+                )
+                elapsed = time.time() - start_time
+
+                if success and os.path.exists(output_path):
+                    # Load and check result
+                    import json
+
+                    with open(output_path, "r") as f:
+                        data = json.load(f)
+
+                    processing_mode = data.get("processing_mode", "unknown")
+                    chunk_count = data.get("chunk_count", 1)
+
+                    print(f"Result: SUCCESS")
+                    print(f"Processing mode: {processing_mode}")
+                    print(f"Chunk count: {chunk_count}")
+                    print(f"Elapsed time: {elapsed:.2f}s")
+
+                    # Verify expected behavior
+                    if duration <= 1800 and processing_mode != "direct":
+                        print(
+                            f"WARNING: Expected direct transcription but got {processing_mode}"
+                        )
+                    elif duration > 1800 and processing_mode != "chunked":
+                        print(
+                            f"WARNING: Expected chunked transcription but got {processing_mode}"
+                        )
+
+                else:
+                    print(f"Result: FAILED")
+                    print(f"Success flag: {success}")
+                    print(f"Output exists: {os.path.exists(output_path)}")
+
+            except Exception as e:
+                print(f"Exception during ASR: {e}")
+                import traceback
+
+                traceback.print_exc()
+
+    return True
+
+
+if __name__ == "__main__":
+    print("Testing ASR chunking threshold (30 minutes/1800 seconds)")
+    print("This test creates synthetic audio files of various durations")
+    print("and verifies the correct transcription mode is used.\n")
+
+    # Check if ffmpeg is available
+    if subprocess.run(["which", "ffmpeg"], capture_output=True).returncode != 0:
+        print("ERROR: ffmpeg not found in PATH")
+        sys.exit(1)
+
+    success = test_chunking()
+
+    if success:
+        print("\n✅ Chunking threshold test completed")
+        sys.exit(0)
+    else:
+        print("\n❌ Chunking threshold test failed")
+        sys.exit(1)