- Add database migrations (006-028) for face recognition, identity, file_uuid - Add test scripts for ASR, face, search, processing - Add portal frontend (Tauri) - Add config, benchmark, and monitoring utilities - Add model checkpoints and pretrained model references
151 lines
4.1 KiB
Python
151 lines
4.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Debug ASR processing stages for large video.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import time
|
|
import subprocess
|
|
import tempfile
|
|
import json
|
|
from pathlib import Path
|
|
|
|
|
|
def run_ffmpeg_extract(video_path, audio_path):
    """Extract the audio of a video into a 16 kHz mono 16-bit PCM WAV file.

    Runs ``ffmpeg`` as a subprocess, overwriting ``audio_path`` if it already
    exists (``-y``), and prints the command line, the elapsed time and the
    return code.

    Args:
        video_path: Path of the input video (rendered with ``str()``).
        audio_path: Path of the WAV file to write (rendered with ``str()``).

    Returns:
        A ``(success, elapsed)`` tuple. ``success`` is True only when ffmpeg
        exited with return code 0; ``elapsed`` is the wall-clock duration of
        the attempt in seconds. A failure to launch ffmpeg at all (for
        example when it is not installed) is reported as ``success=False``
        rather than raised.
    """
    cmd = [
        "ffmpeg",
        "-i",
        str(video_path),
        "-vn",
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        "-y",
        str(audio_path),
    ]
    print(f"Running ffmpeg: {' '.join(cmd)}")

    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments during a long extraction.
    start = time.perf_counter()
    try:
        proc = subprocess.run(
            cmd,
            # ffmpeg reads interactive commands from stdin by default; give it
            # nothing to read so it cannot consume or block on the caller's stdin.
            stdin=subprocess.DEVNULL,
            capture_output=True,
            text=True,
            # Media metadata echoed by ffmpeg may contain bytes that are not
            # valid in the locale encoding; never fail on decoding diagnostics.
            errors="replace",
        )
    except OSError as exc:
        elapsed = time.perf_counter() - start
        print(f"ffmpeg could not be started: {exc}")
        return False, elapsed
    elapsed = time.perf_counter() - start

    print(f"ffmpeg completed in {elapsed:.1f}s, return code: {proc.returncode}")
    if proc.returncode != 0:
        # ffmpeg prints its version/configuration banner first and the actual
        # error last, so the tail of stderr is the informative part.
        print(f"stderr: {proc.stderr[-500:]}")
    return proc.returncode == 0, elapsed
|
|
|
|
|
|
def _remove_temp_file(path):
    """Best-effort removal of a temporary file; a missing file is not an error."""
    try:
        os.unlink(path)
    except FileNotFoundError:
        pass
    except OSError as exc:
        print(f"Warning: could not remove temporary file {path}: {exc}")


def test_asr_stages(video_path):
    """Run the ASR pipeline stage by stage on one video, printing timings.

    Stages:
        1. Report the audio streams of the video with ``ffprobe``.
        2. Extract the audio to a temporary WAV file via ``run_ffmpeg_extract``.
        3. Import ``faster_whisper`` and time the import.
        4. Trim the first 30 seconds, load the ``tiny`` Whisper model on CPU
           (int8) and transcribe the trimmed clip.

    A failure in stage 2 or 3 stops the run early. A failure in stage 4 is
    printed together with its traceback. Both temporary WAV files are removed
    on every exit path.

    Args:
        video_path: Path to the video file to inspect.

    Returns:
        None. All results are printed to stdout.

    Raises:
        OSError: If ``video_path`` cannot be stat'ed (e.g. it does not exist).
    """
    video_path = Path(video_path)
    print(f"Testing video: {video_path}")
    print(f"Size: {video_path.stat().st_size / 1024 / 1024:.1f} MB")

    # Stage 1: Check audio streams
    print("\n=== Stage 1: Check audio streams ===")
    probe_cmd = [
        "ffprobe",
        "-v",
        "error",
        "-select_streams",
        "a",
        "-show_entries",
        "stream=codec_name,channels,sample_rate,duration",
        "-of",
        "csv=p=0",
        str(video_path),
    ]
    try:
        probe = subprocess.run(
            probe_cmd, capture_output=True, text=True, errors="replace"
        )
    except OSError as exc:
        # This stage is informational only; keep going without ffprobe.
        print(f"ffprobe could not be started: {exc}")
    else:
        print(f"Audio streams: {probe.stdout.strip()}")
        if probe.returncode != 0:
            print(f"ffprobe stderr: {probe.stderr.strip()[-500:]}")

    # Stage 2: Extract audio
    print("\n=== Stage 2: Extract audio ===")
    # delete=False: only the unique name is needed here; ffmpeg writes the
    # file itself and the finally block below removes it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        audio_path = f.name
    trim_path = audio_path + ".trim.wav"

    try:
        try:
            success, extract_time = run_ffmpeg_extract(video_path, audio_path)
            if not success:
                print("Audio extraction failed")
                return
            print(f"Audio extracted to {audio_path}")
            print(
                f"Audio size: {Path(audio_path).stat().st_size / 1024 / 1024:.1f} MB"
            )
        except Exception as e:
            print(f"Error extracting audio: {e}")
            return

        # Stage 3: Load faster_whisper model (just import)
        print("\n=== Stage 3: Test faster_whisper import ===")
        try:
            start = time.perf_counter()
            from faster_whisper import WhisperModel

            elapsed = time.perf_counter() - start
            print(f"Import faster_whisper: {elapsed:.1f}s")
        except Exception as e:
            print(f"Import failed: {e}")
            return

        # Stage 4: Transcribe a small segment (first 30 seconds)
        print("\n=== Stage 4: Transcribe first 30 seconds ===")
        try:
            # Trim audio to first 30 seconds
            trim_cmd = [
                "ffmpeg",
                "-i",
                audio_path,
                "-t",
                "30",
                "-acodec",
                "pcm_s16le",
                "-ar",
                "16000",
                "-ac",
                "1",
                "-y",
                trim_path,
            ]
            trim = subprocess.run(
                trim_cmd,
                stdin=subprocess.DEVNULL,
                capture_output=True,
                text=True,
                errors="replace",
            )
            if trim.returncode != 0:
                # Fail here with ffmpeg's own message instead of letting the
                # transcription step trip over a missing or truncated file.
                raise RuntimeError(
                    f"ffmpeg trim failed with return code {trim.returncode}: "
                    f"{trim.stderr[-500:]}"
                )

            # Load model with small model
            start = time.perf_counter()
            model = WhisperModel("tiny", device="cpu", compute_type="int8")
            load_time = time.perf_counter() - start
            print(f"Model loaded in {load_time:.1f}s")

            # Transcribe
            start = time.perf_counter()
            segments, info = model.transcribe(trim_path, beam_size=5)
            # transcribe() yields segments lazily; consume them so the timing
            # covers the actual decoding work.
            segments = list(segments)
            transcribe_time = time.perf_counter() - start
            print(f"Transcription of 30s audio: {transcribe_time:.1f}s")
            print(
                f"Detected language: {info.language} with probability {info.language_probability}"
            )
            print(f"Segments found: {len(segments)}")
        except Exception as e:
            print(f"Transcription test failed: {e}")
            import traceback

            traceback.print_exc()
    finally:
        # Single cleanup point: runs on success, on early return and on error.
        _remove_temp_file(trim_path)
        _remove_temp_file(audio_path)

    print("\n=== Debug complete ===")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: exactly one positional argument (the video file)
    # is expected; anything else prints the usage line and exits with 1.
    argv = sys.argv
    if len(argv) == 2:
        test_asr_stages(argv[1])
    else:
        print(f"Usage: {argv[0]} <video_file>")
        sys.exit(1)
|