- Update ASR, face, OCR, pose processors - Add release pre-flight check script - Add synonym generation, chunk processing scripts - Add face recognition, stamp search utilities
142 lines
4.1 KiB
Python
142 lines
4.1 KiB
Python
#!/opt/homebrew/bin/python3.11
|
|
"""
|
|
ASRX Processor - Custom Implementation Wrapper
|
|
Uses SpeechBrain ECAPA-TDNN (no HuggingFace token required)
|
|
"""
|
|
|
|
import sys
|
|
import json
|
|
import argparse
|
|
import os
|
|
from pathlib import Path
|
|
|
|
# Put this file's directory and its "asrx_self" subdirectory at the front of
# sys.path (presumably so the project-local imports below resolve -- confirm).
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, _SCRIPT_DIR)
# Inserted second, so it ends up ahead of the script directory on sys.path.
sys.path.insert(0, os.path.join(_SCRIPT_DIR, "asrx_self"))
|
|
|
|
from redis_publisher import RedisPublisher
|
|
|
|
|
|
def process_asrx_custom(video_path: str, output_path: str, uuid: str = "") -> dict:
    """Run speaker diarization on a media file and save the result as JSON.

    Args:
        video_path: Path of the video/audio file handed to the ASRX processor.
        output_path: Path of the JSON file that receives the result.
        uuid: When non-empty, progress is published through ``RedisPublisher``
            under this id; when empty, nothing is published.

    Returns:
        The dict that was written to ``output_path``. It always has
        ``"language"`` (always ``None``) and ``"segments"``, a list of dicts
        with ``start``, ``end``, ``text`` (always ``""``) and ``speaker_id``.
        ``"speaker_stats"`` is copied through when the processor returns it.
        If the processor reports an error or anything raises during
        processing, an empty result (``"segments": []``) is written and
        returned instead.

    Raises:
        Exceptions raised while constructing the publisher, while publishing
        the start event, or while writing/publishing the fallback result are
        not caught here and propagate to the caller.
    """
    publisher = RedisPublisher(uuid) if uuid else None
    if publisher:
        publisher.info("asrx", "ASRX_START")

    try:
        # Imported inside the try block so that an import failure goes through
        # the same empty-result fallback as any other processing error.
        from asrx_self.main_fixed import SelfASRXFixed

        if publisher:
            publisher.info("asrx", "ASRX_LOADING_MODEL")

        # Initialize custom ASRX processor
        asrx = SelfASRXFixed()

        if publisher:
            publisher.info("asrx", "ASRX_TRANSCRIBING")

        # Process video/audio; output_path=None because this wrapper saves its
        # own output format below.
        result = asrx.process(
            video_path,
            output_path=None,
            min_speech_duration_ms=500,
            max_speakers=10,
        )

        if "error" in result:
            if publisher:
                publisher.error("asrx", result["error"])
            return _finish_with_empty_result(output_path, publisher)

        # Convert to the output format (the original comment calls this the
        # Rust-expected format). "text" is left empty; per the original
        # comment it is filled later by matching with ASR.
        segments = [
            {
                "start": seg["start"],
                "end": seg["end"],
                "text": "",
                "speaker_id": seg["speaker"],
            }
            for seg in result["segments"]
        ]
        output_result = {
            "language": None,  # Custom implementation doesn't detect language
            "segments": segments,
        }

        # Add speaker_stats as optional metadata
        if "speaker_stats" in result:
            output_result["speaker_stats"] = result["speaker_stats"]

        segment_count = len(segments)
        if publisher:
            publisher.info("asrx", f"ASRX_COMPLETE:{segment_count}")

        # Save output
        _write_asrx_json(output_path, output_result)

        if publisher:
            publisher.complete("asrx", f"{segment_count} segments")

        print(f"[ASRX-Custom] Saved {segment_count} segments to {output_path}")

        return output_result

    except Exception as e:
        if publisher:
            publisher.error("asrx", str(e))

        import traceback

        traceback.print_exc()

        # Return empty result on error
        return _finish_with_empty_result(output_path, publisher)


def _write_asrx_json(output_path: str, payload: dict) -> None:
    """Serialize ``payload`` to ``output_path`` as indented JSON."""
    # json.dump escapes non-ASCII characters by default, so pinning UTF-8 only
    # removes the dependency on the locale's default encoding.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)


def _finish_with_empty_result(output_path: str, publisher) -> dict:
    """Write the empty fallback result, publish completion, and return it."""
    empty_result = {"language": None, "segments": []}
    _write_asrx_json(output_path, empty_result)
    if publisher:
        publisher.complete("asrx", "0 segments")
    return empty_result
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse arguments, check that the input exists,
    # run the diarization wrapper, then print a short summary of the result.
    cli = argparse.ArgumentParser(
        description="ASRX Processor (Custom Implementation)"
    )
    cli.add_argument("video_path", help="Path to video/audio file")
    cli.add_argument("output_path", help="Path to output JSON file")
    cli.add_argument("--uuid", help="UUID for Redis publishing", default="")
    opts = cli.parse_args()

    # Bail out with exit status 1 before any processing if the input is missing.
    input_exists = Path(opts.video_path).exists()
    if not input_exists:
        print(f"Error: Video file not found: {opts.video_path}")
        sys.exit(1)

    summary = process_asrx_custom(opts.video_path, opts.output_path, opts.uuid)

    print("\n[Summary]")
    segment_total = len(summary["segments"])
    print(f" Total segments: {segment_total}")

    # speaker_stats is only present when the processor supplied it.
    if "speaker_stats" in summary:
        per_speaker = summary["speaker_stats"]
        print(f" Detected speakers: {len(per_speaker)}")
        for speaker, stats in per_speaker.items():
            print(f" {speaker}: {stats['count']} segments")
|