- Update ASR, face, OCR, pose processors - Add release pre-flight check script - Add synonym generation, chunk processing scripts - Add face recognition, stamp search utilities
199 lines
6.8 KiB
Python
Executable File
199 lines
6.8 KiB
Python
Executable File
#!/opt/homebrew/bin/python3.11
|
|
"""
|
|
Self-implemented ASRX - Fixed Version

Uses a robust clustering algorithm.
|
|
"""
|
|
|
|
import sys
|
|
import json
|
|
import time
|
|
import numpy as np
|
|
from pathlib import Path
|
|
|
|
# 導入自定義模組
|
|
from vad import load_vad_model, extract_speech_segments
|
|
from speaker_encoder import (
|
|
load_speaker_encoder,
|
|
extract_speaker_embeddings_batch,
|
|
normalize_embeddings
|
|
)
|
|
from speaker_cluster_fixed import robust_speaker_clustering
|
|
|
|
|
|
class SelfASRXFixed:
    """Self-implemented speaker diarization system (fixed version).

    Chains the helpers imported by this module: voice activity detection,
    speaker-embedding extraction, embedding normalization and robust
    clustering. The models are loaded once in ``__init__`` and reused by
    every ``process`` call.
    """

    def __init__(self):
        """Load the VAD model and the speaker encoder."""
        print("[SelfASRX-Fixed] Initializing models...")

        # Load the VAD model (the log message names it as Silero).
        print("[SelfASRX-Fixed] Loading VAD model (Silero)...")
        self.vad_model, self.vad_utils = load_vad_model()

        # Load the speaker-embedding model (the log message names ECAPA-TDNN).
        print("[SelfASRX-Fixed] Loading speaker encoder (ECAPA-TDNN)...")
        self.speaker_encoder = load_speaker_encoder()

        print("[SelfASRX-Fixed] Models loaded successfully")

    @staticmethod
    def _build_segments(speech_segments, speaker_labels):
        """Pair each (start, end) span with its cluster label as JSON-ready dicts.

        Times are cast to built-in ``float`` before rounding so that scalar
        types coming from the VAD helper (e.g. NumPy float32) cannot leak
        into the result and break ``json.dump``. Labels were already cast
        with ``int`` in the original code; this makes the times consistent.
        """
        segments = []
        for index, ((start, end), label) in enumerate(
                zip(speech_segments, speaker_labels)):
            start, end = float(start), float(end)
            segments.append({
                "index": index,
                "start": round(start, 3),
                "end": round(end, 3),
                "duration": round(end - start, 3),
                "speaker": f"SPEAKER_{int(label)}",
            })
        return segments

    @staticmethod
    def _speaker_stats(segments):
        """Return per-speaker segment count and total duration.

        The summed duration is rounded to 3 decimals, like every other time
        value in the result, so float accumulation noise
        (0.1 + 0.2 -> 0.30000000000000004) does not end up in the output.
        """
        stats = {}
        for seg in segments:
            entry = stats.setdefault(seg["speaker"], {"count": 0, "duration": 0})
            entry["count"] += 1
            entry["duration"] += seg["duration"]
        for entry in stats.values():
            entry["duration"] = round(entry["duration"], 3)
        return stats

    def process(self, audio_path, output_path=None,
                min_speech_duration_ms=500,
                n_speakers=None,
                max_speakers=10):
        """Process one audio file and return the diarization result.

        Args:
            audio_path: Path of the audio file, forwarded to
                ``extract_speech_segments``.
            output_path: Optional path of a JSON file to write the result
                to; missing parent directories are created.
            min_speech_duration_ms: Forwarded to ``extract_speech_segments``.
            n_speakers: Forwarded to ``robust_speaker_clustering``.
            max_speakers: Forwarded to ``robust_speaker_clustering``.

        Returns:
            A dict with the keys ``audio_path``, ``total_duration``,
            ``n_speech_segments``, ``n_speakers``, ``segments``,
            ``speaker_stats``, ``processing_time`` and ``realtime_factor``
            (audio duration divided by processing time). When the VAD step
            yields no segments, ``{"error": "No speech detected",
            "segments": []}`` is returned instead and nothing is written to
            ``output_path``.
        """
        # perf_counter is monotonic, so elapsed times cannot be distorted by
        # system clock adjustments the way time.time() differences can.
        start_time = time.perf_counter()
        print(f"\n[SelfASRX-Fixed] Processing: {audio_path}")
        print("=" * 60)

        # Step 1: VAD
        print("\n[Step 1] Voice Activity Detection...")
        step1_start = time.perf_counter()

        speech_segments, wav, sample_rate = extract_speech_segments(
            audio_path, self.vad_model, self.vad_utils,
            min_speech_duration_ms=min_speech_duration_ms
        )

        step1_time = time.perf_counter() - step1_start
        total_duration = len(wav) / sample_rate
        print(f" Speech segments: {len(speech_segments)}")
        print(f" Total duration: {total_duration:.2f}s")
        print(f" VAD time: {step1_time:.2f}s")

        # len() rather than truthiness: the container type returned by the
        # VAD helper is not visible here and may not support bool().
        if len(speech_segments) == 0:
            print("[SelfASRX-Fixed] No speech detected!")
            return {"error": "No speech detected", "segments": []}

        # Step 2: speaker embedding extraction
        print("\n[Step 2] Speaker embedding extraction...")
        step2_start = time.perf_counter()

        # Cut the waveform into one slice per speech segment; segment bounds
        # are treated as seconds and converted to sample indices.
        audio_segments = [
            wav[int(start_sec * sample_rate):int(end_sec * sample_rate)]
            for start_sec, end_sec in speech_segments
        ]

        # Extract embeddings in one batch, then normalize them.
        embeddings = extract_speaker_embeddings_batch(
            self.speaker_encoder, audio_segments, sample_rate
        )
        embeddings = normalize_embeddings(embeddings)

        step2_time = time.perf_counter() - step2_start
        print(f" Embedding shape: {embeddings.shape}")
        print(f" Embedding time: {step2_time:.2f}s")

        # Step 3: robust clustering
        print("\n[Step 3] Robust speaker clustering...")
        step3_start = time.perf_counter()

        speaker_labels, estimated_n_speakers = robust_speaker_clustering(
            embeddings,
            n_speakers=n_speakers,
            max_speakers=max_speakers
        )

        step3_time = time.perf_counter() - step3_start
        print(f" Clustering time: {step3_time:.2f}s")

        # Step 4: build the output
        print("\n[Step 4] Building output...")

        result = {
            "audio_path": str(audio_path),
            "total_duration": total_duration,
            "n_speech_segments": len(speech_segments),
            "n_speakers": int(estimated_n_speakers),
            "segments": self._build_segments(speech_segments, speaker_labels),
        }

        # Total segment count and speaking time per speaker.
        result["speaker_stats"] = self._speaker_stats(result["segments"])

        total_time = time.perf_counter() - start_time
        result["processing_time"] = round(total_time, 2)
        result["realtime_factor"] = round(result["total_duration"] / total_time, 2)

        print("\n[SelfASRX-Fixed] Processing completed!")
        print(f" Total time: {total_time:.2f}s")
        print(f" Realtime factor: {result['realtime_factor']:.2f}x")
        print(f" Detected speakers: {estimated_n_speakers}")

        # Save the result
        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)

            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2, ensure_ascii=False)

            print(f" Results saved to: {output_path}")

        print("=" * 60)

        return result
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Parses the arguments, exits with status 1 when the audio file does not
    exist, runs ``SelfASRXFixed.process`` and, unless the result carries an
    ``"error"`` key, prints a summary followed by per-speaker statistics.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Self-implemented ASRX (Fixed)")
    cli.add_argument("audio_path", help="Path to audio file")
    cli.add_argument("-o", "--output", help="Output JSON path")
    # The three tuning options are all plain integers with a default.
    for flag, default in (
        ("--min-speech-duration", 500),
        ("--n-speakers", None),
        ("--max-speakers", 10),
    ):
        cli.add_argument(flag, type=int, default=default)

    opts = cli.parse_args()

    # Check the input before the models are loaded.
    if not Path(opts.audio_path).exists():
        print(f"Error: Audio file not found: {opts.audio_path}")
        sys.exit(1)

    pipeline = SelfASRXFixed()
    result = pipeline.process(
        opts.audio_path,
        opts.output,
        min_speech_duration_ms=opts.min_speech_duration,
        n_speakers=opts.n_speakers,
        max_speakers=opts.max_speakers,
    )

    # An error result has none of the summary keys, so there is nothing to report.
    if "error" in result:
        return

    audio_seconds = result['total_duration']

    print("\n[Summary]")
    print(f" Audio duration: {audio_seconds:.2f}s")
    print(f" Speech segments: {result['n_speech_segments']}")
    print(f" Detected speakers: {result['n_speakers']}")
    print(f" Processing time: {result['processing_time']:.2f}s")
    print(f" Realtime factor: {result['realtime_factor']:.2f}x")

    print("\n[Speaker Statistics]")
    for speaker, stats in result['speaker_stats'].items():
        # Share of the whole audio duration attributed to this speaker.
        pct = stats['duration'] / audio_seconds * 100
        print(
            f" {speaker}: {stats['count']} segments, "
            f"{stats['duration']:.2f}s ({pct:.1f}%)"
        )
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|