feat: update Python processors and add utility scripts

- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
This commit is contained in:
Warren
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions
+269
View File
@@ -0,0 +1,269 @@
#!/opt/homebrew/bin/python3.11
"""
Self-implemented ASRX - 自實作說話人分離系統
基於聲紋嵌入 + 譜聚類
技術架構:
1. VAD (Silero VAD) - 語音活動檢測
2. Speaker Encoder (ECAPA-TDNN) - 聲紋特徵提取
3. Spectral Clustering - 譜聚類
4. Post-processing - 後處理
流程:
音頻 → VAD → 語音片段 → 聲紋嵌入 → 相似度矩陣 → 譜聚類 → 說話人 ID
"""
import sys
import json
import time
import numpy as np
from pathlib import Path
# 導入自定義模組
from vad import load_vad_model, extract_speech_segments
from speaker_encoder import (
load_speaker_encoder,
extract_speaker_embeddings_batch,
compute_similarity_matrix,
normalize_embeddings,
)
from speaker_cluster import spectral_clustering_speaker, smooth_speaker_labels
class SelfASRX:
    """Self-implemented speaker diarization pipeline.

    The pipeline runs voice activity detection, extracts one speaker
    embedding per detected speech segment, builds a cosine similarity
    matrix, clusters it with spectral clustering and finally smooths the
    resulting label sequence.

    Attributes are populated by ``__init__``:
        vad_model, vad_utils: objects returned by ``load_vad_model()``.
        speaker_encoder: object returned by ``load_speaker_encoder()``.
    """

    def __init__(self):
        """Load the VAD model and the speaker encoder."""
        print("[SelfASRX] Initializing models...")

        # VAD model (Silero, per the log message below).
        print("[SelfASRX] Loading VAD model (Silero)...")
        self.vad_model, self.vad_utils = load_vad_model()

        # Speaker embedding model (ECAPA-TDNN, per the log message below).
        print("[SelfASRX] Loading speaker encoder (ECAPA-TDNN)...")
        self.speaker_encoder = load_speaker_encoder()

        print("[SelfASRX] Models loaded successfully")

    @staticmethod
    def _slice_audio(wav, speech_segments, sample_rate):
        """Return the slice of ``wav`` for every ``(start_sec, end_sec)`` pair."""
        return [
            wav[int(start_sec * sample_rate):int(end_sec * sample_rate)]
            for start_sec, end_sec in speech_segments
        ]

    @staticmethod
    def _build_segments(speech_segments, speaker_labels):
        """Pair every speech segment with its speaker label as a JSON-ready dict."""
        segments = []
        for index, ((start, end), label) in enumerate(
            zip(speech_segments, speaker_labels)
        ):
            # Coerce to built-in floats: non-native scalars (e.g. numpy.float32)
            # survive round() unchanged and would make json.dump() raise.
            start = float(start)
            end = float(end)
            segments.append(
                {
                    "index": index,
                    "start": round(start, 3),
                    "end": round(end, 3),
                    "duration": round(end - start, 3),
                    "speaker": f"SPEAKER_{int(label)}",
                }
            )
        return segments

    @staticmethod
    def _summarize_speakers(segments):
        """Return per-speaker segment count and total duration in seconds."""
        stats = {}
        for seg in segments:
            entry = stats.setdefault(seg["speaker"], {"count": 0, "duration": 0.0})
            entry["count"] += 1
            entry["duration"] += seg["duration"]
        # Round once at the end so accumulated float noise does not leak
        # into the report (segment durations are rounded to 3 decimals too).
        for entry in stats.values():
            entry["duration"] = round(entry["duration"], 3)
        return stats

    @staticmethod
    def _save_result(result, output_path):
        """Write ``result`` as UTF-8 JSON, creating parent directories as needed."""
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        return output_path

    def process(
        self,
        audio_path,
        output_path=None,
        min_speech_duration_ms=500,
        n_speakers=None,
        smooth_window=5,
    ):
        """Run speaker diarization on one audio file.

        Args:
            audio_path: Path of the audio file to process.
            output_path: Optional path of the JSON file to write the result to.
            min_speech_duration_ms: Minimum speech duration, forwarded to the
                VAD step.
            n_speakers: Number of speakers; ``None`` asks the clustering step
                to estimate it.
            smooth_window: Window size for label smoothing; smoothing is
                skipped unless this is greater than 1.

        Returns:
            A dict with the keys ``audio_path``, ``total_duration``,
            ``n_speech_segments``, ``n_speakers``, ``segments``,
            ``speaker_stats``, ``processing_time`` and ``realtime_factor``.
            When no speech is detected the dict is
            ``{"error": "No speech detected", "segments": []}`` and nothing is
            written to ``output_path``.
        """
        # perf_counter() is monotonic, so elapsed times cannot go negative
        # when the system clock is adjusted during processing.
        start_time = time.perf_counter()
        print(f"\n[SelfASRX] Processing: {audio_path}")
        print("=" * 60)

        # Step 1: voice activity detection.
        print("\n[Step 1] Voice Activity Detection...")
        step1_start = time.perf_counter()
        speech_segments, wav, sample_rate = extract_speech_segments(
            audio_path,
            self.vad_model,
            self.vad_utils,
            min_speech_duration_ms=min_speech_duration_ms,
        )
        step1_time = time.perf_counter() - step1_start
        total_duration = float(len(wav) / sample_rate)
        print(f" Speech segments: {len(speech_segments)}")
        print(f" Total duration: {total_duration:.2f}s")
        print(f" VAD time: {step1_time:.2f}s")

        # len() rather than truthiness: the container type returned by the
        # VAD helper is not known here and may not support bool().
        if len(speech_segments) == 0:
            print("[SelfASRX] No speech detected!")
            return {"error": "No speech detected", "segments": []}

        # Step 2: one speaker embedding per speech segment.
        print("\n[Step 2] Speaker embedding extraction...")
        step2_start = time.perf_counter()
        audio_segments = self._slice_audio(wav, speech_segments, sample_rate)
        embeddings = extract_speaker_embeddings_batch(
            self.speaker_encoder, audio_segments, sample_rate
        )
        embeddings = normalize_embeddings(embeddings)
        step2_time = time.perf_counter() - step2_start
        print(f" Embedding shape: {embeddings.shape}")
        print(f" Embedding time: {step2_time:.2f}s")

        # Step 3: pairwise similarity between segment embeddings.
        print("\n[Step 3] Computing similarity matrix...")
        step3_start = time.perf_counter()
        similarity_matrix = compute_similarity_matrix(embeddings, method="cosine")
        step3_time = time.perf_counter() - step3_start
        print(f" Similarity matrix shape: {similarity_matrix.shape}")
        print(f" Similarity time: {step3_time:.2f}s")

        # Step 4: spectral clustering followed by optional label smoothing.
        print("\n[Step 4] Spectral clustering...")
        step4_start = time.perf_counter()
        speaker_labels, estimated_n_speakers = spectral_clustering_speaker(
            similarity_matrix,
            n_speakers=n_speakers,
            auto_estimate=(n_speakers is None),
        )
        if smooth_window > 1:
            speaker_labels = smooth_speaker_labels(
                speaker_labels, window_size=smooth_window
            )
        step4_time = time.perf_counter() - step4_start
        print(f" Estimated speakers: {estimated_n_speakers}")
        print(f" Clustering time: {step4_time:.2f}s")

        # Step 5: assemble the result.
        print("\n[Step 5] Building output...")
        result = {
            "audio_path": str(audio_path),
            "total_duration": total_duration,
            "n_speech_segments": len(speech_segments),
            "n_speakers": int(estimated_n_speakers),
            "segments": self._build_segments(speech_segments, speaker_labels),
        }
        result["speaker_stats"] = self._summarize_speakers(result["segments"])

        total_time = time.perf_counter() - start_time
        result["processing_time"] = round(total_time, 2)
        # Audio seconds processed per second of compute; guard the division
        # so a zero elapsed reading cannot raise ZeroDivisionError.
        result["realtime_factor"] = (
            round(total_duration / total_time, 2) if total_time > 0 else 0.0
        )

        print("\n[SelfASRX] Processing completed!")
        print(f" Total time: {total_time:.2f}s")
        print(f" Realtime factor: {result['realtime_factor']:.2f}x")
        print(f" Detected speakers: {estimated_n_speakers}")

        # Persist the result when an output path was requested.
        if output_path:
            output_path = self._save_result(result, output_path)
            print(f" Results saved to: {output_path}")

        print("=" * 60)
        return result
def main():
    """Command-line entry point: parse arguments, run diarization, print a summary.

    Exits with status 1 (message on stderr) when the audio path is not an
    existing regular file. When processing reports an error (for example no
    speech detected) the summary is skipped.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Self-implemented ASRX - Speaker Diarization"
    )
    parser.add_argument("audio_path", help="Path to audio file")
    parser.add_argument("-o", "--output", help="Output JSON path")
    parser.add_argument(
        "--min-speech-duration",
        type=int,
        default=500,
        help="Minimum speech duration in ms (default: 500)",
    )
    parser.add_argument(
        "--n-speakers",
        type=int,
        default=None,
        help="Number of speakers (default: auto-estimate)",
    )
    parser.add_argument(
        "--smooth-window",
        type=int,
        default=5,
        help="Smoothing window size (default: 5)",
    )
    args = parser.parse_args()

    # Validate the input before loading any model. is_file() also rejects
    # directories, which exists() would let through to the audio loader.
    if not Path(args.audio_path).is_file():
        # Diagnostics belong on stderr so stdout stays clean for consumers.
        print(f"Error: Audio file not found: {args.audio_path}", file=sys.stderr)
        sys.exit(1)

    # Build the pipeline and process the file.
    asrx = SelfASRX()
    result = asrx.process(
        args.audio_path,
        args.output,
        min_speech_duration_ms=args.min_speech_duration,
        n_speakers=args.n_speakers,
        smooth_window=args.smooth_window,
    )

    # Nothing to summarize when processing reported an error.
    if "error" in result:
        return

    total_duration = result["total_duration"]
    print("\n[Summary]")
    print(f" Audio duration: {total_duration:.2f}s")
    print(f" Speech segments: {result['n_speech_segments']}")
    print(f" Detected speakers: {result['n_speakers']}")
    print(f" Processing time: {result['processing_time']:.2f}s")
    print(f" Realtime factor: {result['realtime_factor']:.2f}x")

    print("\n[Speaker Statistics]")
    for speaker, stats in result["speaker_stats"].items():
        # Share of the whole recording attributed to this speaker; guard
        # against a zero-length recording.
        pct = stats["duration"] / total_duration * 100 if total_duration else 0.0
        print(
            f" {speaker}: {stats['count']} segments, "
            f"{stats['duration']:.2f}s ({pct:.1f}%)"
        )
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()