feat: update Python processors and add utility scripts

- Update ASR, face, OCR, pose processors - Add release pre-flight check script - Add synonym generation, chunk processing scripts - Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions
@@ -0,0 +1,269 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Self-implemented ASRX - 自實作說話人分離系統
+基於聲紋嵌入 + 譜聚類
+
+技術架構:
+1. VAD (Silero VAD) - 語音活動檢測
+2. Speaker Encoder (ECAPA-TDNN) - 聲紋特徵提取
+3. Spectral Clustering - 譜聚類
+4. Post-processing - 後處理
+
+流程:
+音頻 → VAD → 語音片段 → 聲紋嵌入 → 相似度矩陣 → 譜聚類 → 說話人 ID
+"""
+
+import sys
+import json
+import time
+import numpy as np
+from pathlib import Path
+
+# 導入自定義模組
+from vad import load_vad_model, extract_speech_segments
+from speaker_encoder import (
+    load_speaker_encoder,
+    extract_speaker_embeddings_batch,
+    compute_similarity_matrix,
+    normalize_embeddings,
+)
+from speaker_cluster import spectral_clustering_speaker, smooth_speaker_labels
+
+
+class SelfASRX:
+    """
+    自實作說話人分離系統
+    """
+
+    def __init__(self):
+        """初始化模型"""
+        print("[SelfASRX] Initializing models...")
+
+        # 載入 VAD 模型
+        print("[SelfASRX] Loading VAD model (Silero)...")
+        self.vad_model, self.vad_utils = load_vad_model()
+
+        # 載入聲紋模型
+        print("[SelfASRX] Loading speaker encoder (ECAPA-TDNN)...")
+        self.speaker_encoder = load_speaker_encoder()
+
+        print("[SelfASRX] Models loaded successfully")
+
+    def process(
+        self,
+        audio_path,
+        output_path=None,
+        min_speech_duration_ms=500,
+        n_speakers=None,
+        smooth_window=5,
+    ):
+        """
+        處理音頻文件進行說話人分離
+
+        Args:
+            audio_path: 音頻文件路徑
+            output_path: 輸出 JSON 路徑（可選）
+            min_speech_duration_ms: 最小語音持續時間
+            n_speakers: 說話人數量（None=自動估計）
+            smooth_window: 平滑窗口大小
+
+        Returns:
+            result: 說話人分離結果
+        """
+        start_time = time.time()
+        print(f"\n[SelfASRX] Processing: {audio_path}")
+        print("=" * 60)
+
+        # 步驟 1: VAD - 語音活動檢測
+        print("\n[Step 1] Voice Activity Detection...")
+        step1_start = time.time()
+
+        speech_segments, wav, sample_rate = extract_speech_segments(
+            audio_path,
+            self.vad_model,
+            self.vad_utils,
+            min_speech_duration_ms=min_speech_duration_ms,
+        )
+
+        step1_time = time.time() - step1_start
+        print(f"  Speech segments: {len(speech_segments)}")
+        print(f"  Total duration: {len(wav) / sample_rate:.2f}s")
+        print(f"  VAD time: {step1_time:.2f}s")
+
+        if len(speech_segments) == 0:
+            print("[SelfASRX] No speech detected!")
+            return {"error": "No speech detected", "segments": []}
+
+        # 步驟 2: 聲紋特徵提取
+        print("\n[Step 2] Speaker embedding extraction...")
+        step2_start = time.time()
+
+        # 提取語音片段音頻
+        audio_segments = []
+        for start_sec, end_sec in speech_segments:
+            start_sample = int(start_sec * sample_rate)
+            end_sample = int(end_sec * sample_rate)
+            audio_segments.append(wav[start_sample:end_sample])
+
+        # 批量提取嵌入
+        embeddings = extract_speaker_embeddings_batch(
+            self.speaker_encoder, audio_segments, sample_rate
+        )
+
+        # 正規化
+        embeddings = normalize_embeddings(embeddings)
+
+        step2_time = time.time() - step2_start
+        print(f"  Embedding shape: {embeddings.shape}")
+        print(f"  Embedding time: {step2_time:.2f}s")
+
+        # 步驟 3: 計算相似度矩陣
+        print("\n[Step 3] Computing similarity matrix...")
+        step3_start = time.time()
+
+        similarity_matrix = compute_similarity_matrix(embeddings, method="cosine")
+
+        step3_time = time.time() - step3_start
+        print(f"  Similarity matrix shape: {similarity_matrix.shape}")
+        print(f"  Similarity time: {step3_time:.2f}s")
+
+        # 步驟 4: 譜聚類
+        print("\n[Step 4] Spectral clustering...")
+        step4_start = time.time()
+
+        speaker_labels, estimated_n_speakers = spectral_clustering_speaker(
+            similarity_matrix, n_speakers=n_speakers, auto_estimate=(n_speakers is None)
+        )
+
+        # 平滑標籤
+        if smooth_window > 1:
+            speaker_labels = smooth_speaker_labels(
+                speaker_labels, window_size=smooth_window
+            )
+
+        step4_time = time.time() - step4_start
+        print(f"  Estimated speakers: {estimated_n_speakers}")
+        print(f"  Clustering time: {step4_time:.2f}s")
+
+        # 步驟 5: 建立輸出結果
+        print("\n[Step 5] Building output...")
+
+        result = {
+            "audio_path": str(audio_path),
+            "total_duration": len(wav) / sample_rate,
+            "n_speech_segments": len(speech_segments),
+            "n_speakers": int(estimated_n_speakers),
+            "segments": [],
+        }
+
+        for i, ((start, end), label) in enumerate(zip(speech_segments, speaker_labels)):
+            result["segments"].append(
+                {
+                    "index": i,
+                    "start": round(start, 3),
+                    "end": round(end, 3),
+                    "duration": round(end - start, 3),
+                    "speaker": f"SPEAKER_{int(label)}",
+                }
+            )
+
+        # 統計每個說話人的總時長
+        speaker_stats = {}
+        for seg in result["segments"]:
+            speaker = seg["speaker"]
+            if speaker not in speaker_stats:
+                speaker_stats[speaker] = {"count": 0, "duration": 0}
+            speaker_stats[speaker]["count"] += 1
+            speaker_stats[speaker]["duration"] += seg["duration"]
+
+        result["speaker_stats"] = speaker_stats
+
+        total_time = time.time() - start_time
+        result["processing_time"] = round(total_time, 2)
+        result["realtime_factor"] = round(result["total_duration"] / total_time, 2)
+
+        print(f"\n[SelfASRX] Processing completed!")
+        print(f"  Total time: {total_time:.2f}s")
+        print(f"  Realtime factor: {result['realtime_factor']:.2f}x")
+        print(f"  Detected speakers: {estimated_n_speakers}")
+
+        # 保存結果
+        if output_path:
+            output_path = Path(output_path)
+            output_path.parent.mkdir(parents=True, exist_ok=True)
+
+            with open(output_path, "w", encoding="utf-8") as f:
+                json.dump(result, f, indent=2, ensure_ascii=False)
+
+            print(f"  Results saved to: {output_path}")
+
+        print("=" * 60)
+
+        return result
+
+
+def main():
+    """主函數"""
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Self-implemented ASRX - Speaker Diarization"
+    )
+    parser.add_argument("audio_path", help="Path to audio file")
+    parser.add_argument("-o", "--output", help="Output JSON path")
+    parser.add_argument(
+        "--min-speech-duration",
+        type=int,
+        default=500,
+        help="Minimum speech duration in ms (default: 500)",
+    )
+    parser.add_argument(
+        "--n-speakers",
+        type=int,
+        default=None,
+        help="Number of speakers (default: auto-estimate)",
+    )
+    parser.add_argument(
+        "--smooth-window",
+        type=int,
+        default=5,
+        help="Smoothing window size (default: 5)",
+    )
+
+    args = parser.parse_args()
+
+    # 檢查文件是否存在
+    if not Path(args.audio_path).exists():
+        print(f"Error: Audio file not found: {args.audio_path}")
+        sys.exit(1)
+
+    # 創建 ASRX 實例並處理
+    asrx = SelfASRX()
+    result = asrx.process(
+        args.audio_path,
+        args.output,
+        min_speech_duration_ms=args.min_speech_duration,
+        n_speakers=args.n_speakers,
+        smooth_window=args.smooth_window,
+    )
+
+    # 顯示結果摘要
+    if "error" not in result:
+        print(f"\n[Summary]")
+        print(f"  Audio duration: {result['total_duration']:.2f}s")
+        print(f"  Speech segments: {result['n_speech_segments']}")
+        print(f"  Detected speakers: {result['n_speakers']}")
+        print(f"  Processing time: {result['processing_time']:.2f}s")
+        print(f"  Realtime factor: {result['realtime_factor']:.2f}x")
+
+        print(f"\n[Speaker Statistics]")
+        for speaker, stats in result["speaker_stats"].items():
+            pct = stats["duration"] / result["total_duration"] * 100
+            print(
+                f"  {speaker}: {stats['count']} segments, "
+                + f"{stats['duration']:.2f}s ({pct:.1f}%)"
+            )
+
+
+# Run the CLI only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()