- Update ASR, face, OCR, pose processors - Add release pre-flight check script - Add synonym generation, chunk processing scripts - Add face recognition, stamp search utilities
120 lines
3.2 KiB
Python
120 lines
3.2 KiB
Python
#!/opt/homebrew/bin/python3.11
"""
Test the multilingual speaker-diarization capability of pyannote.audio.
"""

print("=== pyannote.audio 多語種測試 ===\n")

# 1. Check the installed pyannote.audio version.
# The top-level `pyannote` name is only a namespace shared by the pyannote.*
# distributions and does not carry `__version__`; the attribute lives on the
# `pyannote.audio` subpackage, so that is what gets imported and queried.
try:
    import pyannote.audio
except Exception as e:  # heavy native deps can fail with more than ImportError
    print(f"❌ 無法導入 pyannote.audio: {e}")
else:
    # Success path kept out of the `try` so a missing attribute is never
    # misreported as an import failure.
    print(f"✅ pyannote.audio 版本:{getattr(pyannote.audio, '__version__', '未知')}")

# 2. Check that the diarization Pipeline class can be imported.
try:
    from pyannote.audio import Pipeline  # noqa: F401  (import is the check itself)

    # The model names below are a static list; nothing is queried or downloaded.
    print(
        "✅ Pipeline 導入成功",
        "\n可用模型:",
        "- pyannote/speaker-diarization-3.1 (最新版)",
        "- pyannote/speaker-diarization (穩定版)",
        sep="\n",
    )
except Exception as import_error:
    print(f"❌ Pipeline 導入失敗:{import_error}")

# 3. Explain why diarization works across languages, and list its limits.
# One print call with a newline separator emits the same text as a run of
# individual print statements.
print(
    "\n=== 多語種支援說明 ===\n",
    "pyannote.audio 的說話人分離原理:",
    "1. 基於聲紋特徵(非語言內容)",
    "2. 分析音色、音調、語速等",
    "3. 不依賴語言識別",
    "",
    "✅ 支援所有語言(因為不分析語意)",
    "✅ 中文 + 英文混合也可以",
    "✅ 粵語 + 國語混合也可以",
    "",
    "限制:",
    "⚠️ 重疊說話時準確度下降",
    "⚠️ 背景噪音影響準確度",
    "⚠️ 需要 HuggingFace token",
    sep="\n",
)

# 4. Usage example.
# The sample below is printed verbatim for the reader to copy, so it has to be
# valid Python: call arguments and the loop body are indented accordingly.
# NOTE(review): newer pyannote.audio releases appear to have renamed
# `use_auth_token` to `token` — confirm against the installed version.
print("\n=== 使用範例 ===\n")

print("""
程式碼範例:

from pyannote.audio import Pipeline

# 載入模型
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token="hf_xxxxx" # 需要 token
)

# 執行說話人分離(支援任何語言)
diarization = pipeline("audio.wav")

# 輸出結果
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"[{turn.start:.2f}s - {turn.end:.2f}s] {speaker}")

輸出範例:
[0.00s - 5.32s] SPEAKER_00 (中文)
[5.50s - 12.18s] SPEAKER_01 (英文)
[12.50s - 18.75s] SPEAKER_00 (中文)
[19.00s - 25.43s] SPEAKER_02 (日文)
""")

# 5. Integration with Whisper (multilingual ASR + speaker diarization).
# The sample below is printed verbatim for the reader to copy, so the nested
# loop / condition / break are indented to form valid Python.
# NOTE(review): Whisper's transcribe() appears to report a single detected
# language for the whole file in result["language"], so the per-segment
# language tags in the sample output are illustrative only — confirm.
# NOTE(review): the sample attributes each segment to the first overlapping
# speaker turn, not the one with the largest overlap.
print("\n=== 與 Whisper 整合(多語種 ASR + 說話人分離)===\n")

print("""
完整流程:

1. Whisper 轉錄(支援多語種識別)
2. pyannote 說話人分離(支援多語種)
3. 整合結果

程式碼:

import whisper
from pyannote.audio import Pipeline

# Whisper ASR
whisper_model = whisper.load_model("base")
result = whisper_model.transcribe("audio.wav")

# pyannote 說話人分離
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token="hf_xxxxx"
)
diarization = pipeline("audio.wav")

# 整合
for segment in result["segments"]:
    # 找到重疊的說話人
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        if segment["start"] < turn.end and segment["end"] > turn.start:
            print(f"[{speaker}] ({result['language']}) {segment['text']}")
            break

輸出範例:
[SPEAKER_00] (zh) 你好,歡迎來到今天的會議。
[SPEAKER_01] (en) Hello, let's start the meeting.
[SPEAKER_00] (zh) 首先討論第一季度的業績。
[SPEAKER_02] (ja) 私は反対です。
""")

# Closing summary: joined into one string so a single print call emits the
# same lines the individual statements would.
print(
    "\n".join(
        (
            "\n=== 結論 ===\n",
            "✅ pyannote.audio 支援多語種說話人分離",
            "✅ 因為基於聲紋,不依賴語言",
            "✅ 適合多語言混合場景",
            "⚠️ 需要 HuggingFace token",
            "⚠️ 需要接受使用條款",
        )
    )
)