Files
momentry_core/scripts/asrx_self/speaker_encoder.py
Warren e75c4d6f07 cleanup: remove dead code and duplicate docs
- Remove session-ses_2f27.md (161KB raw session log)
- Remove 49 ROOT_* duplicate files across REFERENCE/
- Remove 14 duplicate files between REFERENCE/ root and history/
- Remove asr_legacy.rs (dead code, replaced by asr.rs)
- Remove src/core/worker/ (duplicate JobWorker)
- Remove src/core/layers/ (empty directory)
- Remove 4 .bak files in src/
- Remove 7 dead private methods in worker/processor.rs
- Remove backup directory from git tracking
2026-05-04 01:31:21 +08:00

192 lines
5.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/opt/homebrew/bin/python3.11
"""
Speaker Encoder - speaker (voiceprint) feature extraction
Extracts speaker embedding vectors with the ECAPA-TDNN model.
Technical sources:
- ECAPA-TDNN: Desplanques et al. (2020), Interspeech
- Paper: https://arxiv.org/abs/2005.07143
- Model: SpeechBrain spkrec-ecapa-voxceleb
- Accuracy: EER 0.80% (VoxCeleb1)
"""
import torch
import numpy as np
from speechbrain.inference.speaker import EncoderClassifier
def load_speaker_encoder(model_name="speechbrain/spkrec-ecapa-voxceleb", device="cpu"):
    """
    Load a pretrained SpeechBrain speaker-embedding model.

    Args:
        model_name: Model source handed to ``EncoderClassifier.from_hparams``
            (by default the HuggingFace id of the ECAPA-TDNN VoxCeleb model).
        device: Device string forwarded through ``run_opts`` (for example
            ``"cpu"`` or ``"cuda:0"``). Defaults to ``"cpu"``, which is the
            device this function always used before the parameter existed.

    Returns:
        classifier: The object returned by ``EncoderClassifier.from_hparams``.
    """
    print(f"[SpeakerEncoder] Loading model: {model_name}")

    classifier = EncoderClassifier.from_hparams(
        source=model_name,
        run_opts={"device": device},
    )

    print("[SpeakerEncoder] Model loaded successfully")
    # NOTE: this is a fixed message, not a value queried from the loaded model;
    # it is assumed to match the default checkpoint's embedding size.
    print("[SpeakerEncoder] Embedding dimension: 192")

    return classifier
def extract_speaker_embedding(classifier, audio_waveform, sample_rate=16000):
    """
    Extract a speaker embedding from an audio waveform.

    Args:
        classifier: Speaker encoder exposing ``encode_batch(tensor)``.
        audio_waveform: Waveform samples as a numpy array, a torch tensor, or
            any array-like accepted by ``torch.as_tensor``. A 1-D input is
            treated as a single utterance; inputs with more dimensions are
            passed to the encoder unchanged (expected layout ``[batch, time]``).
        sample_rate: Sampling rate of the waveform. Not used by this function;
            kept in the signature so existing callers keep working.

    Returns:
        embedding: Numpy array with the encoder output, all size-1 dimensions
            squeezed out (a flat vector for a single utterance).
    """
    # Normalise every accepted input type to a float32 tensor. Previously only
    # numpy input was cast, so float64/int tensors reached the encoder as-is
    # and plain Python sequences failed on ``.dim()``.
    audio_tensor = torch.as_tensor(audio_waveform).float()

    # The encoder is called with a leading batch axis: [batch, time].
    if audio_tensor.dim() == 1:
        audio_tensor = audio_tensor.unsqueeze(0)

    # Inference only; no gradients needed.
    with torch.no_grad():
        embedding = classifier.encode_batch(audio_tensor)

    # Drop singleton axes and move to host memory before converting to numpy.
    return embedding.squeeze().cpu().numpy()
def extract_speaker_embeddings_batch(classifier, audio_segments, sample_rate=16000):
    """
    Extract speaker embeddings for a sequence of audio segments.

    Each segment is encoded individually with ``extract_speaker_embedding``
    and the results are stacked row-wise.

    Args:
        classifier: Speaker encoder passed through to the per-segment call.
        audio_segments: Iterable of waveforms, one per segment.
        sample_rate: Sampling rate forwarded to the per-segment call.

    Returns:
        embeddings: Result of ``np.vstack`` over the per-segment embeddings
            (expected ``[n_segments, 192]`` with the default model).
    """
    collected = []
    for count, segment in enumerate(audio_segments, start=1):
        collected.append(extract_speaker_embedding(classifier, segment, sample_rate))

        # Emit a progress line after every 50th segment.
        if count % 50 == 0:
            print(f"[SpeakerEncoder] Processed {count} segments")

    stacked = np.vstack(collected)
    print(f"[SpeakerEncoder] Extracted {stacked.shape[0]} embeddings")
    return stacked
def compute_similarity_matrix(embeddings, method="cosine"):
    """
    Build a pairwise speaker-similarity matrix from embeddings.

    Non-finite values are replaced with 0.0 before and after L2
    normalisation. Any NaN left in the final matrix is replaced with 0.5.

    Args:
        embeddings: Embedding matrix, one row per segment.
        method: ``'cosine'`` for cosine similarity, or ``'euclidean'`` for
            ``1 / (1 + distance)`` computed from Euclidean distances.

    Returns:
        similarity_matrix: Pairwise similarity between the rows of
            ``embeddings``.

    Raises:
        ValueError: If ``method`` is neither ``'cosine'`` nor ``'euclidean'``.
    """
    from sklearn.metrics.pairwise import cosine_similarity

    def _scrub(values):
        # Replace NaN and +/-Inf with zeros.
        return np.nan_to_num(values, nan=0.0, posinf=0.0, neginf=0.0)

    # Clean, scale rows to unit length, then clean once more.
    unit_vectors = _scrub(normalize_embeddings(_scrub(embeddings)))

    if method == "cosine":
        scores = cosine_similarity(unit_vectors)
    elif method == "euclidean":
        from sklearn.metrics.pairwise import euclidean_distances

        # Map distances in [0, inf) to similarities in (0, 1].
        scores = 1 / (1 + euclidean_distances(unit_vectors))
    else:
        raise ValueError(f"Unknown method: {method}")

    # Make sure no NaN reaches the caller.
    return np.nan_to_num(scores, nan=0.5)
def normalize_embeddings(embeddings):
    """
    Scale each embedding row to unit L2 length.

    Args:
        embeddings: Embedding matrix, one row per segment.

    Returns:
        normalized: Output of ``sklearn.preprocessing.normalize`` with
            ``norm="l2"``.
    """
    from sklearn import preprocessing

    unit_rows = preprocessing.normalize(embeddings, norm="l2")
    return unit_rows
if __name__ == "__main__":
    # Manual smoke test: load the encoder, embed one audio file given on the
    # command line, and print basic statistics about the embedding.
    import sys

    import torchaudio

    if len(sys.argv) < 2:
        print("Usage: python3 speaker_encoder.py <audio_path>")
        sys.exit(1)

    audio_path = sys.argv[1]

    print("[Test] Loading speaker encoder...")
    classifier = load_speaker_encoder()

    print(f"\n[Test] Loading audio: {audio_path}")
    wav, sr = torchaudio.load(audio_path)

    # torchaudio returns [channels, time]. Without a downmix, a stereo file
    # would be encoded as a batch of two utterances and yield one embedding
    # per channel instead of a single vector.
    if wav.shape[0] > 1:
        wav = wav.mean(dim=0, keepdim=True)

    # Resample to 16 kHz.
    if sr != 16000:
        transform = torchaudio.transforms.Resample(sr, 16000)
        wav = transform(wav)

    print(f"[Test] Audio shape: {wav.shape}")
    print(f"[Test] Duration: {wav.shape[1] / 16000:.2f}s")

    # Extract the embedding.
    print("\n[Test] Extracting speaker embedding...")
    embedding = extract_speaker_embedding(classifier, wav.numpy())

    print(f"[Test] Embedding shape: {embedding.shape}")
    print(f"[Test] Embedding norm: {np.linalg.norm(embedding):.4f}")
    print(f"[Test] Embedding mean: {embedding.mean():.4f}")
    print(f"[Test] Embedding std: {embedding.std():.4f}")

    # Show the first few embedding values.
    print("\n[Test] First 10 embedding values:")
    print(f" {embedding[:10]}")