- Remove session-ses_2f27.md (161KB raw session log) - Remove 49 ROOT_* duplicate files across REFERENCE/ - Remove 14 duplicate files between REFERENCE/ root and history/ - Remove asr_legacy.rs (dead code, replaced by asr.rs) - Remove src/core/worker/ (duplicate JobWorker) - Remove src/core/layers/ (empty directory) - Remove 4 .bak files in src/ - Remove 7 dead private methods in worker/processor.rs - Remove backup directory from git tracking
192 lines
5.1 KiB
Python
192 lines
5.1 KiB
Python
#!/opt/homebrew/bin/python3.11
|
||
"""
|
||
Speaker Encoder - 聲紋特徵提取
|
||
使用 ECAPA-TDNN 模型提取聲紋嵌入向量
|
||
|
||
技術來源:
|
||
- ECAPA-TDNN: Desplanques et al. (2020), Interspeech
|
||
- 論文:https://arxiv.org/abs/2005.07143
|
||
- 模型:SpeechBrain spkrec-ecapa-voxceleb
|
||
- 準確度:EER 0.80% (VoxCeleb1)
|
||
"""
|
||
|
||
import torch
|
||
import numpy as np
|
||
from speechbrain.inference.speaker import EncoderClassifier
|
||
|
||
|
||
def load_speaker_encoder(model_name="speechbrain/spkrec-ecapa-voxceleb", device="cpu"):
    """Load a SpeechBrain speaker-embedding model.

    Args:
        model_name: Model source handed to ``EncoderClassifier.from_hparams``.
        device: Device string forwarded to SpeechBrain through ``run_opts``.
            Defaults to ``"cpu"``, which is what this function always used
            before the parameter existed.

    Returns:
        The loaded ``EncoderClassifier`` instance.
    """
    print(f"[SpeakerEncoder] Loading model: {model_name}")

    classifier = EncoderClassifier.from_hparams(
        source=model_name,
        run_opts={"device": device},
    )

    print("[SpeakerEncoder] Model loaded successfully")
    # NOTE: the dimension below is a fixed message, not a value read back
    # from the loaded model.
    print("[SpeakerEncoder] Embedding dimension: 192")

    return classifier
|
||
|
||
|
||
def extract_speaker_embedding(classifier, audio_waveform, sample_rate=16000):
    """Extract a speaker embedding from an audio waveform.

    Args:
        classifier: Encoder object exposing ``encode_batch(tensor)``.
        audio_waveform: Waveform as a numpy array, a torch tensor, or any
            array-like accepted by ``torch.as_tensor`` (e.g. a list of
            floats). A 1-D input is treated as a single utterance and gets
            a leading batch axis; inputs with more dimensions are passed
            through unchanged.
        sample_rate: Sampling rate of the waveform. It is not used inside
            this function (no resampling happens here); it is kept so
            existing callers keep working.

    Returns:
        The encoder output as a numpy array with all size-1 axes removed.
    """
    # Cast every input to float32, not only numpy arrays: integer or float64
    # tensors and plain Python sequences are converted here as well.
    audio_tensor = torch.as_tensor(audio_waveform, dtype=torch.float32)

    # The encoder is called with a [batch, time] tensor.
    if audio_tensor.dim() == 1:
        audio_tensor = audio_tensor.unsqueeze(0)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        embedding = classifier.encode_batch(audio_tensor)

    return embedding.squeeze().cpu().numpy()
|
||
|
||
|
||
def extract_speaker_embeddings_batch(classifier, audio_segments, sample_rate=16000):
    """Extract speaker embeddings for a sequence of audio segments.

    Each segment is encoded on its own via ``extract_speaker_embedding`` and
    the results are stacked row-wise with ``np.vstack``.

    Args:
        classifier: Speaker encoder passed through to the per-segment call.
        audio_segments: Iterable of audio segments.
        sample_rate: Sampling rate passed through to the per-segment call.

    Returns:
        The stacked embeddings as one numpy array, one row block per segment.

    Raises:
        ValueError: Raised by ``np.vstack`` when ``audio_segments`` is empty.
    """
    collected = []

    for count, segment in enumerate(audio_segments, start=1):
        collected.append(extract_speaker_embedding(classifier, segment, sample_rate))

        # Report progress after every 50th segment.
        if count % 50 == 0:
            print(f"[SpeakerEncoder] Processed {count} segments")

    stacked = np.vstack(collected)
    print(f"[SpeakerEncoder] Extracted {stacked.shape[0]} embeddings")

    return stacked
|
||
|
||
|
||
def compute_similarity_matrix(embeddings, method="cosine"):
    """Compute a pairwise similarity matrix between speaker embeddings.

    Args:
        embeddings: Embedding matrix, one embedding per row.
        method: ``"cosine"`` for cosine similarity, or ``"euclidean"`` for
            ``1 / (1 + euclidean_distance)``.

    Returns:
        Square similarity matrix with one row and one column per embedding.

    Raises:
        ValueError: If ``method`` is neither ``"cosine"`` nor ``"euclidean"``.
    """
    # Reject a bad method before cleaning, normalizing, or importing
    # sklearn, so the caller gets the error without any wasted work.
    if method not in ("cosine", "euclidean"):
        raise ValueError(f"Unknown method: {method}")

    # Replace NaN / Inf with zeros so normalization sees finite values.
    embeddings = np.nan_to_num(embeddings, nan=0.0, posinf=0.0, neginf=0.0)

    # Scale embeddings to unit L2 length.
    embeddings = normalize_embeddings(embeddings)

    # Clean once more in case normalization produced non-finite values.
    embeddings = np.nan_to_num(embeddings, nan=0.0, posinf=0.0, neginf=0.0)

    if method == "cosine":
        from sklearn.metrics.pairwise import cosine_similarity

        similarity = cosine_similarity(embeddings)
    else:
        from sklearn.metrics.pairwise import euclidean_distances

        # Map distances in [0, inf) to similarities in (0, 1].
        distances = euclidean_distances(embeddings)
        similarity = 1 / (1 + distances)

    # Any remaining NaN entry becomes the neutral value 0.5.
    return np.nan_to_num(similarity, nan=0.5)
|
||
|
||
|
||
def normalize_embeddings(embeddings):
    """Rescale embeddings to unit L2 norm.

    Thin wrapper around ``sklearn.preprocessing.normalize`` called with
    ``norm="l2"``.

    Args:
        embeddings: Embedding matrix, one embedding per row.

    Returns:
        The L2-normalized embedding matrix.
    """
    from sklearn import preprocessing

    unit_vectors = preprocessing.normalize(embeddings, norm="l2")
    return unit_vectors
|
||
|
||
|
||
if __name__ == "__main__":
    # Manual smoke test: load the encoder, read one audio file, and print
    # basic statistics of the resulting embedding.
    import sys

    import torchaudio

    if len(sys.argv) < 2:
        print("Usage: python3 speaker_encoder.py <audio_path>")
        sys.exit(1)

    audio_path = sys.argv[1]

    print("[Test] Loading speaker encoder...")
    classifier = load_speaker_encoder()

    print(f"\n[Test] Loading audio: {audio_path}")
    wav, sr = torchaudio.load(audio_path)

    # Resample to 16 kHz when the file uses a different rate.
    if sr != 16000:
        transform = torchaudio.transforms.Resample(sr, 16000)
        wav = transform(wav)

    print(f"[Test] Audio shape: {wav.shape}")
    print(f"[Test] Duration: {wav.shape[1] / 16000:.2f}s")

    # torchaudio.load yields [channels, time] by default, while the extractor
    # treats axis 0 as the batch axis. Downmix multi-channel audio to mono so
    # a stereo file gives one embedding instead of one per channel.
    if wav.shape[0] > 1:
        wav = wav.mean(dim=0, keepdim=True)

    # Extract the embedding.
    print("\n[Test] Extracting speaker embedding...")
    embedding = extract_speaker_embedding(classifier, wav.numpy())

    print(f"[Test] Embedding shape: {embedding.shape}")
    print(f"[Test] Embedding norm: {np.linalg.norm(embedding):.4f}")
    print(f"[Test] Embedding mean: {embedding.mean():.4f}")
    print(f"[Test] Embedding std: {embedding.std():.4f}")

    # Show the first few embedding values.
    print("\n[Test] First 10 embedding values:")
    print(f" {embedding[:10]}")
|