Files
momentry_core/scripts/asrx_self/vad.py
Warren 8f05a7c188 feat: update Python processors and add utility scripts
- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00

162 lines
4.3 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
VAD (Voice Activity Detection) - 語音活動檢測
使用 Silero VAD 模型提取語音片段
技術來源:
- Silero VAD: https://github.com/snakers4/silero-vad
- 模型基於深度學習,準確度 95%+
"""
import torch
import numpy as np
def load_vad_model():
    """Load the Silero VAD model through ``torch.hub``.

    The ``silero_vad`` entry point of the ``snakers4/silero-vad`` repository
    is requested with ``force_reload`` disabled and ``trust_repo`` enabled.

    Returns:
        A ``(model, utils)`` pair exactly as produced by ``torch.hub.load``:
        the VAD model and its accompanying helper functions.
    """
    hub_options = {
        "repo_or_dir": "snakers4/silero-vad",
        "model": "silero_vad",
        "force_reload": False,
        "trust_repo": True,
    }
    vad_model, vad_utils = torch.hub.load(**hub_options)
    return vad_model, vad_utils
def extract_speech_segments(
    audio_path, model, utils, min_speech_duration_ms=500, min_silence_duration_ms=300
):
    """Detect the speech regions of an audio file with a VAD model.

    Args:
        audio_path: Path of the audio file to analyse.
        model: VAD model, passed through to the timestamp helper.
        utils: Five-element tuple of helper callables; the first one computes
            speech timestamps and the third one reads the audio file.
        min_speech_duration_ms: Minimum speech duration in milliseconds,
            forwarded to the timestamp helper.
        min_silence_duration_ms: Minimum silence duration in milliseconds,
            forwarded to the timestamp helper.

    Returns:
        A ``(speech_segments, audio_waveform, sample_rate)`` tuple:
        ``speech_segments`` is a list of ``(start, end)`` pairs (requested in
        seconds via ``return_seconds=True``), ``audio_waveform`` is the loaded
        audio converted with ``.numpy()``, and ``sample_rate`` is always 16000.
    """
    sample_rate = 16000
    detect_speech, _, load_audio, _, _ = utils

    # The audio is always read at the fixed 16 kHz rate used for detection.
    waveform = load_audio(audio_path, sampling_rate=sample_rate)

    vad_options = {
        "sampling_rate": sample_rate,
        "min_speech_duration_ms": min_speech_duration_ms,
        "min_silence_duration_ms": min_silence_duration_ms,
        "return_seconds": True,
    }
    stamps = detect_speech(waveform, model, **vad_options)

    # Flatten the timestamp dicts into plain (start, end) tuples.
    speech_segments = []
    for stamp in stamps:
        speech_segments.append((stamp["start"], stamp["end"]))

    return speech_segments, waveform.numpy(), sample_rate
def extract_speech_audio(
    audio_path,
    model,
    utils,
    output_dir=None,
    min_speech_duration_ms=500,
    min_silence_duration_ms=300,
):
    """Cut the detected speech regions out of an audio file.

    Args:
        audio_path: Path of the source audio file.
        model: VAD model, passed through to the timestamp helper.
        utils: Five-element tuple of helper callables; the first computes
            speech timestamps, the second saves audio and the third reads it.
        output_dir: Optional directory. When given, it is created if missing
            and every clip is saved there as ``speech_NNN.wav``.
        min_speech_duration_ms: Minimum speech duration in milliseconds,
            forwarded to the timestamp helper (default 500).
        min_silence_duration_ms: Minimum silence duration in milliseconds,
            forwarded to the timestamp helper (default 300).

    Returns:
        A ``(speech_audios, speech_segments)`` tuple: ``speech_audios`` is a
        list with one ``.numpy()`` array per detected region, and
        ``speech_segments`` is the matching list of ``(start_sec, end_sec)``
        pairs.
    """
    import os  # kept function-local, as in the original implementation

    get_speech_timestamps, save_audio, read_audio, _, _ = utils

    sample_rate = 16000
    wav = read_audio(audio_path, sampling_rate=sample_rate)

    # Sample indices (not seconds) are requested so the waveform can be sliced.
    speech_timestamps = get_speech_timestamps(
        wav,
        model,
        sampling_rate=sample_rate,
        min_speech_duration_ms=min_speech_duration_ms,
        min_silence_duration_ms=min_silence_duration_ms,
        return_seconds=False,
    )

    # Create the target directory up front so saving does not fail on a
    # missing path.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    speech_audios = []
    speech_segments = []
    for i, ts in enumerate(speech_timestamps):
        start_sample = ts["start"]
        end_sample = ts["end"]

        speech_audio = wav[start_sample:end_sample]
        speech_audios.append(speech_audio.numpy())
        # Convert sample indices to seconds for the reported segment bounds.
        speech_segments.append(
            (start_sample / sample_rate, end_sample / sample_rate)
        )

        if output_dir:
            output_path = os.path.join(output_dir, f"speech_{i:03d}.wav")
            save_audio(output_path, speech_audio, sample_rate)

    return speech_audios, speech_segments
if __name__ == "__main__":
    # Manual smoke test: run VAD on the audio file given on the command line
    # and print a short summary of the detected speech segments.
    import sys

    if len(sys.argv) < 2:
        print("Usage: python3 vad.py <audio_path>")
        sys.exit(1)

    audio_path = sys.argv[1]

    print("[VAD] Loading model...")
    model, utils = load_vad_model()

    print(f"[VAD] Processing: {audio_path}")
    segments, wav, sr = extract_speech_segments(audio_path, model, utils)

    total_duration = len(wav) / sr
    total_speech = sum(end - start for start, end in segments)
    # An empty waveform has zero duration; report 0% instead of dividing by zero.
    speech_percent = total_speech / total_duration * 100 if total_duration > 0 else 0.0

    print(f"\n[VAD] Results:")
    print(f" Sample rate: {sr} Hz")
    print(f" Speech segments: {len(segments)}")
    print(f" Total duration: {total_duration:.2f}s")
    print(
        f" Total speech: {total_speech:.2f}s ({speech_percent:.1f}%)"
    )

    # Only the first ten segments are listed to keep the output short.
    print(f"\n[VAD] Segments:")
    for i, (start, end) in enumerate(segments[:10]):
        print(f" {i + 1:3d}. {start:6.2f}s - {end:6.2f}s ({end - start:5.2f}s)")
    if len(segments) > 10:
        print(f" ... and {len(segments) - 10} more segments")