- Update ASR, face, OCR, pose processors - Add release pre-flight check script - Add synonym generation, chunk processing scripts - Add face recognition, stamp search utilities
162 lines
4.3 KiB
Python
162 lines
4.3 KiB
Python
#!/opt/homebrew/bin/python3.11
|
|
"""
|
|
VAD (Voice Activity Detection)

Extracts speech segments from audio using the Silero VAD model.

Technical sources:
- Silero VAD: https://github.com/snakers4/silero-vad
- The model is deep-learning based, with 95%+ accuracy
|
|
"""
|
|
|
|
import torch
|
|
import numpy as np
|
|
|
|
|
|
def load_vad_model():
    """
    Fetch the Silero VAD model through ``torch.hub``.

    The ``silero_vad`` entry point of the ``snakers4/silero-vad`` repository
    is requested with ``force_reload=False`` and ``trust_repo=True``.

    Returns:
        model: the VAD model
        utils: the helper functions returned alongside the model
    """
    hub_options = {
        "repo_or_dir": "snakers4/silero-vad",
        "model": "silero_vad",
        "force_reload": False,
        "trust_repo": True,
    }
    # The hub call yields exactly two items: the model and its helpers.
    vad_model, vad_utils = torch.hub.load(**hub_options)
    return vad_model, vad_utils
|
|
|
|
|
|
def extract_speech_segments(
    audio_path, model, utils, min_speech_duration_ms=500, min_silence_duration_ms=300
):
    """
    Run VAD over an audio file and report where speech occurs.

    Args:
        audio_path: path of the audio file
        model: VAD model
        utils: five-item helper tuple; the first item (speech timestamp
            detector) and the third item (audio reader) are used here
        min_speech_duration_ms: minimum speech duration (milliseconds)
        min_silence_duration_ms: minimum silence duration (milliseconds)

    Returns:
        speech_segments: list of speech segments [(start_sec, end_sec), ...]
        audio_waveform: the loaded waveform converted with ``.numpy()``
        sample_rate: sample rate the audio was read at (always 16000)
    """
    detect_speech, _, load_audio, _, _ = utils

    # The audio is always read at 16 kHz and the same rate is given to the detector.
    sample_rate = 16000
    waveform = load_audio(audio_path, sampling_rate=sample_rate)

    detection_options = {
        "sampling_rate": sample_rate,
        "min_speech_duration_ms": min_speech_duration_ms,
        "min_silence_duration_ms": min_silence_duration_ms,
        # Ask the detector for timestamps in seconds.
        "return_seconds": True,
    }
    timestamps = detect_speech(waveform, model, **detection_options)

    # Flatten the timestamp dicts into (start, end) tuples.
    speech_segments = []
    for stamp in timestamps:
        speech_segments.append((stamp["start"], stamp["end"]))

    return speech_segments, waveform.numpy(), sample_rate
|
|
|
|
|
|
def extract_speech_audio(
    audio_path,
    model,
    utils,
    output_dir=None,
    min_speech_duration_ms=500,
    min_silence_duration_ms=300,
):
    """
    Extract speech segments and optionally save each one as its own audio file.

    Args:
        audio_path: path of the source audio file
        model: VAD model
        utils: five-item helper tuple; the first three items (speech
            timestamp detector, audio writer, audio reader) are used here
        output_dir: output directory (optional). When given, it is created
            if missing and every segment is written to
            ``speech_<index>.wav`` inside it.
        min_speech_duration_ms: minimum speech duration (milliseconds)
        min_silence_duration_ms: minimum silence duration (milliseconds)

    Returns:
        speech_audios: list of speech audio clips, each converted with
            ``.numpy()``
        speech_segments: list of speech segments [(start_sec, end_sec), ...]
    """
    # Local import: only needed for the optional file output.
    import os

    get_speech_timestamps, save_audio, read_audio, _, _ = utils

    # The audio is always read at 16 kHz.
    sample_rate = 16000
    wav = read_audio(audio_path, sampling_rate=sample_rate)

    speech_timestamps = get_speech_timestamps(
        wav,
        model,
        sampling_rate=sample_rate,
        min_speech_duration_ms=min_speech_duration_ms,
        min_silence_duration_ms=min_silence_duration_ms,
        return_seconds=False,  # sample indices, so the waveform can be sliced directly
    )

    # Make sure the target directory exists before any segment is written.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    speech_audios = []
    speech_segments = []

    for i, ts in enumerate(speech_timestamps):
        start_sample = ts["start"]
        end_sample = ts["end"]

        # Slice the clip out of the full waveform.
        speech_audio = wav[start_sample:end_sample]
        speech_audios.append(speech_audio.numpy())

        # Convert sample indices to seconds for the reported segment.
        speech_segments.append((start_sample / sample_rate, end_sample / sample_rate))

        # Save to file (optional).
        if output_dir:
            output_path = os.path.join(output_dir, f"speech_{i:03d}.wav")
            save_audio(output_path, speech_audio, sample_rate)

    return speech_audios, speech_segments
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual VAD check: load the model, process the audio file given on the
    # command line and print a summary of the detected speech segments.
    import sys

    if len(sys.argv) < 2:
        print("Usage: python3 vad.py <audio_path>")
        sys.exit(1)

    audio_path = sys.argv[1]

    print("[VAD] Loading model...")
    model, utils = load_vad_model()

    print(f"[VAD] Processing: {audio_path}")
    segments, wav, sr = extract_speech_segments(audio_path, model, utils)

    total_duration = len(wav) / sr
    total_speech = sum(end - start for start, end in segments)
    # Empty audio has zero duration; report 0% instead of dividing by zero.
    if total_duration > 0:
        speech_percent = total_speech / total_duration * 100
    else:
        speech_percent = 0.0

    print("\n[VAD] Results:")
    print(f" Sample rate: {sr} Hz")
    print(f" Speech segments: {len(segments)}")
    print(f" Total duration: {total_duration:.2f}s")
    print(f" Total speech: {total_speech:.2f}s ({speech_percent:.1f}%)")

    # Only the first ten segments are listed; the rest are summarised.
    print("\n[VAD] Segments:")
    for i, (start, end) in enumerate(segments[:10]):
        print(f" {i + 1:3d}. {start:6.2f}s - {end:6.2f}s ({end - start:5.2f}s)")

    if len(segments) > 10:
        print(f" ... and {len(segments) - 10} more segments")
|