#!/opt/homebrew/bin/python3.11
"""
VAD (Voice Activity Detection).

Extracts speech segments from an audio file using the Silero VAD model.

Technical source:
- Silero VAD: https://github.com/snakers4/silero-vad
- The model is deep-learning based; the upstream project states 95%+ accuracy.
"""

import os
import sys

import torch
import numpy as np

# Sampling rate (Hz) used both to read audio and to run the VAD model.
SAMPLE_RATE = 16000


def load_vad_model():
    """
    Load the Silero VAD model through ``torch.hub``.

    Returns:
        model: the VAD model
        utils: the helper functions returned alongside the model
    """
    model, utils = torch.hub.load(
        repo_or_dir="snakers4/silero-vad",
        model="silero_vad",
        force_reload=False,
        trust_repo=True,
    )
    return model, utils


def _detect_speech(
    audio_path,
    model,
    utils,
    min_speech_duration_ms,
    min_silence_duration_ms,
    return_seconds,
):
    """Read ``audio_path`` at SAMPLE_RATE and return (waveform, timestamps)."""
    # Only the first three helpers are used; the remaining ones are ignored.
    get_speech_timestamps, _save_audio, read_audio, *_ = utils

    wav = read_audio(audio_path, sampling_rate=SAMPLE_RATE)
    speech_timestamps = get_speech_timestamps(
        wav,
        model,
        sampling_rate=SAMPLE_RATE,
        min_speech_duration_ms=min_speech_duration_ms,
        min_silence_duration_ms=min_silence_duration_ms,
        return_seconds=return_seconds,
    )
    return wav, speech_timestamps


def extract_speech_segments(
    audio_path, model, utils, min_speech_duration_ms=500, min_silence_duration_ms=300
):
    """
    Detect speech segments in an audio file with VAD.

    Args:
        audio_path: path of the audio file
        model: VAD model
        utils: helper functions (as returned by ``load_vad_model``)
        min_speech_duration_ms: minimum speech duration (milliseconds)
        min_silence_duration_ms: minimum silence duration (milliseconds)

    Returns:
        speech_segments: list of segments [(start_sec, end_sec), ...]
        audio_waveform: audio waveform (numpy array)
        sample_rate: sampling rate in Hz
    """
    wav, speech_timestamps = _detect_speech(
        audio_path,
        model,
        utils,
        min_speech_duration_ms,
        min_silence_duration_ms,
        return_seconds=True,
    )

    # Convert the timestamp dicts into (start, end) tuples.
    speech_segments = [(ts["start"], ts["end"]) for ts in speech_timestamps]

    return speech_segments, wav.numpy(), SAMPLE_RATE


def extract_speech_audio(
    audio_path,
    model,
    utils,
    output_dir=None,
    min_speech_duration_ms=500,
    min_silence_duration_ms=300,
):
    """
    Extract each speech segment as its own waveform, optionally saving files.

    Args:
        audio_path: path of the source audio file
        model: VAD model
        utils: helper functions (as returned by ``load_vad_model``)
        output_dir: output directory (optional); created if it does not
            exist. Each segment is written as ``speech_NNN.wav``.
        min_speech_duration_ms: minimum speech duration (milliseconds)
        min_silence_duration_ms: minimum silence duration (milliseconds)

    Returns:
        speech_audios: list of speech waveforms [numpy array, ...]
        speech_segments: list of segments [(start_sec, end_sec), ...]
    """
    # Ask for sample indices (not seconds) so the waveform can be sliced.
    wav, speech_timestamps = _detect_speech(
        audio_path,
        model,
        utils,
        min_speech_duration_ms,
        min_silence_duration_ms,
        return_seconds=False,
    )
    save_audio = utils[1]

    if output_dir:
        # Create the target directory once, before any file is written.
        os.makedirs(output_dir, exist_ok=True)

    speech_audios = []
    speech_segments = []

    for i, ts in enumerate(speech_timestamps):
        start_sample = ts["start"]
        end_sample = ts["end"]

        speech_audio = wav[start_sample:end_sample]
        speech_audios.append(speech_audio.numpy())
        # Convert sample indices to seconds.
        speech_segments.append(
            (start_sample / SAMPLE_RATE, end_sample / SAMPLE_RATE)
        )

        if output_dir:
            output_path = os.path.join(output_dir, f"speech_{i:03d}.wav")
            save_audio(output_path, speech_audio, SAMPLE_RATE)

    return speech_audios, speech_segments


def main(argv=None):
    """
    Run VAD on one audio file and print a summary (manual test entry point).

    Args:
        argv: command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.

    Returns:
        Process exit code: 0 on success, 1 when the audio path is missing.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        print("Usage: python3 vad.py <audio_file>")
        return 1

    audio_path = args[0]

    print("[VAD] Loading model...")
    model, utils = load_vad_model()

    print(f"[VAD] Processing: {audio_path}")
    segments, wav, sr = extract_speech_segments(audio_path, model, utils)

    total_duration = len(wav) / sr
    total_speech = sum(end - start for start, end in segments)
    # Empty audio has zero duration; avoid dividing by zero.
    speech_ratio = total_speech / total_duration * 100 if total_duration else 0.0

    print("\n[VAD] Results:")
    print(f" Sample rate: {sr} Hz")
    print(f" Speech segments: {len(segments)}")
    print(f" Total duration: {total_duration:.2f}s")
    print(f" Total speech: {total_speech:.2f}s ({speech_ratio:.1f}%)")

    print("\n[VAD] Segments:")
    # Only the first ten segments are listed to keep the output short.
    for i, (start, end) in enumerate(segments[:10]):
        print(f" {i + 1:3d}. {start:6.2f}s - {end:6.2f}s ({end - start:5.2f}s)")

    if len(segments) > 10:
        print(f" ... and {len(segments) - 10} more segments")

    return 0


if __name__ == "__main__":
    sys.exit(main())