#!/opt/homebrew/bin/python3.11
"""Sound Event Detector (Impulse/Gunshot).

Detects high-energy impulsive sounds (e.g. gunshots, explosions) by looking
for peaks in the RMS energy envelope of an audio file, and writes the
detected events to a JSON file.
"""

import json
import os
import subprocess
import sys

import librosa
import numpy as np
from scipy.signal import find_peaks

# Configuration (overridable through environment variables).
OUTPUT_DIR = os.getenv("MOMENTRY_OUTPUT_DIR", "./output")
UUID = os.getenv("UUID", "384b0ff44aaaa1f1")
AUDIO_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.wav")
OUTPUT_JSON = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.sound_events.json")

# Analysis parameters.
_ANALYSIS_SR = 22050  # sample rate requested from librosa.load
_FRAME_SEC = 0.05  # RMS frame length, in seconds
_HOP_SEC = 0.02  # RMS hop length, in seconds
_ABSOLUTE_FLOOR = 0.05  # constant added to the threshold
_MIN_GAP_SEC = 0.2  # minimum spacing between two reported peaks
_CENTROID_HALF_WINDOW = 1000  # samples taken on each side of a peak


def _classify_event(rms_val, threshold):
    """Return the event label for a peak of energy *rms_val*."""
    if rms_val > threshold * 2.0:
        return "Explosion/Gunshot"  # very high energy
    if rms_val > threshold * 1.2:
        return "Loud Impact"
    return "Loud Noise"


def detect_impulse_sounds(audio_path, threshold_multiplier=1.5):
    """Detect impulsive sounds as peaks of the RMS energy envelope.

    A peak is reported when its RMS value is at least
    ``median(rms) * threshold_multiplier + 0.05`` and it is at least
    0.2 seconds away from any other reported peak.

    Args:
        audio_path: Path of the audio file to analyse.
        threshold_multiplier: Factor applied to the median RMS level when
            building the detection threshold.

    Returns:
        A list of dicts, one per detected peak, with the keys
        ``timestamp`` (seconds, 2 decimals), ``type`` (label string),
        ``energy`` (RMS at the peak, 4 decimals) and ``centroid``
        (mean spectral centroid around the peak, 2 decimals).
    """
    print(f"🔊 Loading audio: {audio_path}")
    y, sr = librosa.load(audio_path, sr=_ANALYSIS_SR)

    # Nothing to analyse; avoids taking the median of an empty envelope.
    if y.size == 0:
        return []

    print("📊 Analyzing energy envelope...")

    # 1. RMS energy envelope.
    frame_length = int(_FRAME_SEC * sr)
    hop_length = int(_HOP_SEC * sr)
    rms = librosa.feature.rms(
        y=y, frame_length=frame_length, hop_length=hop_length
    )[0]

    # 2. Detection threshold: the median RMS over the whole file stands in
    #    for the background level, scaled by the multiplier, plus a fixed
    #    floor so near-silent recordings do not trigger on tiny peaks.
    background = np.median(rms)
    threshold = background * threshold_multiplier + _ABSOLUTE_FLOOR

    print(f" Background Level: {background:.4f}")
    print(f" Detection Threshold: {threshold:.4f}")

    # 3. Peaks above the threshold, separated by at least _MIN_GAP_SEC.
    #    The spacing is derived from the actual hop so it stays correct if
    #    the hop or sample rate changes; find_peaks requires distance >= 1.
    min_distance = max(1, int(round(_MIN_GAP_SEC * sr / hop_length)))
    peaks, _ = find_peaks(rms, height=threshold, distance=min_distance)

    # 4. Describe and label each peak.
    events = []
    for peak_idx in peaks:
        # Work in integer samples; seconds are only derived for reporting.
        center_sample = int(peak_idx) * hop_length
        time_sec = center_sample / sr

        segment = y[
            max(0, center_sample - _CENTROID_HALF_WINDOW):
            center_sample + _CENTROID_HALF_WINDOW
        ]
        if segment.size == 0:
            continue

        # Spectral centroid ("brightness") of the audio around the peak.
        # NOTE(review): this window (at most 2000 samples) is shorter than
        # what appears to be librosa's default n_fft of 2048, so librosa
        # may warn about it - confirm; defaults are kept so the reported
        # values do not change.
        centroid = librosa.feature.spectral_centroid(y=segment, sr=sr)[0]
        avg_centroid = np.mean(centroid)

        # Simplification: the label depends on the RMS level only; the
        # centroid is reported but not used for classification.
        rms_val = rms[peak_idx]

        events.append(
            {
                "timestamp": round(float(time_sec), 2),
                "type": _classify_event(rms_val, threshold),
                "energy": round(float(rms_val), 4),
                "centroid": round(float(avg_centroid), 2),
            }
        )

    return events


def _find_source_video():
    """Return the path of the .mp4 or .mov next to AUDIO_PATH, or None."""
    for extension in (".mp4", ".mov"):
        candidate = os.path.join(OUTPUT_DIR, UUID, f"{UUID}{extension}")
        if os.path.exists(candidate):
            return candidate
    return None


def _extract_audio(video_path, audio_path):
    """Extract a mono 16 kHz audio track from *video_path* using ffmpeg.

    The command is passed as an argument list (no shell), so paths with
    spaces or shell metacharacters are handled safely. Exits the process
    with status 1 if ffmpeg is missing or fails.
    """
    command = [
        "ffmpeg", "-y", "-i", video_path,
        "-vn", "-ar", "16000", "-ac", "1", audio_path,
    ]
    try:
        subprocess.run(command, check=True)
    except FileNotFoundError:
        print("❌ ffmpeg executable not found.")
        sys.exit(1)
    except subprocess.CalledProcessError as exc:
        print(f"❌ ffmpeg failed with exit code {exc.returncode}.")
        sys.exit(1)


def main():
    """Run detection for the configured UUID and save the results."""
    if not os.path.exists(AUDIO_PATH):
        # No audio file yet: try to extract it from a video.
        video_path = _find_source_video()
        if video_path is None:
            print("❌ No audio/video found.")
            sys.exit(1)
        print("🎥 Extracting audio from video...")
        _extract_audio(video_path, AUDIO_PATH)

    print(f"🕵️‍♂️ Starting Sound Event Detection for {UUID}...")

    # Run the detection.
    events = detect_impulse_sounds(AUDIO_PATH)

    # Save the results.
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump({"sound_events": events}, f, indent=2)

    print(f"\n🎉 Found {len(events)} potential sound events.")
    print(f"💾 Results saved to {OUTPUT_JSON}")

    # Show the 10 highest-energy events.
    print("\n🔥 Top 10 Loudest Events (Potential Gunshots):")
    loudest = sorted(events, key=lambda ev: ev["energy"], reverse=True)[:10]
    for rank, ev in enumerate(loudest, start=1):
        m, s = divmod(ev["timestamp"], 60)
        print(
            f" {rank}. [{int(m):02d}:{s:05.2f}] {ev['type']} (Energy: {ev['energy']:.4f})"
        )


if __name__ == "__main__":
    main()