#!/opt/homebrew/bin/python3.11
"""
Audio Taxonomy Processor (Hugging Face Transformers)

Responsibility: classify audio with the AST (Audio Spectrogram Transformer)
model and map the predicted AudioSet labels onto the business taxonomy.
"""

import json
import os
import subprocess
import sys

# Optional heavy dependencies. They are imported defensively so the module can
# still be imported (e.g. to reuse ``map_to_taxonomy``) when they are missing;
# the script entry point reports the problem and exits with status 1.
try:
    import librosa
except ImportError:
    librosa = None

try:
    from transformers import pipeline

    HAS_HF = True
except ImportError:
    pipeline = None
    HAS_HF = False

# Configuration (overridable through environment variables).
UUID = os.getenv("UUID", "384b0ff44aaaa1f1")
OUTPUT_DIR = os.getenv("MOMENTRY_OUTPUT_DIR", "./output")
AUDIO_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.wav")
OUTPUT_JSON = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.audio_taxonomy.json")

MODEL_ID = "MIT/ast-finetuned-audioset-10-10-0.4593"
SAMPLE_RATE = 16000  # Hz; audio is loaded and extracted at this rate
TOP_K = 5  # number of predictions requested per chunk
MIN_CONFIDENCE = 0.3  # predictions at or below this score are discarded
PROGRESS_INTERVAL_SEC = 30  # how often (in audio seconds) progress is printed
VIDEO_EXTENSIONS = (".mp4", ".mov")  # tried in this order when no .wav exists

# 1. Label mapping dictionary (AudioSet label -> business category).
TAXONOMY_MAP = {
    "Speech": "Human/Speech",
    "Male speech, man speaking": "Human/Speech",
    "Female speech, woman speaking": "Human/Speech",
    "Conversation": "Human/Speech",
    "Laughter": "Human/Vocals",
    "Singing": "Human/Vocals",
    "Choir": "Human/Vocals",
    "Cough": "Human/Vocals",
    "Applause": "Human/Vocals",
    "Rain": "Nature/Weather",
    "Raindrop": "Nature/Weather",
    "Thunder": "Nature/Weather",
    "Wind": "Nature/Weather",
    "Ocean": "Nature/Water",
    "Stream": "Nature/Water",
    "Bird": "Nature/Flora_Fauna",
    "Dog": "Nature/Flora_Fauna",
    "Cat": "Nature/Flora_Fauna",
    "Gunshot, gunfire": "Artificial/Impact_Weapon",
    "Explosion": "Artificial/Impact_Weapon",
    "Glass shatter": "Artificial/Impact_Weapon",
    "Car": "Artificial/Transport",
    "Engine": "Artificial/Transport",
    "Siren": "Artificial/Transport",
    "Piano": "Artificial/Music",
    "Guitar": "Artificial/Music",
    "Drum": "Artificial/Music",
    "Music": "Artificial/Music",
    "Keyboard": "Artificial/Household",
    "Telephone": "Artificial/Household",
    "Door": "Artificial/Household",
}


def map_to_taxonomy(predictions, min_score=MIN_CONFIDENCE):
    """Map classifier predictions onto business categories.

    Args:
        predictions: iterable of dicts with a ``"label"`` and a ``"score"``
            key, as read by this function from the classifier output.
        min_score: predictions whose score is not strictly greater than this
            value are ignored.

    Returns:
        Dict mapping each matched business category to its score rounded to
        four decimals. When several labels map to the same category, the
        highest score is kept.
    """
    events = {}
    for pred in predictions:
        category = TAXONOMY_MAP.get(pred["label"])
        if category is None:
            continue  # label has no business mapping

        score = float(pred["score"])
        if score <= min_score:
            continue  # filter out low-confidence predictions

        rounded = round(score, 4)
        # Several labels share a category; never let a weaker prediction
        # overwrite a stronger one.
        if category not in events or rounded > events[category]:
            events[category] = rounded
    return events


def run_audio_taxonomy(audio_path, chunk_sec=1.0, hop_sec=0.5):
    """Classify an audio file in sliding windows.

    Args:
        audio_path: path of the audio file to load.
        chunk_sec: window length in seconds; must be positive.
        hop_sec: step between window starts in seconds; must be positive.

    Returns:
        List of ``{"timestamp": <seconds>, "categories": {...}}`` dicts, one
        per window that produced at least one mapped category. A trailing
        remainder shorter than ``chunk_sec`` is not analysed.

    Raises:
        ValueError: if ``chunk_sec`` or ``hop_sec`` is not positive.
        ImportError: if ``transformers`` or ``librosa`` is not installed.
    """
    if chunk_sec <= 0 or hop_sec <= 0:
        # A non-positive hop would never advance and loop forever.
        raise ValueError("chunk_sec and hop_sec must be positive")
    if not HAS_HF:
        raise ImportError("transformers not found. Run: pip install transformers")
    if librosa is None:
        raise ImportError("librosa not found. Run: pip install librosa")

    print("🔍 Loading AST model (MIT) from Hugging Face...")
    # Audio Spectrogram Transformer; device=-1 selects the CPU.
    classifier = pipeline(
        "audio-classification",
        model=MODEL_ID,
        device=-1,
    )

    print(f"📊 Analyzing audio in {chunk_sec}s chunks (hop: {hop_sec}s)...")
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
    total_dur = len(y) / sr
    print(f"⏱️ Total duration: {total_dur:.1f}s")

    results = []
    failed_chunks = 0
    next_report = PROGRESS_INTERVAL_SEC
    index = 0
    while True:
        # Derive the offset from an integer index so float error does not
        # accumulate across iterations.
        current = index * hop_sec
        if current + chunk_sec > total_dur:
            break

        start_sample = int(current * sr)
        end_sample = int((current + chunk_sec) * sr)
        clip = y[start_sample:end_sample]

        try:
            # Infer the top predictions for this window.
            preds = classifier(clip, sampling_rate=sr, top_k=TOP_K)
        except Exception as exc:  # best-effort: report and skip the chunk
            failed_chunks += 1
            print(f"⚠️ Skipping chunk at {current:.1f}s: {exc}", file=sys.stderr)
        else:
            taxonomy = map_to_taxonomy(preds)
            if taxonomy:
                # Three decimals keep sub-0.1s hops distinct; the default
                # 0.5s hop yields the same values as one-decimal rounding.
                results.append(
                    {"timestamp": round(current, 3), "categories": taxonomy}
                )

        index += 1
        processed = index * hop_sec
        if processed >= next_report:
            print(f" 🕒 Processed: {int(processed)}s / {int(total_dur)}s")
            # Advance past the current position so each boundary prints once.
            while next_report <= processed:
                next_report += PROGRESS_INTERVAL_SEC

    if failed_chunks:
        print(f"⚠️ {failed_chunks} chunk(s) failed and were skipped.", file=sys.stderr)
    return results


def _extract_audio(video_path, audio_path):
    """Extract a mono track from a video with ffmpeg; return True on success."""
    command = [
        "ffmpeg", "-y",
        "-i", video_path,
        "-vn",
        "-ar", str(SAMPLE_RATE),
        "-ac", "1",
        audio_path,
    ]
    try:
        # Argument list (no shell) so paths with spaces or shell
        # metacharacters are passed through verbatim.
        completed = subprocess.run(command, check=False)
    except FileNotFoundError:
        print("❌ ffmpeg not found on PATH.")
        return False
    return completed.returncode == 0 and os.path.exists(audio_path)


def _find_video():
    """Return the first existing video path for UUID, or None."""
    for extension in VIDEO_EXTENSIONS:
        candidate = os.path.join(OUTPUT_DIR, UUID, f"{UUID}{extension}")
        if os.path.exists(candidate):
            return candidate
    return None


def main():
    """Script entry point: ensure audio exists, classify it, write the JSON."""
    # Dependency check.
    if not HAS_HF:
        print("❌ transformers not found. Run: pip install transformers")
        sys.exit(1)
    if librosa is None:
        print("❌ librosa not found. Run: pip install librosa")
        sys.exit(1)

    if not os.path.exists(AUDIO_PATH):
        video_path = _find_video()
        if video_path is None:
            print("❌ No audio/video found.")
            sys.exit(1)
        print("🎥 Extracting audio from video...")
        if not _extract_audio(video_path, AUDIO_PATH):
            print("❌ Audio extraction failed.")
            sys.exit(1)

    print(f"🕵️‍♂️ Starting Audio Taxonomy Classification for {UUID}...")
    events = run_audio_taxonomy(AUDIO_PATH)

    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump({"audio_taxonomy": events}, f, indent=2, ensure_ascii=False)

    print("\n🎉 Classification Complete!")
    print(f"✅ Found {len(events)} tagged audio segments.")
    print(f"💾 Saved to {OUTPUT_JSON}")


if __name__ == "__main__":
    main()