# momentry_core/scripts/audio_taxonomy_processor.py

#!/opt/homebrew/bin/python3.11
"""
Audio Taxonomy Processor (Hugging Face Transformers)
職責：使用 AST 模型進行高精度音頻分類，並映射到業務分類。
"""

import numpy as np
import json
import os
import sys
import librosa

# Dependency check: the script cannot run without transformers, so a missing
# package prints an install hint and terminates the process with exit status 1.
try:
    from transformers import pipeline

    # Set only when the import above succeeded.
    HAS_HF = True
except ImportError:
    print("❌ transformers not found. Run: pip install transformers")
    sys.exit(1)

# Configuration: UUID and OUTPUT_DIR may be overridden through the environment.
UUID = os.environ.get("UUID", "384b0ff44aaaa1f1")
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "./output")
# The input WAV and the result JSON both live under <OUTPUT_DIR>/<UUID>/.
AUDIO_PATH = os.path.join(OUTPUT_DIR, UUID, UUID + ".wav")
OUTPUT_JSON = os.path.join(OUTPUT_DIR, UUID, UUID + ".audio_taxonomy.json")

# 1. Label mapping table (AudioSet label -> business category).
# The table is declared per business category and flattened into a
# {label: category} lookup; declaration order is preserved.
# NOTE(review): lookups are exact string matches on the classifier's label, so
# entries such as "Glass shatter" and "Keyboard" should be verified against the
# model's actual label list.
TAXONOMY_MAP = {
    audioset_label: business_category
    for business_category, audioset_labels in (
        (
            "Human/Speech",
            (
                "Speech",
                "Male speech, man speaking",
                "Female speech, woman speaking",
                "Conversation",
            ),
        ),
        ("Human/Vocals", ("Laughter", "Singing", "Choir", "Cough", "Applause")),
        ("Nature/Weather", ("Rain", "Raindrop", "Thunder", "Wind")),
        ("Nature/Water", ("Ocean", "Stream")),
        ("Nature/Flora_Fauna", ("Bird", "Dog", "Cat")),
        (
            "Artificial/Impact_Weapon",
            ("Gunshot, gunfire", "Explosion", "Glass shatter"),
        ),
        ("Artificial/Transport", ("Car", "Engine", "Siren")),
        ("Artificial/Music", ("Piano", "Guitar", "Drum", "Music")),
        ("Artificial/Household", ("Keyboard", "Telephone", "Door")),
    )
    for audioset_label in audioset_labels
}


def map_to_taxonomy(predictions, *, min_score=0.3, taxonomy_map=None):
    """Map Hugging Face classifier predictions onto business categories.

    Args:
        predictions: Iterable of dicts, each with a "label" and a "score" key.
        min_score: Confidence threshold; only predictions whose score is
            strictly greater than this value are kept.
        taxonomy_map: Optional label -> category mapping. Defaults to the
            module-level TAXONOMY_MAP.

    Returns:
        Dict of category -> score rounded to 4 decimals. When several labels
        map to the same category, the highest qualifying score is reported.
    """
    mapping = TAXONOMY_MAP if taxonomy_map is None else taxonomy_map
    events = {}
    for pred in predictions:
        category = mapping.get(pred["label"])
        score = float(pred["score"])
        # Skip unmapped labels and low-confidence predictions.
        if not category or not score > min_score:
            continue
        rounded = round(score, 4)
        # Several labels share one category (e.g. "Speech" and
        # "Male speech, man speaking"); keep the strongest evidence rather
        # than whichever prediction happens to come last.
        if category not in events or rounded > events[category]:
            events[category] = rounded
    return events


def run_audio_taxonomy(audio_path, chunk_sec=1.0, hop_sec=0.5):
    """Classify an audio file window by window and return the tagged segments.

    The audio is loaded as 16 kHz mono and scanned with a sliding window of
    ``chunk_sec`` seconds advanced by ``hop_sec`` seconds. A trailing remainder
    shorter than ``chunk_sec`` is not analyzed.

    Args:
        audio_path: Path of the audio file to load with librosa.
        chunk_sec: Window length in seconds; must be positive.
        hop_sec: Step between window starts in seconds; must be positive.

    Returns:
        List of ``{"timestamp": <window start, 1 decimal>, "categories":
        {category: score}}`` dicts, one per window that produced at least one
        mapped category.

    Raises:
        ValueError: If ``chunk_sec`` or ``hop_sec`` is not positive (a
            non-positive hop would never advance the window).
    """
    if chunk_sec <= 0 or hop_sec <= 0:
        raise ValueError("chunk_sec and hop_sec must be positive")

    sample_rate = 16000  # rate the audio is loaded at and declared to the classifier
    progress_every = 30.0  # seconds of audio between progress messages

    print("🔍 Loading AST model (MIT) from Hugging Face...")
    # Audio Spectrogram Transformer; device=-1 keeps inference on the CPU.
    classifier = pipeline(
        "audio-classification",
        model="MIT/ast-finetuned-audioset-10-10-0.4593",
        device=-1,
    )

    print(f"📊 Analyzing audio in {chunk_sec}s chunks (hop: {hop_sec}s)...")
    y, sr = librosa.load(audio_path, sr=sample_rate, mono=True)
    total_dur = len(y) / sr
    print(f"⏱️  Total duration: {total_dur:.1f}s")

    results = []
    next_report = progress_every
    index = 0
    while True:
        # Derive the window start from the index instead of accumulating
        # hop_sec, so floating-point error does not build up over long files.
        current = index * hop_sec
        if current + chunk_sec > total_dur:
            break

        clip = y[int(current * sr):int((current + chunk_sec) * sr)]
        try:
            # Infer the top 5 labels for this window.
            preds = classifier(clip, sampling_rate=sr, top_k=5)
            taxonomy = map_to_taxonomy(preds)
        except Exception as exc:
            # Best effort: one bad window must not abort the whole run, but
            # the failure is reported instead of being silently dropped.
            print(f"   ⚠️  Skipping chunk at {current:.1f}s: {exc}", file=sys.stderr)
        else:
            if taxonomy:
                results.append({"timestamp": round(current, 1), "categories": taxonomy})

        index += 1
        processed = index * hop_sec
        # Report once each time another progress_every seconds have been covered.
        if processed >= next_report:
            print(f"   🕒 Processed: {int(processed)}s / {int(total_dur)}s")
            while next_report <= processed:
                next_report += progress_every

    return results


if __name__ == "__main__":
    # Local import: only the script entry point launches ffmpeg.
    import subprocess

    if not os.path.exists(AUDIO_PATH):
        # No WAV yet: fall back to a sibling video (.mp4 preferred over .mov)
        # and extract its audio track as 16 kHz mono.
        video_candidates = [
            os.path.join(OUTPUT_DIR, UUID, f"{UUID}{ext}") for ext in (".mp4", ".mov")
        ]
        video_path = next((p for p in video_candidates if os.path.exists(p)), None)
        if video_path is None:
            print("❌ No audio/video found.")
            sys.exit(1)

        print("🎥 Extracting audio from video...")
        try:
            # Argument list (no shell): paths with spaces or shell
            # metacharacters are passed to ffmpeg verbatim, and a non-zero
            # exit status raises instead of being ignored.
            subprocess.run(
                [
                    "ffmpeg",
                    "-y",
                    "-i",
                    video_path,
                    "-vn",
                    "-ar",
                    "16000",
                    "-ac",
                    "1",
                    AUDIO_PATH,
                ],
                check=True,
            )
        except (OSError, subprocess.CalledProcessError) as exc:
            # OSError covers a missing ffmpeg binary.
            print(f"❌ Audio extraction failed: {exc}")
            sys.exit(1)

    print(f"🕵️‍♂️ Starting Audio Taxonomy Classification for {UUID}...")
    events = run_audio_taxonomy(AUDIO_PATH)

    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump({"audio_taxonomy": events}, f, indent=2, ensure_ascii=False)

    print("\n🎉 Classification Complete!")
    print(f"✅ Found {len(events)} tagged audio segments.")
    print(f"💾 Saved to {OUTPUT_JSON}")