Files
momentry_core/scripts/audio_taxonomy_processor.py
Warren 8f05a7c188 feat: update Python processors and add utility scripts
- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00

138 lines
4.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/opt/homebrew/bin/python3.11
"""
Audio Taxonomy Processor (Hugging Face Transformers)
職責:使用 AST 模型進行高精度音頻分類,並映射到業務分類。
"""
import numpy as np
import json
import os
import sys
import librosa
# 依賴檢查
try:
from transformers import pipeline
HAS_HF = True
except ImportError:
print("❌ transformers not found. Run: pip install transformers")
sys.exit(1)
# Settings. UUID and MOMENTRY_OUTPUT_DIR may be overridden through the
# environment; the literals below are the fallbacks used when they are unset.
UUID = os.environ.get("UUID", "384b0ff44aaaa1f1")
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "./output")
# Both the input audio and the JSON report live in OUTPUT_DIR/<UUID>/.
AUDIO_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.wav")
OUTPUT_JSON = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.audio_taxonomy.json")
# 1. Label mapping table (AudioSet label -> business category).
# The table is declared per business category and flattened into a single
# {label: category} dict. Lookups against it are exact string matches, so
# each key has to equal the classifier's label text character for character.
# NOTE(review): confirm every key below exists in the model's label set.
TAXONOMY_MAP = {
    audioset_label: category
    for category, audioset_labels in (
        ("Human/Speech", (
            "Speech",
            "Male speech, man speaking",
            "Female speech, woman speaking",
            "Conversation",
        )),
        ("Human/Vocals", (
            "Laughter",
            "Singing",
            "Choir",
            "Cough",
            "Applause",
        )),
        ("Nature/Weather", (
            "Rain",
            "Raindrop",
            "Thunder",
            "Wind",
        )),
        ("Nature/Water", (
            "Ocean",
            "Stream",
        )),
        ("Nature/Flora_Fauna", (
            "Bird",
            "Dog",
            "Cat",
        )),
        ("Artificial/Impact_Weapon", (
            "Gunshot, gunfire",
            "Explosion",
            "Glass shatter",
        )),
        ("Artificial/Transport", (
            "Car",
            "Engine",
            "Siren",
        )),
        ("Artificial/Music", (
            "Piano",
            "Guitar",
            "Drum",
            "Music",
        )),
        ("Artificial/Household", (
            "Keyboard",
            "Telephone",
            "Door",
        )),
    )
    for audioset_label in audioset_labels
}
def map_to_taxonomy(predictions, taxonomy_map=None, min_score=0.3):
    """Translate classifier predictions into business-category scores.

    Several model labels can map to the same business category (for example
    both "Speech" and "Conversation" map to "Human/Speech"). The highest
    score seen for a category is kept, so a weaker label appearing later in
    ``predictions`` can no longer overwrite a stronger one.

    Args:
        predictions: Iterable of dicts with a ``"label"`` and a ``"score"``
            entry. ``None`` is treated as an empty iterable.
        taxonomy_map: Optional ``{label: category}`` mapping. Defaults to the
            module-level ``TAXONOMY_MAP``.
        min_score: Confidence threshold; a prediction is kept only when its
            score is strictly greater than this value.

    Returns:
        Dict ``{category: score}`` with scores rounded to 4 decimals. Labels
        that are not in the mapping are ignored.
    """
    if taxonomy_map is None:
        taxonomy_map = TAXONOMY_MAP

    events = {}
    for pred in predictions or ():
        category = taxonomy_map.get(pred["label"])
        if not category:
            continue  # label has no business category
        score = float(pred["score"])
        if score <= min_score:
            continue  # filter out low-confidence predictions
        rounded = round(score, 4)
        # Keep the strongest evidence when labels share a category.
        if category not in events or rounded > events[category]:
            events[category] = rounded
    return events
def run_audio_taxonomy(audio_path, chunk_sec=1.0, hop_sec=0.5):
    """Classify an audio file window by window and map hits to the taxonomy.

    The file is loaded as 16 kHz mono and scanned with a sliding window of
    ``chunk_sec`` seconds that advances by ``hop_sec`` seconds. Each window is
    passed to the AST audio-classification pipeline (top 5 labels) and the
    labels are translated with ``map_to_taxonomy``. Only full windows are
    analysed: a trailing remainder shorter than ``chunk_sec`` is skipped.

    A window whose inference fails is skipped (best effort), but the failure
    is counted and reported on stderr instead of being silently discarded.

    Args:
        audio_path: Path of the audio file to analyse.
        chunk_sec: Window length in seconds; must be > 0.
        hop_sec: Step between window starts in seconds; must be > 0.

    Returns:
        List of ``{"timestamp": <window start in seconds>, "categories":
        {category: score}}`` dicts, one per window that produced at least one
        mapped category, in chronological order.

    Raises:
        ValueError: If ``chunk_sec`` or ``hop_sec`` is not positive (a
            non-positive hop would never advance the window).
    """
    # Validate before the expensive model load.
    if chunk_sec <= 0 or hop_sec <= 0:
        raise ValueError(
            f"chunk_sec and hop_sec must be > 0 (got {chunk_sec}, {hop_sec})"
        )

    print("🔍 Loading AST model (MIT) from Hugging Face...")
    # Audio Spectrogram Transformer fine-tuned on AudioSet.
    # device=-1 runs the pipeline on the CPU.
    classifier = pipeline(
        "audio-classification",
        model="MIT/ast-finetuned-audioset-10-10-0.4593",
        device=-1,
    )
    print(f"📊 Analyzing audio in {chunk_sec}s chunks (hop: {hop_sec}s)...")
    y, sr = librosa.load(audio_path, sr=16000, mono=True)
    total_dur = len(y) / sr
    print(f"⏱️ Total duration: {total_dur:.1f}s")

    results = []
    failed_windows = 0
    report_every_sec = 30
    next_report_sec = report_every_sec
    window_index = 0
    while True:
        # Derive the position from the index so float error does not
        # accumulate across iterations for hops such as 0.1.
        current = window_index * hop_sec
        if current + chunk_sec > total_dur:
            break
        clip = y[int(current * sr):int((current + chunk_sec) * sr)]
        try:
            # Infer the top 5 labels for this window.
            # NOTE(review): the clip is already at the loader's 16 kHz rate;
            # confirm the pipeline honours the `sampling_rate` keyword.
            preds = classifier(clip, sampling_rate=sr, top_k=5)
            taxonomy = map_to_taxonomy(preds)
        except Exception as exc:  # best effort: skip the window, keep going
            failed_windows += 1
            if failed_windows == 1:
                print(
                    f"⚠️ Window at {current:.2f}s failed: "
                    f"{type(exc).__name__}: {exc}",
                    file=sys.stderr,
                )
        else:
            if taxonomy:
                results.append(
                    {"timestamp": round(current, 3), "categories": taxonomy}
                )

        window_index += 1
        processed = window_index * hop_sec
        # Report once each time a 30-second boundary is crossed.
        if processed >= next_report_sec:
            print(f" 🕒 Processed: {int(processed)}s / {int(total_dur)}s")
            next_report_sec = (
                int(processed) // report_every_sec + 1
            ) * report_every_sec

    if failed_windows:
        print(
            f"⚠️ Skipped {failed_windows} window(s) due to inference errors.",
            file=sys.stderr,
        )
    return results
if __name__ == "__main__":
    # Script entry point: make sure a WAV exists for UUID (extracting it from
    # a sibling .mp4/.mov when needed), classify it, and write the JSON report.
    import subprocess  # local import: only the script entry point runs ffmpeg

    if not os.path.exists(AUDIO_PATH):
        # No WAV yet: fall back to a video with the same stem, .mp4 first.
        video_candidates = [
            os.path.join(OUTPUT_DIR, UUID, f"{UUID}.mp4"),
            os.path.join(OUTPUT_DIR, UUID, f"{UUID}.mov"),
        ]
        video_path = next(
            (path for path in video_candidates if os.path.exists(path)), None
        )
        if video_path is None:
            print("❌ No audio/video found.")
            sys.exit(1)

        print("🎥 Extracting audio from video...")
        try:
            # An argument list (no shell) keeps paths with spaces or shell
            # metacharacters intact; UUID/OUTPUT_DIR come from the environment.
            subprocess.run(
                [
                    "ffmpeg", "-y",
                    "-i", video_path,
                    "-vn",
                    "-ar", "16000",
                    "-ac", "1",
                    AUDIO_PATH,
                ],
                check=True,
            )
        except (OSError, subprocess.CalledProcessError) as exc:
            # ffmpeg is missing or exited non-zero: there is no usable WAV.
            print(f"❌ ffmpeg audio extraction failed: {exc}")
            sys.exit(1)

    print(f"🕵️‍♂️ Starting Audio Taxonomy Classification for {UUID}...")
    events = run_audio_taxonomy(AUDIO_PATH)
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump({"audio_taxonomy": events}, f, indent=2, ensure_ascii=False)
    print("\n🎉 Classification Complete!")
    print(f"✅ Found {len(events)} tagged audio segments.")
    print(f"💾 Saved to {OUTPUT_JSON}")