#!/opt/homebrew/bin/python3.11
"""
Text Semantic Analysis (PoC)

Purpose: analyse the semantic distribution of ASR data, print a statistical
report, and demonstrate the effect of semantic search.
"""

import sys
import json
import os
import argparse

import numpy as np

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

try:
    from sentence_transformers import SentenceTransformer
    from sklearn.cluster import KMeans

    HAS_DEPS = True
except ImportError:
    # The failure is reported by _require_deps() on first use instead of
    # during import, so importing this module never terminates the caller.
    SentenceTransformer = None
    KMeans = None
    HAS_DEPS = False

OUTPUT_DIR = os.getenv("MOMENTRY_OUTPUT_DIR", "./output")

# A segment is kept only when its stripped text is longer than this.
_MIN_TEXT_CHARS = 5
# Number of best matches shown by the interactive search demo.
_TOP_MATCHES = 3


def _require_deps():
    """Print an install hint and exit with status 1 if the ML packages are missing."""
    if not HAS_DEPS:
        print(
            "❌ Missing dependencies. Run: pip install sentence-transformers scikit-learn"
        )
        sys.exit(1)


def load_asr_data(uuid):
    """Load the ASR JSON document for *uuid* from ``OUTPUT_DIR``.

    Args:
        uuid: Identifier used to build the file name ``<uuid>.asr.json``.

    Returns:
        The decoded JSON content, or ``None`` (after printing a message)
        when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    path = os.path.join(OUTPUT_DIR, f"{uuid}.asr.json")
    if not os.path.exists(path):
        print(f"❌ ASR file not found: {path}")
        return None
    # Explicit UTF-8: the platform default encoding may not be able to decode
    # non-ASCII transcripts.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _collect_segments(segments, min_chars=_MIN_TEXT_CHARS):
    """Return parallel ``(texts, times)`` lists for the usable segments.

    A segment is usable when it is a dict with a string ``"text"`` whose
    stripped length exceeds *min_chars* and a numeric ``"start"``. Anything
    else is skipped, so one malformed entry cannot abort the analysis.
    """
    texts = []
    times = []
    for seg in segments:
        if not isinstance(seg, dict):
            continue
        text = seg.get("text")
        start = seg.get("start")
        if not isinstance(text, str) or not isinstance(start, (int, float)):
            continue
        if len(text.strip()) > min_chars:
            texts.append(text)
            times.append(start)
    return texts, times


def _cosine_similarities(matrix, vector):
    """Return the cosine similarity between each row of *matrix* and *vector*.

    Rows (or a vector) with zero norm get a similarity of 0.0 rather than
    NaN.
    """
    matrix = np.asarray(matrix, dtype=float)
    vector = np.asarray(vector, dtype=float)
    dots = matrix @ vector
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(vector)
    return np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)


def _summarize_topics(embeddings, labels, texts, times):
    """Build one summary dict per cluster label present in *labels*.

    The representative of a cluster is the member whose embedding is closest
    (by cosine similarity) to the mean of the cluster's embeddings.

    Returns:
        A list of dicts with keys ``topic_id``, ``representative_text``,
        ``representative_time`` and ``count``, ordered by ascending label.
    """
    embeddings = np.asarray(embeddings)
    labels = np.asarray(labels)
    topics = []
    for topic_id in np.unique(labels):
        member_idx = np.flatnonzero(labels == topic_id)
        members = embeddings[member_idx]
        centroid = members.mean(axis=0)
        closest = int(np.argmax(_cosine_similarities(members, centroid)))
        best = member_idx[closest]
        topics.append(
            {
                "topic_id": int(topic_id),
                "representative_text": texts[best],
                "representative_time": times[best],
                "count": int(member_idx.size),
            }
        )
    return topics


def run_analysis(uuid, num_topics=5):
    """Run the semantic analysis for one ASR transcript.

    Loads ``<uuid>.asr.json``, embeds the usable segments, clusters them with
    K-Means, prints a per-topic report, and finally offers an interactive
    search prompt.

    Args:
        uuid: Identifier of the transcript to analyse.
        num_topics: Requested number of clusters. It is clamped to the range
            ``1..number of usable segments`` because K-Means cannot fit more
            clusters than samples.

    Returns:
        None. All results are printed.
    """
    _require_deps()
    print(f"🚀 Starting Semantic Analysis for {uuid}...")

    # 1. Load data
    data = load_asr_data(uuid)
    if not data:
        return

    # Keep texts and start times in lockstep; too-short segments are dropped.
    texts, times = _collect_segments(data.get("segments") or [])
    if not texts:
        print("❌ No valid text found.")
        return

    print(f"✅ Loaded {len(texts)} valid text segments.")

    # 2. Vectorise (lightweight model all-MiniLM-L6-v2)
    print("🧠 Generating embeddings (this may take a moment)...")
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = np.asarray(model.encode(texts, show_progress_bar=True))

    # 3. Statistical analysis: topic clustering (K-Means)
    n_clusters = max(1, min(num_topics, len(texts)))
    if n_clusters != num_topics:
        print(f"⚠️ Adjusted topic count from {num_topics} to {n_clusters}.")
    print(f"🔍 Identifying ~{n_clusters} main topics...")
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = kmeans.fit_predict(embeddings)

    # Representative sentence per topic (the one closest to the cluster mean)
    topic_centers = _summarize_topics(embeddings, labels, texts, times)

    # 4. Print the report, largest topic first
    print("\n" + "=" * 60)
    print(f"📊 ANALYSIS REPORT FOR {uuid}")
    print("=" * 60)

    for topic in sorted(topic_centers, key=lambda x: x["count"], reverse=True):
        print(f"🔹 Topic {topic['topic_id']} ({topic['count']} segments):")
        print(f" 💬 '{topic['representative_text']}'")
        print(f" ⏰ Time: {topic['representative_time']:.2f}s")
        print("-" * 40)

    # 5. Search demo
    print("\n🔎 SEARCH DEMO")
    print("-" * 60)
    try:
        query = input(
            "Enter a search query (e.g., 'money', 'fight', 'love', or press Enter to skip): "
        )
    except EOFError:
        # No interactive stdin (pipe, cron, CI): skip the demo.
        query = ""

    if query:
        query_vec = model.encode([query])[0]
        # A plain dot product is a cosine only for unit-length vectors;
        # normalise explicitly so the printed percentage is meaningful.
        sims = _cosine_similarities(embeddings, query_vec)

        # Best matches, highest similarity first
        top_indices = np.argsort(sims)[-_TOP_MATCHES:][::-1]

        for idx in top_indices:
            print(
                f"✅ Match ({sims[idx] * 100:.1f}%): [{times[idx]:.1f}s] {texts[idx]}"
            )


def main():
    """Parse the command line and run the analysis."""
    # Checked before argument parsing so a missing dependency is reported
    # first, exactly as when the check ran at import time.
    _require_deps()
    parser = argparse.ArgumentParser(description="Semantic Analysis PoC")
    parser.add_argument("--uuid", default="384b0ff44aaaa1f1", help="Video UUID")
    parser.add_argument(
        "--topics", type=int, default=5, help="Number of topics to find"
    )
    args = parser.parse_args()

    run_analysis(args.uuid, args.topics)


if __name__ == "__main__":
    main()