#!/opt/homebrew/bin/python3.11
"""ASR / lip correspondence analysis.

Compares the time ranges of ASR transcription segments with per-frame lip
detection results and prints a per-segment table plus a summary.
"""
import json
import sys


def load_json(path):
    """Return the object decoded from the JSON file at *path*.

    The file is opened explicitly as UTF-8 so that non-ASCII text is decoded
    the same way regardless of the platform's locale encoding.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def _percent(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when *whole* is zero."""
    return part / whole * 100 if whole else 0.0


def analyze_asr_lip(asr_path, lip_path, max_segments=20):
    """Analyze how ASR segments line up with lip-detection frames.

    For each analyzed ASR segment, the lip frames whose ``timestamp`` lies in
    the closed interval ``[start, end]`` are collected; the function reports
    how many of them are flagged ``is_speaking`` and the mean ``lip_openness``
    of the frames flagged ``face_detected``.

    Args:
        asr_path: Path to a JSON file with a ``segments`` list; each segment
            must have ``start`` and ``end`` and may have ``text``.
        lip_path: Path to a JSON file with a ``frames`` list; each frame must
            have ``timestamp`` and may have ``is_speaking``, ``lip_openness``
            and ``face_detected``.
        max_segments: Number of leading ASR segments to analyze. Defaults to
            20; pass ``None`` to analyze every segment.

    Returns:
        A dict with the keys ``total_asr_segments``, ``with_lip_detection``,
        ``without_lip_detection``, ``speaking_detected``, ``not_speaking``,
        ``avg_openness`` (one mean per segment that had lip frames) and
        ``match_rate`` (percentage of segments with lip frames in which at
        least one frame was speaking).

    Raises:
        KeyError: If a segment lacks ``start``/``end`` or a frame lacks
            ``timestamp``.
    """
    # Load the input data.
    print(f"[Load] ASR: {asr_path}")
    asr_data = load_json(asr_path)

    print(f"[Load] Lip: {lip_path}")
    lip_data = load_json(lip_path)

    asr_segments = asr_data.get('segments', [])
    lip_frames = lip_data.get('frames', [])

    print(f"\n[Data] ASR segments: {len(asr_segments)}")
    print(f"[Data] Lip frames: {len(lip_frames)}")
    print()

    # Per-segment table.
    print("=" * 80)
    print("ASR 與 Lip 對應分析")
    print("=" * 80)
    print()

    stats = {
        'total_asr_segments': len(asr_segments),
        'with_lip_detection': 0,
        'without_lip_detection': 0,
        'speaking_detected': 0,
        'not_speaking': 0,
        'avg_openness': [],
        'match_rate': 0.0
    }

    print(f"{'ASR 段':<6} {'時間範圍':<15} {'文字':<30} {'Lip 幀數':<10} {'說話':<10} {'平均開合度'}")
    print("-" * 100)

    # Only the leading `max_segments` segments are analyzed (all when None).
    analyzed_segments = asr_segments[:max_segments]

    for index, asr_seg in enumerate(analyzed_segments, start=1):
        asr_start = asr_seg['start']
        asr_end = asr_seg['end']
        asr_text = asr_seg.get('text', '')[:28]

        # Lip frames whose timestamp falls inside this segment (inclusive).
        lip_in_range = [
            f for f in lip_frames
            if asr_start <= f['timestamp'] <= asr_end
        ]

        if not lip_in_range:
            stats['without_lip_detection'] += 1
            print(f"{index:<6} {asr_start:.1f}-{asr_end:.1f}s{'':<5} {asr_text:<30} {'0':<10} {'-':<10} {'-':<10}")
            continue

        stats['with_lip_detection'] += 1

        # Speaking state and openness for the frames in range.
        speaking_count = sum(1 for f in lip_in_range if f.get('is_speaking', False))
        # A frame without a 'face_detected' key is treated as "no face"
        # instead of aborting the whole analysis with a KeyError.
        openness_values = [
            f.get('lip_openness', 0)
            for f in lip_in_range
            if f.get('face_detected', False)
        ]

        if speaking_count > 0:
            stats['speaking_detected'] += 1
            speak_status = f"✅ {speaking_count}/{len(lip_in_range)}"
        else:
            stats['not_speaking'] += 1
            speak_status = f"❌ 0/{len(lip_in_range)}"

        avg_openness = sum(openness_values) / len(openness_values) if openness_values else 0
        stats['avg_openness'].append(avg_openness)

        print(f"{index:<6} {asr_start:.1f}-{asr_end:.1f}s{'':<5} {asr_text:<30} {len(lip_in_range):<10} {speak_status:<10} {avg_openness:.3f}")

    # Match rate: share of segments with lip frames where speaking was seen.
    stats['match_rate'] = _percent(stats['speaking_detected'], stats['with_lip_detection'])

    # Coverage percentages are relative to the segments actually analyzed;
    # dividing by the total would understate them for long transcripts and
    # raise ZeroDivisionError for an empty one.
    analyzed_count = len(analyzed_segments)
    with_pct = _percent(stats['with_lip_detection'], analyzed_count)
    without_pct = _percent(stats['without_lip_detection'], analyzed_count)

    print()
    print("=" * 80)
    print("統計摘要")
    print("=" * 80)
    print()
    print(f"ASR 總段數:{stats['total_asr_segments']}")
    print(f"有 Lip 檢測:{stats['with_lip_detection']} ({with_pct:.1f}%)")
    print(f"無 Lip 檢測:{stats['without_lip_detection']} ({without_pct:.1f}%)")
    print()
    print(f"檢測到說話:{stats['speaking_detected']} ({stats['match_rate']:.1f}%)")
    print(f"未檢測說話:{stats['not_speaking']}")
    print()

    if stats['avg_openness']:
        overall_avg = sum(stats['avg_openness']) / len(stats['avg_openness'])
        print(f"平均嘴部開合度:{overall_avg:.4f}")
        print()

    return stats


if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: python3 analyze_asr_lip.py <asr_json> <lip_json>")
        sys.exit(1)

    analyze_asr_lip(sys.argv[1], sys.argv[2])