Files
momentry_core/scripts/analyze_asr_lip.py
Warren 8f05a7c188 feat: update Python processors and add utility scripts
- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00

115 lines
3.8 KiB
Python
Executable File

#!/opt/homebrew/bin/python3.11
"""
ASR + Lip 對應分析
分析 ASR 轉錄時間段與 Lip 嘴部檢測的對應關係
"""
import json
import sys
def load_json(path):
    """Read the file at *path* and return its parsed JSON content.

    The file is decoded explicitly as UTF-8 instead of with the platform's
    locale encoding, so JSON containing non-ASCII text is read the same way
    on every system.

    Args:
        path: Filesystem path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)
def _percentage(part, whole):
    """Return *part* as a percentage of *whole*, or 0.0 when *whole* is 0."""
    return part / whole * 100 if whole else 0.0


def analyze_asr_lip(asr_path, lip_path, max_segments=20):
    """Compare ASR transcript segments with lip-detection frames.

    Loads both JSON files, then for each analysed ASR segment collects the
    lip frames whose ``timestamp`` lies within the segment's inclusive
    ``[start, end]`` range. A per-segment table and a summary are printed to
    stdout.

    Args:
        asr_path: Path to a JSON file whose top-level ``segments`` list holds
            dicts with ``start`` and ``end`` (required) and ``text``
            (optional).
        lip_path: Path to a JSON file whose top-level ``frames`` list holds
            dicts with ``timestamp`` (required) and ``is_speaking``,
            ``lip_openness`` and ``face_detected`` (optional; a frame without
            ``face_detected`` is treated as having no face).
        max_segments: How many leading ASR segments to analyse. ``None``
            analyses every segment. Defaults to 20.

    Returns:
        A dict with these keys:
            ``total_asr_segments``: number of segments in the ASR file.
            ``analyzed_segments``: number of segments actually analysed.
            ``with_lip_detection`` / ``without_lip_detection``: analysed
                segments that do / do not contain at least one lip frame.
            ``speaking_detected`` / ``not_speaking``: segments with lip
                frames in which at least one / no frame has ``is_speaking``.
            ``avg_openness``: list with one mean ``lip_openness`` value per
                segment that has lip frames (0 when no frame in the segment
                has a detected face).
            ``match_rate``: ``speaking_detected`` as a percentage of
                ``with_lip_detection`` (0.0 when there are none).

    Raises:
        KeyError: If a segment lacks ``start``/``end`` or a frame lacks
            ``timestamp``.
    """
    # Load both inputs.
    print(f"[Load] ASR: {asr_path}")
    asr_data = load_json(asr_path)
    print(f"[Load] Lip: {lip_path}")
    lip_data = load_json(lip_path)

    asr_segments = asr_data.get('segments', [])
    lip_frames = lip_data.get('frames', [])
    analyzed = asr_segments if max_segments is None else asr_segments[:max_segments]

    print(f"\n[Data] ASR segments: {len(asr_segments)}")
    print(f"[Data] Lip frames: {len(lip_frames)}")
    print()

    # Per-segment table: which lip frames fall inside each ASR segment.
    print("=" * 80)
    print("ASR 與 Lip 對應分析")
    print("=" * 80)
    print()

    stats = {
        'total_asr_segments': len(asr_segments),
        'analyzed_segments': len(analyzed),
        'with_lip_detection': 0,
        'without_lip_detection': 0,
        'speaking_detected': 0,
        'not_speaking': 0,
        'avg_openness': [],
        'match_rate': 0.0,
    }

    print(f"{'ASR 段':<6} {'時間範圍':<15} {'文字':<30} {'Lip 幀數':<10} {'說話':<10} {'平均開合度'}")
    print("-" * 100)

    for i, asr_seg in enumerate(analyzed):
        asr_start = asr_seg['start']
        asr_end = asr_seg['end']
        asr_text = asr_seg.get('text', '')[:28]  # truncate so the table column stays aligned

        # Lip frames whose timestamp falls inside this segment (inclusive).
        lip_in_range = [
            f for f in lip_frames
            if asr_start <= f['timestamp'] <= asr_end
        ]

        if not lip_in_range:
            stats['without_lip_detection'] += 1
            print(f"{i+1:<6} {asr_start:.1f}-{asr_end:.1f}s{'':<5} {asr_text:<30} {'0':<10} {'-':<10} {'-':<10}")
            continue

        stats['with_lip_detection'] += 1

        # Count speaking frames; average openness only over frames with a face.
        speaking_count = sum(1 for f in lip_in_range if f.get('is_speaking', False))
        openness_values = [
            f.get('lip_openness', 0)
            for f in lip_in_range
            if f.get('face_detected', False)
        ]

        if speaking_count > 0:
            stats['speaking_detected'] += 1
            speak_status = f"{speaking_count}/{len(lip_in_range)}"
        else:
            stats['not_speaking'] += 1
            speak_status = f"❌ 0/{len(lip_in_range)}"

        avg_openness = sum(openness_values) / len(openness_values) if openness_values else 0
        stats['avg_openness'].append(avg_openness)

        print(f"{i+1:<6} {asr_start:.1f}-{asr_end:.1f}s{'':<5} {asr_text:<30} {len(lip_in_range):<10} {speak_status:<10} {avg_openness:.3f}")

    # Match rate: share of segments with lip frames where speech was detected.
    stats['match_rate'] = _percentage(stats['speaking_detected'], stats['with_lip_detection'])

    # Percentages are taken over the segments actually analysed, so the two
    # shares add up to 100% even when the input was truncated, and an empty
    # input no longer divides by zero.
    analyzed_count = stats['analyzed_segments']
    with_pct = _percentage(stats['with_lip_detection'], analyzed_count)
    without_pct = _percentage(stats['without_lip_detection'], analyzed_count)

    print()
    print("=" * 80)
    print("統計摘要")
    print("=" * 80)
    print()
    print(f"ASR 總段數:{stats['total_asr_segments']}")
    if analyzed_count != stats['total_asr_segments']:
        # Only shown when truncated, to make the percentage base explicit.
        print(f"分析段數:{analyzed_count}")
    print(f"有 Lip 檢測:{stats['with_lip_detection']} ({with_pct:.1f}%)")
    print(f"無 Lip 檢測:{stats['without_lip_detection']} ({without_pct:.1f}%)")
    print()
    print(f"檢測到說話:{stats['speaking_detected']} ({stats['match_rate']:.1f}%)")
    print(f"未檢測說話:{stats['not_speaking']}")
    print()

    if stats['avg_openness']:
        overall_avg = sum(stats['avg_openness']) / len(stats['avg_openness'])
        print(f"平均嘴部開合度:{overall_avg:.4f}")
        print()

    return stats
if __name__ == "__main__":
    # Script entry point: two positional paths are required after the
    # script name; anything beyond the first two is ignored.
    cli_args = sys.argv[1:]
    if len(cli_args) < 2:
        print("Usage: python3 analyze_asr_lip.py <asr.json> <lip.json>")
        sys.exit(1)
    asr_json_path, lip_json_path = cli_args[0], cli_args[1]
    analyze_asr_lip(asr_json_path, lip_json_path)