Files
momentry_core/scripts/face_clustering_processor.py
Warren 8f05a7c188 feat: update Python processors and add utility scripts
- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00

283 lines
8.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/opt/homebrew/bin/python3.11
"""
Face Clustering Processor
職責:將短暫的 Face ID 聚合為持續的 Person ID並自動綁定 Speaker。
"""
import cv2
import json
import numpy as np
import os
import sys
import psycopg2
from sklearn.cluster import AgglomerativeClustering
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
try:
from deepface import DeepFace
HAS_DEEPFACE = True
except ImportError:
print("❌ DeepFace not found. Run: pip install deepface")
sys.exit(1)
# 設定
UUID = os.getenv("UUID", "quick_preview")
OUTPUT_DIR = os.getenv("MOMENTRY_OUTPUT_DIR", "./output")
VIDEO_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.mp4")
FACE_JSON_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.face.json")
OUTPUT_JSON_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.face_clustered.json")
ASRX_JSON_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.asrx.json")
DB_URL = os.getenv("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")
def optimized_clustering(embeddings):
"""
Optimized Clustering for large datasets (e.g. 25k faces).
Strategy: Sample -> Agglomerative -> Centroid Assignment
"""
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_distances
n_faces = len(embeddings)
print(f" 🚀 Starting optimized clustering for {n_faces} faces...")
# 1. Sampling
sample_size = min(5000, n_faces)
if n_faces > sample_size:
indices = np.random.choice(n_faces, sample_size, replace=False)
sample_embeddings = embeddings[indices]
else:
sample_embeddings = embeddings
indices = np.arange(n_faces)
print(f" 📊 Sampling {len(sample_embeddings)} faces for clustering structure...")
# 2. Agglomerative Clustering on Sample
clustering = AgglomerativeClustering(
n_clusters=None, distance_threshold=0.4, metric="cosine", linkage="average"
)
sample_labels = clustering.fit_predict(sample_embeddings)
unique_labels = set(sample_labels)
n_clusters = len(unique_labels)
print(f" 🔍 Found {n_clusters} unique clusters in sample.")
# 3. Compute Centroids for each cluster
centroids = []
for label in unique_labels:
cluster_mask = sample_labels == label
cluster_faces = sample_embeddings[cluster_mask]
# Mean embedding
centroid = np.mean(cluster_faces, axis=0)
centroids.append(centroid)
centroids = np.array(centroids) # Shape: (n_clusters, 512)
# 4. Assign all faces to nearest centroid
# Batch processing to save memory
print(f" 🏃 Assigning {n_faces} faces to {n_clusters} clusters...")
all_labels = np.zeros(n_faces, dtype=int)
batch_size = 5000
for start in range(0, n_faces, batch_size):
end = min(start + batch_size, n_faces)
batch = embeddings[start:end]
dists = cosine_distances(batch, centroids)
all_labels[start:end] = np.argmin(dists, axis=1)
return all_labels
def main():
if not os.path.exists(FACE_JSON_PATH):
print("❌ Face JSON not found.")
return
with open(FACE_JSON_PATH) as f:
face_data = json.load(f)
frames_list = face_data.get("frames", [])
if not frames_list:
print("❌ No frames in JSON.")
return
cap = cv2.VideoCapture(VIDEO_PATH)
embeddings = []
face_refs = []
print(f"🔍 Extracting face embeddings from {UUID}...")
for frame_idx, frame_obj in enumerate(frames_list):
ts = frame_obj.get("timestamp")
faces = frame_obj.get("faces", [])
if not faces:
continue
if ts is not None:
cap.set(cv2.CAP_PROP_POS_MSEC, ts * 1000)
ret, frame = cap.read()
if not ret:
continue
for face_idx, face in enumerate(faces):
x, y, w, h = face["x"], face["y"], face["width"], face["height"]
margin = 5
crop = frame[
max(0, y - margin) : y + h + margin, max(0, x - margin) : x + w + margin
]
if crop is None or crop.size == 0:
continue
try:
res = DeepFace.represent(
img_path=crop, model_name="ArcFace", enforce_detection=False
)
if res and "embedding" in res[0]:
embeddings.append(res[0]["embedding"])
face_refs.append({"frame_idx": frame_idx, "face_idx": face_idx})
except Exception:
pass
cap.release()
if not embeddings:
print("❌ No embeddings extracted.")
return
embeddings = np.array(embeddings)
print(f"✅ Extracted {len(embeddings)} face embeddings.")
# 2. 聚類
print(f"🧠 Clustering {len(embeddings)} faces...")
clustering = AgglomerativeClustering(
n_clusters=None, distance_threshold=0.4, metric="cosine", linkage="average"
)
labels = clustering.fit_predict(embeddings)
unique_labels = set(labels)
label_to_person = {l: f"Person_{i}" for i, l in enumerate(unique_labels)}
print(
f"👥 Detected {len(unique_labels)} unique persons: {[label_to_person[l] for l in unique_labels]}"
)
# 3. 更新 JSON
for ref, label in zip(face_refs, labels):
f_idx = ref["frame_idx"]
face_idx = ref["face_idx"]
person_id = label_to_person[label]
if f_idx < len(frames_list):
faces = frames_list[f_idx].get("faces", [])
if face_idx < len(faces):
frames_list[f_idx]["faces"][face_idx]["person_id"] = person_id
# 保存
with open(OUTPUT_JSON_PATH, "w", encoding="utf-8") as f:
json.dump(face_data, f, indent=2, ensure_ascii=False)
print(f"✅ Saved clustered data to {OUTPUT_JSON_PATH}")
# 4. 自動綁定 Speaker
auto_bind_speakers()
def auto_bind_speakers():
if not os.path.exists(OUTPUT_JSON_PATH) or not os.path.exists(ASRX_JSON_PATH):
print("⚠️ Missing data for speaker binding.")
return
with open(OUTPUT_JSON_PATH) as f:
face_clustered = json.load(f)
with open(ASRX_JSON_PATH) as f:
asrx_data = json.load(f)
print("🔗 Auto-binding Speakers to Persons...")
# 建立 Face 時間列表
face_spans = []
for frame_obj in face_clustered.get("frames", []):
ts = frame_obj.get("timestamp")
for face in frame_obj.get("faces", []):
person_id = face.get("person_id")
if person_id and ts is not None:
face_spans.append({"ts": ts, "person_id": person_id})
speaker_person_counts = {}
# 對於每個說話片段,找出畫面中出現的人
for seg in asrx_data.get("segments", []):
start = seg.get("start")
end = seg.get("end")
speaker = seg.get("speaker_id")
if not speaker:
continue
# 找時間重疊
candidates = [f for f in face_spans if start <= f["ts"] <= end]
if candidates:
# 投票
person_counts = {}
for c in candidates:
pid = c["person_id"]
person_counts[pid] = person_counts.get(pid, 0) + 1
if speaker not in speaker_person_counts:
speaker_person_counts[speaker] = {}
best_person = max(person_counts, key=person_counts.get)
speaker_person_counts[speaker][best_person] = (
speaker_person_counts[speaker].get(best_person, 0) + 1
)
# 寫入資料庫
try:
conn = psycopg2.connect(DB_URL)
cur = conn.cursor()
for speaker, persons in speaker_person_counts.items():
if not persons:
continue
best_person = max(persons, key=persons.get)
print(
f" 🎤 {speaker} is likely {best_person} ({persons[best_person]} votes)"
)
# 1. 找或建 Talent
cur.execute("SELECT id FROM talents WHERE real_name = %s", (best_person,))
row = cur.fetchone()
if row:
talent_id = row[0]
else:
cur.execute(
"INSERT INTO talents (real_name) VALUES (%s) RETURNING id",
(best_person,),
)
talent_id = cur.fetchone()[0]
print(f" ✨ Created Talent #{talent_id} ({best_person})")
# 2. 綁定 Speaker
cur.execute(
"""
INSERT INTO identity_bindings (talent_id, binding_type, binding_value, source, confidence)
VALUES (%s, 'speaker', %s, 'auto_cluster', 0.8)
ON CONFLICT (binding_type, binding_value) DO UPDATE SET talent_id = EXCLUDED.talent_id
""",
(talent_id, speaker),
)
print(f" ✅ Bound {speaker} -> {best_person}")
conn.commit()
cur.close()
conn.close()
except Exception as e:
print(f" ❌ DB Error: {e}")
if __name__ == "__main__":
main()