fix: ASRX duplication, TKG edges, trace ingest, and add pipeline progress publishing

- ASRX handler no longer stores duplicate 'asr' pre_chunks
- Pre_chunks storage made idempotent (delete-before-insert)
- Rule 1 + trace_ingest changed to query 'asrx' not 'asr'
- Trace chunks removed (dynamic from TKG/Qdrant)
- TKG scroll_face_points fixed: trace_id >= 1 (not == 1)
- TKG AsrxSegmentEntry: start/end -> start_time/end_time (match ASRX JSON)
- Unregister error handling: log instead of silent discard
- Add publish_pipeline_progress calls at each pipeline stage
  (processors, rule1, face_trace, identity_agent, TKG, rule2, completion)
This commit is contained in:
Accusys
2026-07-02 10:43:46 +08:00
parent d791d138f2
commit 3eabd45882
65 changed files with 9477 additions and 3852 deletions
+237 -74
View File
@@ -1,15 +1,17 @@
#!/opt/homebrew/bin/python3.11
"""
Appearance Processor - HSV color feature extraction for person tracking
Appearance Processor - Body part color extraction using pose keypoints
Input:
- video_path: source video
- pose_json: pose.json with frame bboxes
- pose_json: pose.json with keypoints and bbox
- output_path: output JSON
Output: appearance.json with HSV histogram per person per frame
Output: appearance.json with per-person per-frame body part colors
Depends on pose.json (bbox). Same 0-based frame numbering as face/pose/mediapipe.
Regions: head, neck, front_upper_body, front_lower_body,
back_upper_body, back_lower_body, left_hand, right_hand,
left_foot, right_foot
"""
import sys
@@ -20,82 +22,223 @@ import cv2
import numpy as np
def extract_appearance(frame, bbox):
x, y, w, h = bbox["x"], bbox["y"], bbox["width"], bbox["height"]
if w <= 0 or h <= 0:
return None
def get_kp(keypoints, name):
    """Look up a pose keypoint by its name.

    Args:
        keypoints: iterable of keypoint dicts carrying "name", "x", "y" and
            optionally "confidence".
        name: keypoint name to search for (e.g. "nose").

    Returns:
        An ``(x, y, confidence)`` tuple for the first keypoint whose "name"
        equals ``name`` (confidence falls back to 1.0 when the key is
        absent), or None when no keypoint matches.
    """
    match = next((kp for kp in keypoints if kp.get("name") == name), None)
    if match is None:
        return None
    return (match["x"], match["y"], match.get("confidence", 1.0))
x1, y1 = max(0, x), max(0, y)
x2 = min(frame.shape[1], x + w)
y2 = min(frame.shape[0], y + h)
if x2 <= x1 or y2 <= y1:
return None
person_roi = frame[y1:y2, x1:x2]
hsv = cv2.cvtColor(person_roi, cv2.COLOR_BGR2HSV)
def determine_facing(keypoints):
    """Classify which way a person faces from nose/shoulder confidences.

    Returns:
        "front"   - the nose keypoint has confidence above 0.5;
        "back"    - both shoulders exceed 0.5 confidence while the nose is
                    missing or below 0.2;
        "profile" - at least one shoulder exceeds 0.5 confidence;
        "unknown" - none of the above.
    """
    nose, *shoulders = [
        get_kp(keypoints, part)
        for part in ("nose", "left_shoulder", "right_shoulder")
    ]
    nose_found = bool(nose)

    # A confidently detected nose settles the question immediately.
    if nose_found and nose[2] > 0.5:
        return "front"

    confident_shoulders = [s for s in shoulders if s and s[2] > 0.5]
    if not confident_shoulders:
        return "unknown"

    # Both shoulders in view but (almost) no nose: the camera sees the back.
    if len(confident_shoulders) >= 2 and (not nose_found or nose[2] < 0.2):
        return "back"
    return "profile"
def extract_color(roi_bgr):
"""Extract HSV histogram and dominant colors from an ROI"""
if roi_bgr is None or roi_bgr.size == 0:
return None
if roi_bgr.shape[0] < 2 or roi_bgr.shape[1] < 2:
return None
hsv = cv2.cvtColor(roi_bgr, cv2.COLOR_BGR2HSV)
pixels = hsv.reshape(-1, 3).astype(np.float32)
# HSV histograms
h_hist = cv2.calcHist([hsv], [0], None, [30], [0, 180]).flatten()
s_hist = cv2.calcHist([hsv], [1], None, [32], [0, 256]).flatten()
v_hist = cv2.calcHist([hsv], [2], None, [32], [0, 256]).flatten()
h_sum = h_hist.sum() or 1
s_sum = s_hist.sum() or 1
v_sum = v_hist.sum() or 1
hs = h_hist.sum() or 1
ss = s_hist.sum() or 1
vs = v_hist.sum() or 1
# Dominant colors via k-means
dominant = []
if len(pixels) >= 5:
criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
_, labels, centers = cv2.kmeans(
pixels, 5, None, criteria, 10, cv2.KMEANS_RANDOM_CENTERS
)
_, labels, centers = cv2.kmeans(pixels, 5, None, criteria, 10, cv2.KMEANS_RANDOM_CENTERS)
counts = np.bincount(labels.flatten())
dominant = centers[np.argsort(-counts)[:5]].tolist()
elif len(pixels) > 0:
dominant = [pixels.mean(axis=0).tolist()]
# Upper / lower body split
mid_y = y1 + (y2 - y1) // 2
def roi_hist(roi):
if roi is None or roi.size == 0:
return None
hsv_r = cv2.cvtColor(roi, cv2.COLOR_BGR2HSV)
hh = cv2.calcHist([hsv_r], [0], None, [30], [0, 180]).flatten()
sh = cv2.calcHist([hsv_r], [1], None, [32], [0, 256]).flatten()
vh = cv2.calcHist([hsv_r], [2], None, [32], [0, 256]).flatten()
hs = hh.sum() or 1
ss = sh.sum() or 1
vs = vh.sum() or 1
return [(hh / hs).tolist(), (sh / ss).tolist(), (vh / vs).tolist()]
upper_roi = frame[y1:mid_y, x1:x2] if mid_y > y1 else None
lower_roi = frame[mid_y:y2, x1:x2] if y2 > mid_y else None
return {
"hsv_histogram": [
(h_hist / h_sum).tolist(),
(s_hist / s_sum).tolist(),
(v_hist / v_sum).tolist(),
],
"hsv_histogram": [(h_hist / hs).tolist(), (s_hist / ss).tolist(), (v_hist / vs).tolist()],
"dominant_colors": dominant,
"upper_body": roi_hist(upper_roi),
"lower_body": roi_hist(lower_roi),
}
def safe_roi(frame, x, y, w, h):
    """Crop ``frame`` to the box (x, y, w, h), clipped to the frame bounds.

    Coordinates are truncated to integers before clipping. Returns None when
    the requested box has a non-positive width/height or when nothing of it
    remains inside the frame; otherwise returns the slice
    ``frame[top:bottom, left:right]``.
    """
    if w <= 0 or h <= 0:
        return None

    frame_h, frame_w = frame.shape[0], frame.shape[1]
    left, top = max(0, int(x)), max(0, int(y))
    right = min(frame_w, int(x + w))
    bottom = min(frame_h, int(y + h))

    if right > left and bottom > top:
        return frame[top:bottom, left:right]
    return None
def compute_body_regions(keypoints, face_bbox, frame_shape):
    """Derive per-body-part boxes from a face box and pose keypoints.

    The face box supplies the scale (every region size is a multiple of the
    face width/height); pose keypoints supply the anchor positions. When a
    keypoint is missing, its anchor is estimated from the face box or from
    anchors already computed.

    Args:
        keypoints: list of keypoint dicts as accepted by ``get_kp``.
        face_bbox: dict with "x", "y", "width", "height".
        frame_shape: accepted for interface compatibility; not used here.
            The returned boxes are NOT clipped to the frame.

    Returns:
        dict mapping region name (head, neck, front/back upper and lower
        body, left/right hand, left/right foot) to a dict with "x", "y",
        "width", "height". Each box is expanded outward by a per-region
        margin.
    """
    fx, fy, fw, fh = face_bbox["x"], face_bbox["y"], face_bbox["width"], face_bbox["height"]
    face_cx = fx + fw / 2

    nose = get_kp(keypoints, "nose")
    ls = get_kp(keypoints, "left_shoulder")
    rs = get_kp(keypoints, "right_shoulder")
    lw = get_kp(keypoints, "left_wrist")
    rw = get_kp(keypoints, "right_wrist")
    lh = get_kp(keypoints, "left_hip")
    rh = get_kp(keypoints, "right_hip")
    la = get_kp(keypoints, "left_ankle")
    ra = get_kp(keypoints, "right_ankle")

    # Anchor points, each with a face-derived fallback.
    kp_nose = (nose[0], nose[1]) if nose else (face_cx, fy + fh * 0.5)
    kp_sh_l = ls[0] if ls else (face_cx - fw * 1.5)
    kp_sh_r = rs[0] if rs else (face_cx + fw * 1.5)
    kp_sh_mid_x = (kp_sh_l + kp_sh_r) / 2
    kp_sh_mid_y = ((ls[1] + rs[1]) / 2) if (ls and rs) else (fy + fh + fh * 0.3)
    kp_hip_y = ((lh[1] + rh[1]) / 2) if (lh and rh) else (kp_sh_mid_y + fw * 2.0)
    kp_hip_l = lh[0] if lh else (kp_sh_mid_x - fw * 1.2)
    kp_hip_r = rh[0] if rh else (kp_sh_mid_x + fw * 1.2)
    # Equals kp_sh_mid_x when both hips are missing (symmetric fallbacks).
    kp_hip_mid_x = (kp_hip_l + kp_hip_r) / 2

    regions = {}

    # head: nose-aligned, face-proportional
    head_w = fw * 1.6
    head_h = fh * 1.5
    regions["head"] = {
        "x": kp_nose[0] - head_w / 2,
        "y": kp_nose[1] - head_h * 0.5,
        "width": head_w,
        "height": head_h,
    }

    # neck: nose-to-shoulder, face-width
    neck_w = fw * 1.5
    regions["neck"] = {
        "x": kp_sh_mid_x - neck_w / 2,
        "y": kp_nose[1] + fh * 0.4,
        "width": neck_w,
        "height": max(kp_sh_mid_y - kp_nose[1] - fh * 0.4, fh * 0.3),
    }

    # upper body: shoulder-aligned
    ub_w = max(abs(kp_sh_r - kp_sh_l) * 1.3, fw * 3.0)
    ub_h = fh * 3.0
    regions["front_upper_body"] = {
        "x": kp_sh_mid_x - ub_w / 2,
        "y": kp_sh_mid_y,
        "width": ub_w,
        "height": ub_h,
    }
    regions["back_upper_body"] = dict(regions["front_upper_body"])

    # lower body: hip-aligned. Centre on the hip midpoint (not the shoulder
    # midpoint) so the box follows the hips when the torso leans.
    lb_w = max(abs(kp_hip_r - kp_hip_l) * 1.3, fw * 3.5)
    lb_h = fh * 3.0
    regions["front_lower_body"] = {
        "x": kp_hip_mid_x - lb_w / 2,
        "y": kp_hip_y,
        "width": lb_w,
        "height": lb_h,
    }
    regions["back_lower_body"] = dict(regions["front_lower_body"])

    # hands: wrist-aligned when the wrist is confident enough, otherwise
    # placed beside the shoulders.
    hand_size = fw * 1.0
    if lw and lw[2] > 0.3:
        regions["left_hand"] = {"x": lw[0] - hand_size / 2, "y": lw[1] - hand_size / 2, "width": hand_size, "height": hand_size}
    else:
        regions["left_hand"] = {"x": kp_sh_l - hand_size, "y": kp_sh_mid_y + fh * 0.5, "width": hand_size, "height": hand_size}
    if rw and rw[2] > 0.3:
        regions["right_hand"] = {"x": rw[0] - hand_size / 2, "y": rw[1] - hand_size / 2, "width": hand_size, "height": hand_size}
    else:
        regions["right_hand"] = {"x": kp_sh_r, "y": kp_sh_mid_y + fh * 0.5, "width": hand_size, "height": hand_size}

    # feet: ankle-aligned when the ankle is confident enough, otherwise
    # estimated below the hips.
    foot_size = fw * 1.0
    if la and la[2] > 0.3:
        regions["left_foot"] = {"x": la[0] - foot_size / 2, "y": la[1], "width": foot_size, "height": foot_size * 0.75}
    else:
        regions["left_foot"] = {"x": kp_sh_mid_x - fw * 1.0, "y": kp_hip_y + fh * 2.5, "width": foot_size, "height": foot_size * 0.75}
    if ra and ra[2] > 0.3:
        regions["right_foot"] = {"x": ra[0] - foot_size / 2, "y": ra[1], "width": foot_size, "height": foot_size * 0.75}
    else:
        regions["right_foot"] = {"x": kp_sh_mid_x + fw * 1.0 - foot_size, "y": kp_hip_y + fh * 2.5, "width": foot_size, "height": foot_size * 0.75}

    # Grow every box outward by its margin (a fraction of its own size).
    margins = {
        "head": 0.10, "neck": 0.15,
        "front_upper_body": 0.20, "back_upper_body": 0.20,
        "front_lower_body": 0.15, "back_lower_body": 0.15,
        "left_hand": 0.25, "right_hand": 0.25,
        "left_foot": 0.20, "right_foot": 0.20,
    }
    expanded = {}
    for name, rb in regions.items():
        m = margins.get(name, 0.15)
        dx = int(rb["width"] * m)
        dy = int(rb["height"] * m)
        expanded[name] = {
            "x": rb["x"] - dx,
            "y": rb["y"] - dy,
            "width": rb["width"] + dx * 2,
            "height": rb["height"] + dy * 2,
        }
    return expanded
def filter_by_facing(regions, facing):
    """Drop the torso regions that cannot be seen for the given facing.

    For "front" the back_* body regions are removed, for "back" the front_*
    ones; any other facing leaves the mapping untouched. ``regions`` is
    modified in place and the same object is returned.
    """
    if facing == "front":
        hidden = ("back_upper_body", "back_lower_body")
    elif facing == "back":
        hidden = ("front_upper_body", "front_lower_body")
    else:
        hidden = ()

    for key in hidden:
        regions.pop(key, None)
    return regions
def main():
parser = argparse.ArgumentParser(description="Appearance Processor")
parser.add_argument("video_path", help="Video file path")
parser.add_argument("pose_json", help="Pose JSON path (bbox input)")
parser.add_argument("output_path", help="Output JSON path")
parser.add_argument("video_path")
parser.add_argument("pose_json")
parser.add_argument("output_path")
parser.add_argument("--uuid", "-u", default="")
args = parser.parse_args()
with open(args.pose_json) as f:
pose_data = json.load(f)
# Load face.json for anchor bbox (same directory as pose_json)
face_path = args.pose_json.replace(".pose.json", ".face.json")
face_data = {}
if os.path.exists(face_path):
with open(face_path) as f:
face_data = json.load(f)
# Build frame -> face bbox lookup
face_by_frame = {}
for fframe in face_data.get("frames", []):
fn = fframe.get("frame")
faces = fframe.get("faces", [])
if faces:
face_by_frame[fn] = faces[0] # first face bbox
fps = pose_data.get("fps", 30.0)
cap = cv2.VideoCapture(args.video_path)
@@ -115,38 +258,58 @@ def main():
if not ret:
continue
# Get face bbox for this frame
face_bbox = face_by_frame.get(frame_num, persons[0].get("bbox", {"x": 0, "y": 0, "width": 0, "height": 0}))
frame_persons = []
for pid, person in enumerate(persons):
keypoints = person.get("keypoints", [])
bbox = person.get("bbox", {})
if bbox.get("width", 0) <= 0 or bbox.get("height", 0) <= 0:
if not keypoints:
continue
appearance = extract_appearance(frame, bbox)
if appearance is None:
continue
frame_persons.append(
{
"person_id": pid,
"bbox": bbox,
**appearance,
}
)
facing = determine_facing(keypoints)
all_regions = compute_body_regions(keypoints, face_bbox, frame.shape)
regions = filter_by_facing(all_regions, facing)
body_parts = []
for name, rb in regions.items():
roi = safe_roi(frame, rb["x"], rb["y"], rb["width"], rb["height"])
color = extract_color(roi)
if color is None:
continue
body_parts.append({
"name": name,
"bbox": rb,
"hsv_histogram": color["hsv_histogram"],
"dominant_colors": color["dominant_colors"],
})
# Full bbox reference colors
full = None
if bbox.get("width", 0) > 0 and bbox.get("height", 0) > 0:
full_roi = safe_roi(frame, bbox["x"], bbox["y"], bbox["width"], bbox["height"])
full = extract_color(full_roi)
frame_persons.append({
"person_id": pid,
"bbox": bbox,
"facing": facing,
"body_parts": body_parts,
"dominant_colors": full["dominant_colors"] if full else [],
"hsv_histogram": full["hsv_histogram"] if full else [[], [], []],
})
if frame_persons:
frames_out.append(
{
"frame": frame_num,
"timestamp": pose_frame.get("timestamp", frame_num / fps),
"persons": frame_persons,
}
)
frames_out.append({
"frame": frame_num,
"timestamp": pose_frame.get("timestamp", frame_num / fps),
"persons": frame_persons,
})
cap.release()
output = {
"frame_count": len(frames_out),
"fps": fps,
"frames": frames_out,
}
output = {"frame_count": len(frames_out), "fps": fps, "frames": frames_out}
with open(args.output_path, "w") as f:
json.dump(output, f, indent=2, ensure_ascii=False)
+37 -15
View File
@@ -201,7 +201,12 @@ def run_asr(video_path, output_path, uuid: str = "", fps: float = None):
if not has_audio_stream(video_path):
if publisher:
publisher.info("asr", "No audio stream detected, skipping transcription")
output = {"language": "", "language_probability": 0.0, "segments": []}
output = {
"status": "no_audio_track",
"language": "",
"language_probability": 0.0,
"segments": []
}
with open(output_path, "w") as f:
json.dump(output, f, indent=2)
if publisher:
@@ -336,16 +341,16 @@ def run_asr(video_path, output_path, uuid: str = "", fps: float = None):
seg_start = start_t + segment.start
seg_end = start_t + segment.end
scene_idx = find_scene_idx((seg_start + seg_end) / 2)
scene_segments.append({
"start_time": seg_start,
"end_time": seg_end,
"start_frame": int(round(seg_start * fps)),
"end_frame": int(round(seg_end * fps)),
"text": segment.text.strip(),
"scene_number": scene_idx + 1,
"language": seg_language,
})
total_segments += 1
scene_segments.append({
"start_time": seg_start,
"end_time": seg_end,
"start_frame": int(round(seg_start * fps)),
"end_frame": int(round(seg_end * fps)),
"text": segment.text.strip(),
"scene_number": scene_idx + 1,
"language": seg_language,
})
total_segments += 1
# 當前 scene 結果寫入 .asr.tmp
all_segments.extend(scene_segments)
@@ -365,8 +370,18 @@ def run_asr(video_path, output_path, uuid: str = "", fps: float = None):
try: os.rmdir(temp_dir)
except: pass
# Determine status for cut_scenes branch
if total_segments > 0:
status = "has_transcript"
else:
status = "silent_audio"
info_language = transcript_language or "unknown"
print(f"[ASR] Segmented transcription complete: {total_segments} segments", file=sys.stderr)
print(f"[ASR] Segmented transcription complete: {total_segments} segments, status={status}", file=sys.stderr)
# Write final output with status
with open(tmp_path, "w") as f:
json.dump({"status": status, "language": info_language, "segments": all_segments}, f)
else:
# 無 CUT 資料,直接轉錄(原有流程)
segments, info = transcribe_with_fallback(model, video_path, publisher)
@@ -386,8 +401,15 @@ def run_asr(video_path, output_path, uuid: str = "", fps: float = None):
if total_segments % 100 == 0:
if publisher:
publisher.progress("asr", total_segments, 0, f"Segment {total_segments}")
# Determine status for direct transcription branch
if total_segments > 0:
status = "has_transcript"
else:
status = "silent_audio"
with open(tmp_path, "w") as f:
json.dump({"language": info_language, "segments": all_segments}, f)
json.dump({"status": status, "language": info_language, "segments": all_segments}, f)
if publisher:
publisher.info("asr", f"ASR_LANGUAGE:{info_language}")
@@ -396,10 +418,10 @@ def run_asr(video_path, output_path, uuid: str = "", fps: float = None):
os.rename(tmp_path, output_path)
if publisher:
publisher.complete("asr", f"{len(results)} segments")
publisher.complete("asr", f"{total_segments} segments")
sys.stderr.write(
f"ASR: Transcription complete, {len(results)} segments written to {output_path}\n"
f"ASR: Transcription complete, {total_segments} segments written to {output_path}\n"
)
sys.stderr.flush()
sys.exit(0)
+43 -4
View File
@@ -126,9 +126,17 @@ def _convert_result(result, output_path):
except Exception:
pass
segment_count = len(result.get("segments", []))
if segment_count > 0:
status = "has_transcript"
else:
status = "silent_audio"
output_result = {
"status": status,
"language": result.get("language"),
"segments": [],
"segment_count": segment_count,
"n_speakers": result.get("n_speakers", 0),
"speaker_stats": result.get("speaker_stats", {}),
}
@@ -172,6 +180,37 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
if publisher:
publisher.info("asrx", "ASRX_START")
# Check for audio stream first
tracks = probe_audio_tracks(video_path)
if not tracks:
if publisher:
publisher.info("asrx", "No audio stream detected")
output_result = {"status": "no_audio_track", "language": None, "segments": [], "segment_count": 0}
_atomic_write(output_path, output_result)
if publisher:
publisher.complete("asrx", "0 segments (no audio)")
print("[ASRX] No audio stream, skipping", file=sys.stderr)
return output_result
# Check if ASR already determined no audio/silent - skip processing
asr_path = output_path.replace(".asrx.json", ".asr.json")
if os.path.exists(asr_path):
try:
with open(asr_path) as f:
asr_data = json.load(f)
asr_status = asr_data.get("status", "")
if asr_status in ("no_audio_track", "silent_audio"):
if publisher:
publisher.info("asrx", f"ASR status={asr_status}, skipping ASRX processing")
output_result = {"status": asr_status, "language": asr_data.get("language"), "segments": [], "segment_count": 0}
_atomic_write(output_path, output_result)
if publisher:
publisher.complete("asrx", f"0 segments (ASR: {asr_status})")
print(f"[ASRX] ASR status={asr_status}, skipping", file=sys.stderr)
return output_result
except Exception as e:
print(f"[ASRX] Failed to read ASR output: {e}", file=sys.stderr)
checkpoint_path = output_path + ".stage1.json"
# ── Phase 2: Resume from checkpoint (Steps 4-7 only) ──
@@ -189,7 +228,7 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
if "error" in result:
if publisher:
publisher.error("asrx", result["error"])
output_result = {"language": None, "segments": []}
output_result = {"status": "silent_audio", "language": None, "segments": [], "segment_count": 0}
_atomic_write(output_path, output_result)
if publisher:
publisher.complete("asrx", "0 segments")
@@ -225,7 +264,7 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
publisher.error("asrx", str(e))
import traceback
traceback.print_exc()
output_result = {"language": None, "segments": []}
output_result = {"status": "silent_audio", "language": None, "segments": [], "segment_count": 0}
_atomic_write(output_path, output_result)
if publisher:
publisher.complete("asrx", "0 segments")
@@ -289,7 +328,7 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
if "error" in result:
if publisher:
publisher.error("asrx", result["error"])
output_result = {"language": None, "segments": []}
output_result = {"status": "silent_audio", "language": None, "segments": [], "segment_count": 0}
_atomic_write(output_path, output_result)
if publisher:
publisher.complete("asrx", "0 segments")
@@ -320,7 +359,7 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
import traceback
traceback.print_exc()
output_result = {"language": None, "segments": []}
output_result = {"status": "silent_audio", "language": None, "segments": [], "segment_count": 0}
_atomic_write(output_path, output_result)
if publisher:
publisher.complete("asrx", "0 segments")
+21 -13
View File
@@ -216,19 +216,27 @@ class SelfASRXFixed:
return {"error": "No speech detected", "segments": []}
# ── Step 2: VAD scan 每個 rough segment 細切 ──
print("\n[Step 2] VAD scan for refined segmentation...")
t2 = time.time()
refined_segments = []
for seg in rough_segments:
s = seg["start"]
e = seg["end"]
sub = self._vad_scan_segment(wav, sample_rate, s, e)
if sub:
refined_segments.extend(sub)
else:
refined_segments.append((s, e))
print(f" Refined segments: {len(refined_segments)}")
print(f" Step 2 time: {time.time() - t2:.2f}s")
# Skip VAD if using ASR segments (preserve all ASR segments)
if asr_segments:
print("\n[Step 2] Skipping VAD scan, using ASR segments directly...")
t2 = time.time()
refined_segments = [(seg["start"], seg["end"]) for seg in rough_segments]
print(f" Refined segments: {len(refined_segments)}")
print(f" Step 2 time: {time.time() - t2:.2f}s")
else:
print("\n[Step 2] VAD scan for refined segmentation...")
t2 = time.time()
refined_segments = []
for seg in rough_segments:
s = seg["start"]
e = seg["end"]
sub = self._vad_scan_segment(wav, sample_rate, s, e)
if sub:
refined_segments.extend(sub)
else:
refined_segments.append((s, e))
print(f" Refined segments: {len(refined_segments)}")
print(f" Step 2 time: {time.time() - t2:.2f}s")
if not refined_segments:
return {"error": "No segments after VAD scan", "segments": []}
+124 -63
View File
@@ -1,91 +1,152 @@
#!/opt/homebrew/bin/python3.11
"""
CUT Processor - Scene Detection
Uses PySceneDetect for scene detection (local)
CUT Processor - Scene Detection & Video Quality Check
Uses ffprobe for video analysis. Always produces at least 1 scene.
"""
import sys
import json
import argparse
import os
import subprocess
import sys
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from redis_publisher import RedisPublisher
def _parse_frame_rate(rate) -> float:
    """Convert an ffprobe rate string ("30000/1001" or "25") to fps.

    Returns 0.0 for missing, malformed or zero-denominator values instead of
    evaluating the text as Python code.
    """
    numerator, slash, denominator = str(rate).partition("/")
    try:
        num = float(numerator)
        den = float(denominator) if slash else 1.0
    except ValueError:
        return 0.0
    return num / den if den else 0.0


def _to_number(value, cast, default):
    """Apply ``cast`` (int/float) to ``value``; return ``default`` on failure."""
    try:
        return cast(value)
    except (TypeError, ValueError):
        return default


def get_video_info(video_path: str) -> dict:
    """Probe a video with ffprobe and describe its first video stream.

    Args:
        video_path: path of the video file to inspect.

    Returns:
        dict with "frame_count", "fps", "duration", "width", "height" and
        "codec". The frame count comes from the stream's ``nb_frames`` when
        present, otherwise it is estimated as duration * avg_frame_rate.
        The duration falls back to the container ("format") duration when
        the stream does not report one. All-zero values are returned when
        ffprobe fails or the file has no video stream; this function never
        raises for probe failures.
    """
    no_video = {"frame_count": 0, "fps": 0.0, "duration": 0, "width": 0, "height": 0, "codec": ""}
    try:
        result = subprocess.run(
            ["ffprobe", "-v", "quiet", "-print_format", "json",
             "-show_format", "-show_streams", video_path],
            capture_output=True, text=True, timeout=30,
        )
        info = json.loads(result.stdout)
    except (OSError, subprocess.SubprocessError, ValueError) as exc:
        # Best effort: report the reason but keep the pipeline running.
        print(f"[CUT] ffprobe failed for {video_path}: {exc}", file=sys.stderr)
        return no_video
    if not isinstance(info, dict):
        return no_video

    container = info.get("format") or {}
    for stream in info.get("streams") or []:
        if stream.get("codec_type") != "video":
            continue

        # Some containers only report the duration at the format level.
        duration = _to_number(stream.get("duration"), float, 0.0)
        if duration <= 0:
            duration = _to_number(container.get("duration"), float, 0.0)

        described = {
            "width": _to_number(stream.get("width"), int, 0),
            "height": _to_number(stream.get("height"), int, 0),
            "codec": stream.get("codec_name", ""),
        }

        frame_count = _to_number(stream.get("nb_frames"), int, 0)
        if frame_count > 0:
            return {
                "frame_count": frame_count,
                "fps": _parse_frame_rate(stream.get("r_frame_rate", "0/1")),
                "duration": duration,
                **described,
            }

        # No usable nb_frames: estimate from duration and average rate.
        avg_fps = _parse_frame_rate(stream.get("avg_frame_rate", "0/1"))
        if duration > 0 and avg_fps > 0:
            return {
                "frame_count": int(duration * avg_fps),
                "fps": avg_fps,
                "duration": duration,
                **described,
            }

        # Only the first video stream is considered.
        return {
            "frame_count": 0, "fps": 0.0, "duration": duration,
            "width": 0, "height": 0, "codec": "",
        }
    return no_video
def detect_scenes_ffmpeg(video_path: str, fps: float, duration: float) -> list:
    """Detect scene changes with ffprobe's lavfi ``select='gt(scene,0.3)'``.

    Args:
        video_path: path of the video to analyse.
        fps: frame rate used to convert cut times to frame numbers.
        duration: video duration in seconds; closes the last scene.

    Returns:
        List of scene dicts ("scene_number", "start_frame", "end_frame",
        "start_time", "end_time") with consecutive 1-based scene numbers.
        Returns an empty list when ``fps`` is not positive or ffprobe cannot
        be run; a video without detected cuts yields one scene spanning the
        whole duration (when duration * fps rounds to at least one frame).
    """
    if fps <= 0:
        # Without a frame rate no frame range can be computed.
        return []

    try:
        # NOTE(review): video_path is interpolated into the lavfi graph
        # unescaped; paths containing ':' ',' or quotes may need escaping.
        result = subprocess.run(
            ["ffprobe", "-v", "quiet", "-show_entries", "frame=pts_time",
             "-of", "default=nk=0",
             "-f", "lavfi",
             f"movie={video_path},select='gt(scene\\,0.3)',showinfo",
             "-show_frames"],
            capture_output=True, text=True, timeout=300,
        )
    except (OSError, subprocess.SubprocessError, ValueError) as exc:
        print(f"[CUT] scene detection failed for {video_path}: {exc}", file=sys.stderr)
        return []

    # Collect cut timestamps from both streams. A set removes duplicates
    # when the same frame is reported on stderr and stdout.
    cut_times = set()
    for line in (result.stderr + "\n" + result.stdout).splitlines():
        for prefix in ("pts_time=", "pts_time:"):
            _, found, rest = line.partition(prefix)
            if not found:
                continue
            tokens = rest.split()
            if not tokens:
                continue
            try:
                t = float(tokens[0])
            except ValueError:
                continue
            # Rejects negative, infinite and NaN timestamps.
            if 0.0 <= t < float("inf"):
                cut_times.add(t)

    scenes = []
    prev_time = 0.0
    for t in sorted(cut_times):
        start_frame = round(prev_time * fps)
        end_frame = round(t * fps)
        if end_frame > start_frame:
            scenes.append({
                # Number by emitted scenes so skipped cuts leave no gaps or
                # duplicates.
                "scene_number": len(scenes) + 1,
                "start_frame": start_frame,
                "end_frame": end_frame - 1,
                "start_time": prev_time,
                "end_time": t - (1.0 / fps),
            })
        prev_time = t

    # Close the final scene at the end of the video.
    last_frame = round(duration * fps)
    prev_frame = round(prev_time * fps)
    if last_frame > prev_frame:
        scenes.append({
            "scene_number": len(scenes) + 1,
            "start_frame": prev_frame,
            "end_frame": last_frame - 1,
            "start_time": prev_time,
            "end_time": duration,
        })
    return scenes
def process_cut(video_path: str, output_path: str, uuid: str = ""):
"""Process video for scene detection"""
"""Process video for scene detection and quality verification"""
publisher = RedisPublisher(uuid) if uuid else None
if publisher:
publisher.info("cut", "CUT_START")
try:
from scenedetect import VideoManager, SceneManager
from scenedetect.detectors import ContentDetector
except ImportError:
if publisher:
publisher.error("cut", "scenedetect not installed")
result = {"frame_count": 0, "fps": 0.0, "scenes": []}
if publisher:
publisher.complete("cut", "0 scenes")
with open(output_path, "w") as f:
json.dump(result, f, indent=2)
return result
vinfo = get_video_info(video_path)
if publisher:
publisher.info("cut", "CUT_LOADING_VIDEO")
publisher.info("cut", f"fps={vinfo['fps']}, frames={vinfo['frame_count']}, codec={vinfo['codec']}")
# Create video manager and scene manager
video_manager = VideoManager([video_path])
scene_manager = SceneManager()
total_frames = vinfo["frame_count"]
fps = vinfo["fps"]
duration = vinfo["duration"]
# Add content detector (detects scene cuts based on frame differences)
# threshold: sensitivity (lower = more sensitive, default 30)
# min_scene_len: minimum frames per scene (default 15)
scene_manager.add_detector(ContentDetector(threshold=30.0, min_scene_len=15))
# Try ffmpeg scene detection
scenes = detect_scenes_ffmpeg(video_path, fps, duration)
# Set downscale factor for faster processing
video_manager.set_downscale_factor()
if publisher:
publisher.info("cut", "CUT_DETECTING")
# Start video manager
video_manager.start()
# Detect scenes
scene_manager.detect_scenes(frame_source=video_manager)
# Get scene list
scene_list = scene_manager.get_scene_list()
# Get frame rate
fps = video_manager.get_framerate()
if publisher:
publisher.info("cut", f"fps={fps}")
# Get total frame count
frame_count = 0
if scene_list:
frame_count = scene_list[-1][1].get_frames()
# Convert scenes to result format
scenes = []
for i, (start, end) in enumerate(scene_list):
scene = {
"scene_number": i + 1,
"start_frame": start.get_frames(),
"end_frame": end.get_frames() - 1, # end is exclusive
"start_time": start.get_seconds(),
"end_time": end.get_seconds() - (1.0 / fps) if fps > 0 else 0,
}
scenes.append(scene)
# Always ensure at least 1 scene
if not scenes and total_frames > 0:
scenes = [{
"scene_number": 1,
"start_frame": 0,
"end_frame": total_frames - 1,
"start_time": 0.0,
"end_time": duration,
}]
if publisher:
publisher.progress("cut", i + 1, len(scene_list), f"Scene {i + 1}")
publisher.info("cut", "No scene changes detected, using whole video as single scene")
result = {"frame_count": frame_count, "fps": fps, "scenes": scenes}
result = {
"frame_count": total_frames,
"fps": fps,
"scenes": scenes,
}
with open(output_path, "w") as f:
json.dump(result, f, indent=2)
+51 -39
View File
@@ -14,13 +14,9 @@ from sklearn.cluster import AgglomerativeClustering
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
try:
from deepface import DeepFace
HAS_DEEPFACE = True
except ImportError:
print("❌ DeepFace not found. Run: pip install deepface")
sys.exit(1)
# Use FaceNet embeddings from face.json instead of DeepFace
HAS_DEEPFACE = False
print("[FACE_CLUSTER] Using FaceNet embeddings from face.json (DeepFace not required)")
# 設定
UUID = os.getenv("UUID", "quick_preview")
@@ -104,53 +100,69 @@ def main():
print("❌ No frames in JSON.")
return
cap = cv2.VideoCapture(VIDEO_PATH)
# Get embeddings from Qdrant
print(f"[FACE_CLUSTER] Loading embeddings from Qdrant for {UUID}...")
try:
import requests
qdrant_url = "http://localhost:6333"
collection = "_faces"
# Query all embeddings for this file_uuid
response = requests.post(
f"{qdrant_url}/collections/{collection}/points/scroll",
json={
"filter": {
"must": [
{"key": "file_uuid", "match": {"value": UUID}}
]
},
"limit": 10000,
"with_vector": True
}
)
if response.status_code == 200:
result = response.json()
points = result.get("result", {}).get("points", [])
print(f"[FACE_CLUSTER] Loaded {len(points)} embeddings from Qdrant")
# Build face_id -> embedding map
embedding_map = {}
for point in points:
face_id = point.get("payload", {}).get("face_id")
vector = point.get("vector")
if face_id and vector:
embedding_map[face_id] = vector
else:
print(f"[FACE_CLUSTER] Qdrant query failed: {response.status_code}")
embedding_map = {}
except Exception as e:
print(f"[FACE_CLUSTER] Failed to load embeddings from Qdrant: {e}")
embedding_map = {}
# Use embeddings from Qdrant or face.json
embeddings = []
face_refs = []
print(f"🔍 Extracting face embeddings from {UUID}...")
print(f"🔍 Collecting face embeddings for {UUID}...")
for frame_idx, frame_obj in enumerate(frames_list):
ts = frame_obj.get("timestamp")
faces = frame_obj.get("faces", [])
if not faces:
continue
if ts is not None:
cap.set(cv2.CAP_PROP_POS_MSEC, ts * 1000)
ret, frame = cap.read()
if not ret:
continue
for face_idx, face in enumerate(faces):
x, y, w, h = face["x"], face["y"], face["width"], face["height"]
margin = 5
crop = frame[
max(0, y - margin) : y + h + margin, max(0, x - margin) : x + w + margin
]
if crop is None or crop.size == 0:
continue
try:
res = DeepFace.represent(
img_path=crop, model_name="ArcFace", enforce_detection=False
)
if res and "embedding" in res[0]:
embeddings.append(res[0]["embedding"])
face_refs.append({"frame_idx": frame_idx, "face_idx": face_idx})
except Exception:
pass
cap.release()
face_id = face.get("face_id")
if face_id and face_id in embedding_map:
embeddings.append(embedding_map[face_id])
face_refs.append({"frame_idx": frame_idx, "face_idx": face_idx, "face_id": face_id})
if not embeddings:
print("❌ No embeddings extracted.")
print("❌ No embeddings found in Qdrant.")
return
embeddings = np.array(embeddings)
print(f"Extracted {len(embeddings)} face embeddings.")
print(f"Collected {len(embeddings)} face embeddings from Qdrant.")
# 2. 聚類
print(f"🧠 Clustering {len(embeddings)} faces...")
+25 -2
View File
@@ -35,7 +35,7 @@ from redis_publisher import RedisPublisher
from qdrant_faces import push_face_embeddings_batch
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
SWIFT_BIN = os.path.join(SCRIPT_DIR, "swift_processors", ".build", "debug", "swift_face_pose")
SWIFT_BIN = os.path.join(SCRIPT_DIR, "swift_processors", ".build", "release", "swift_face_pose")
FACENET_PATH = os.path.join(SCRIPT_DIR, "..", "models", "facenet512.mlpackage")
# Pose angle classification from roll/yaw
@@ -84,7 +84,12 @@ class FaceProcessorVision:
self.total_frames = int(self.video.get(cv2.CAP_PROP_FRAME_COUNT))
self.width = int(self.video.get(cv2.CAP_PROP_FRAME_WIDTH))
self.height = int(self.video.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Calculate 8Hz sample interval based on FPS
self.sample_interval = max(1, round(self.fps / 8))
print(f"[FACE_V2] Video: {self.width}x{self.height}, {self.fps:.1f}fps, {self.total_frames}f")
print(f"[FACE_V2] 8Hz sample interval: {self.fps:.1f}/8 = {self.sample_interval}")
def extract_face_embedding(self, face_img: np.ndarray) -> Optional[list]:
"""Run CoreML FaceNet on cropped face"""
@@ -126,11 +131,15 @@ class FaceProcessorVision:
output_basename = os.path.basename(self.output_path)
pose_basename = output_basename.replace("face", "pose")
swift_pose_out = os.path.join(output_dir, pose_basename)
# Appearance output: same directory, but replace "face" with "appearance" in filename
appearance_basename = output_basename.replace("face", "appearance")
swift_appearance_out = os.path.join(output_dir, appearance_basename)
cmd = [
SWIFT_BIN,
self.video_path,
swift_face_out,
swift_pose_out,
swift_appearance_out,
"--sample-interval", str(self.sample_interval),
]
if self.uuid:
@@ -286,17 +295,28 @@ class FaceProcessorVision:
# Convert dict frames to list for Rust FaceResult format
frames_list = []
total_faces = 0
for fnum_str, fdata in sorted(face_data["frames"].items(), key=lambda x: int(x[0])):
faces = fdata["faces"]
total_faces += len(faces)
frames_list.append({
"frame": int(fnum_str),
"timestamp": fdata["time_seconds"],
"faces": fdata["faces"],
"faces": faces,
})
# Determine status based on face count
if total_faces > 0:
status = "has_faces"
else:
status = "no_faces"
output = {
"status": status,
"frame_count": len(frames_list),
"fps": self.fps,
"frames": frames_list,
"total_faces": total_faces,
}
with open(self.output_path, "w") as f:
@@ -339,6 +359,9 @@ def main():
args.uuid, args.sample_interval, publisher
)
# Open video to get FPS and calculate sample_interval
processor.open_video()
# Step 1: Vision detection (bbox + pose via ANE)
try:
detection = processor.process_with_swift()
-334
View File
@@ -1,334 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
Fast Face Clustering Processor (Linear Scan)
職責:針對長片優化,使用線性讀取取代隨機跳轉,大幅提升速度。
"""
import cv2
import json
import numpy as np
import os
import sys
import psycopg2
from collections import defaultdict
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
try:
from deepface import DeepFace
HAS_DEEPFACE = True
except ImportError:
print("❌ DeepFace not found.")
sys.exit(1)
from sklearn.cluster import AgglomerativeClustering
# 設定
UUID = os.getenv("UUID", "384b0ff44aaaa1f1")
OUTPUT_DIR = os.getenv("MOMENTRY_OUTPUT_DIR", "./output")
VIDEO_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.mp4")
FACE_JSON_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.face.json")
OUTPUT_JSON_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.face_clustered.json")
ASRX_JSON_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.asrx.json")
DB_URL = os.getenv("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")
def _frame_index(frame_obj):
    """Return the frame number stored in *frame_obj*, or None when absent.

    Frame 0 is a valid index, so the keys are tested against None instead of
    being chained with ``or`` (which would treat 0 as missing).
    """
    idx = frame_obj.get("frame")
    if idx is None:
        idx = frame_obj.get("frame_number")
    return None if idx is None else int(idx)


def _scan_video_for_embeddings(cap, faces_map):
    """Read *cap* sequentially and embed every face listed in *faces_map*.

    Returns ``(embeddings, embedded_faces)``: two parallel lists holding the
    ArcFace embedding and the face dict (the very object taken from
    *faces_map*) it was computed from.
    """
    embeddings = []
    embedded_faces = []
    failures = 0
    processed_frames = 0
    current_frame = 0
    margin = 5
    total_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Only frames that carry detections need any work; the rest are
        # merely decoded so that the frame counter stays aligned.
        if current_frame in faces_map:
            for face in faces_map[current_frame]:
                try:
                    x, y, w, h = face["x"], face["y"], face["width"], face["height"]
                    crop = frame[
                        max(0, y - margin) : y + h + margin,
                        max(0, x - margin) : x + w + margin,
                    ]
                    if crop is not None and crop.size > 0:
                        res = DeepFace.represent(
                            img_path=crop, model_name="ArcFace", enforce_detection=False
                        )
                        if res and "embedding" in res[0]:
                            embeddings.append(res[0]["embedding"])
                            embedded_faces.append(face)
                except Exception as exc:
                    # Best effort: one bad crop must not abort the scan, but
                    # the failure is reported instead of silently dropped.
                    failures += 1
                    if failures <= 5:
                        print(f" ⚠️ Frame {current_frame}: embedding failed: {exc}")

            processed_frames += 1
            if processed_frames % 500 == 0:
                # The container may report 0 frames; avoid dividing by it.
                pct = (
                    (current_frame / total_video_frames) * 100
                    if total_video_frames > 0
                    else 0.0
                )
                print(
                    f" 📊 Progress: Frame {current_frame}/{total_video_frames} ({pct:.1f}%) | Extracted: {len(embeddings)} embeddings"
                )

        current_frame += 1

    if failures:
        print(f" ⚠️ {failures} face crops could not be embedded.")
    return embeddings, embedded_faces


def _assign_clusters(embeddings, sample_size=5000, batch_size=10000):
    """Cluster *embeddings* and return ``(labels, n_clusters)``.

    Agglomerative clustering is run on a random sample of at most
    *sample_size* embeddings; every embedding is then assigned to the nearest
    cluster centroid (cosine distance). ``labels[i]`` is the position of that
    centroid, in ``range(n_clusters)``.
    """
    n_faces = len(embeddings)
    if n_faces < 2:
        # AgglomerativeClustering rejects fewer than two samples; a single
        # face trivially forms one cluster.
        return np.zeros(n_faces, dtype=int), n_faces

    if n_faces > sample_size:
        indices = np.random.choice(n_faces, sample_size, replace=False)
        sample_embeddings = embeddings[indices]
    else:
        sample_embeddings = embeddings

    clustering = AgglomerativeClustering(
        n_clusters=None, distance_threshold=0.45, metric="cosine", linkage="average"
    )
    sample_labels = clustering.fit_predict(sample_embeddings)

    # Sorting fixes the centroid order explicitly, so a centroid position can
    # be used directly as the person number.
    unique_labels = sorted(set(sample_labels))
    centroids = np.array(
        [np.mean(sample_embeddings[sample_labels == label], axis=0) for label in unique_labels]
    )

    print(" 🏃 Assigning remaining faces to clusters...")
    from sklearn.metrics.pairwise import cosine_distances

    all_labels = np.zeros(n_faces, dtype=int)
    for i in range(0, n_faces, batch_size):
        batch = embeddings[i : i + batch_size]
        dists = cosine_distances(batch, centroids)
        all_labels[i : i + batch_size] = np.argmin(dists, axis=1)
    return all_labels, len(unique_labels)


def main():
    """Cluster the detected faces of one recording into persons.

    Pipeline: index face.json by frame number, read the video once from start
    to end and embed every listed face crop, cluster the embeddings, write a
    ``person_id`` into each embedded face, save the result to
    OUTPUT_JSON_PATH and finally bind speakers to persons.
    """
    if not os.path.exists(FACE_JSON_PATH):
        print(f"❌ Face JSON not found: {FACE_JSON_PATH}")
        return

    print(f"⚡ 開始執行快速面孔聚類 (Linear Scan Mode) for {UUID}...")

    # 1. Load the detections and index them by frame number.
    with open(FACE_JSON_PATH) as f:
        face_data = json.load(f)

    frames_list = face_data.get("frames", [])
    if not frames_list:
        print("❌ No frames in JSON.")
        return

    faces_map = defaultdict(list)
    print(f"📂 Indexing {len(frames_list)} frames with faces...")
    for frame_obj in frames_list:
        idx = _frame_index(frame_obj)
        if idx is not None:
            faces_map[idx].extend(frame_obj.get("faces", []))

    if not faces_map:
        # Without frame numbers there is nothing to align the scan with.
        print("⚠️ No frame numbers found in JSON. Falling back to timestamp seeking.")
        print(f" Keys: {frames_list[0].keys()}")
        return

    total_faces = sum(len(faces) for faces in faces_map.values())
    print(f"✅ Indexed {len(faces_map)} frames, containing {total_faces} faces.")
    print("🚀 Starting Linear Video Scan...")

    # 2. Linear scan of the video (falls back to a .mov next to the .mp4).
    cap = cv2.VideoCapture(VIDEO_PATH)
    if not cap.isOpened():
        cap.release()
        alt_path = VIDEO_PATH.replace(".mp4", ".mov")
        if not os.path.exists(alt_path):
            print("❌ Video file not found.")
            return
        cap = cv2.VideoCapture(alt_path)

    try:
        embeddings, embedded_faces = _scan_video_for_embeddings(cap, faces_map)
    finally:
        cap.release()

    if not embeddings:
        print("❌ No embeddings extracted.")
        return

    embeddings = np.array(embeddings)
    print(f"✅ Total Embeddings Extracted: {len(embeddings)}")

    # 3. Cluster a sample, then assign every face to the nearest centroid.
    print(f"🧠 Clustering {len(embeddings)} faces...")
    print(" 🚀 Using Sampling Strategy for speed...")
    all_labels, n_clusters = _assign_clusters(embeddings)
    print(f" 👥 Detected {n_clusters} unique persons.")

    # 4. Tag the faces. The dicts in embedded_faces are the same objects that
    # live inside face_data, so tagging them updates the structure to save.
    for face, label in zip(embedded_faces, all_labels):
        face["person_id"] = f"Person_{int(label)}"
    count = len(embedded_faces)
    print(f" ✅ Tagged {count} faces with Person ID.")

    # 5. Persist the tagged detections.
    with open(OUTPUT_JSON_PATH, "w", encoding="utf-8") as f:
        json.dump(face_data, f, indent=2, ensure_ascii=False)
    print(f"✅ Saved clustered data to {OUTPUT_JSON_PATH}")

    # 6. Bind speakers to the persons just created.
    auto_bind_speakers()
def auto_bind_speakers():
    """Bind each ASRX speaker to the person most often on screen while they talk.

    For every speech segment the person with the most tagged faces inside the
    segment's time range receives one vote; the person with the most votes
    for a speaker is stored as that speaker's binding in ``identity_bindings``
    (creating the ``talents`` row when it does not exist yet).

    Segment bounds are read from ``start_time``/``end_time`` and, for older
    files, from ``start``/``end``; segments without bounds are skipped.
    Database errors are reported and otherwise ignored.
    """
    if not os.path.exists(OUTPUT_JSON_PATH) or not os.path.exists(ASRX_JSON_PATH):
        print("⚠️ Missing data for speaker binding.")
        return

    with open(OUTPUT_JSON_PATH) as f:
        face_clustered = json.load(f)
    with open(ASRX_JSON_PATH) as f:
        asrx_data = json.load(f)

    print("🔗 Auto-binding Speakers to Persons...")

    # One entry per tagged face: when it was seen and who it is.
    face_spans = []
    for frame_obj in face_clustered.get("frames", []):
        ts = frame_obj.get("timestamp")
        if ts is None:
            continue
        for face in frame_obj.get("faces", []):
            person_id = face.get("person_id")
            if person_id:
                face_spans.append({"ts": ts, "person_id": person_id})

    # speaker -> person -> number of segments won by that person
    speaker_person_counts = {}
    for seg in asrx_data.get("segments", []):
        speaker = seg.get("speaker_id")
        start = seg.get("start_time", seg.get("start"))
        end = seg.get("end_time", seg.get("end"))
        # Comparing None with a timestamp raises, so unbounded segments are skipped.
        if not speaker or start is None or end is None:
            continue

        person_counts = {}
        for span in face_spans:
            if start <= span["ts"] <= end:
                pid = span["person_id"]
                person_counts[pid] = person_counts.get(pid, 0) + 1
        if not person_counts:
            continue

        best_person = max(person_counts, key=person_counts.get)
        votes = speaker_person_counts.setdefault(speaker, {})
        votes[best_person] = votes.get(best_person, 0) + 1

    try:
        conn = psycopg2.connect(DB_URL)
        try:
            cur = conn.cursor()
            try:
                for speaker, persons in speaker_person_counts.items():
                    best_person = max(persons, key=persons.get)
                    print(
                        f" 🎤 {speaker} is likely {best_person} ({persons[best_person]} votes)"
                    )

                    cur.execute("SELECT id FROM talents WHERE real_name = %s", (best_person,))
                    row = cur.fetchone()
                    if row:
                        talent_id = row[0]
                    else:
                        cur.execute(
                            "INSERT INTO talents (real_name) VALUES (%s) RETURNING id",
                            (best_person,),
                        )
                        talent_id = cur.fetchone()[0]
                        print(f" ✨ Created Talent #{talent_id} ({best_person})")

                    cur.execute(
                        """
                        INSERT INTO identity_bindings (talent_id, binding_type, binding_value, source, confidence)
                        VALUES (%s, 'speaker', %s, 'auto_cluster', 0.8)
                        ON CONFLICT (binding_type, binding_value) DO UPDATE SET talent_id = EXCLUDED.talent_id
                        """,
                        (talent_id, speaker),
                    )
                    print(f" ✅ Bound {speaker} -> {best_person}")
                conn.commit()
            finally:
                cur.close()
        finally:
            # Closing without a commit discards a half-applied batch.
            conn.close()
    except Exception as e:
        print(f" ❌ DB Error: {e}")
if __name__ == "__main__":
main()
+1
View File
@@ -0,0 +1 @@
face_clustering_processor.py
+175 -1
View File
@@ -33,7 +33,54 @@ def process_pose(
uuid: str = "",
sample_interval: int = 3, # Changed from 30 to match Face
publisher: RedisPublisher = None,
target_frames: list = None,
) -> dict:
# Check if pose.json or pose.json.tmp already exists (from swift_face_pose)
# executor.rs renames output to .json.tmp before running Python script
tmp_path = output_path.replace('.json', '.json.tmp')
source_path = None
if os.path.exists(output_path):
source_path = output_path
print(f"[Pose] Output exists from swift_face_pose: {output_path}", file=sys.stderr)
elif os.path.exists(tmp_path):
source_path = tmp_path
print(f"[Pose] Temp output exists from swift_face_pose: {tmp_path}", file=sys.stderr)
if source_path:
with open(source_path) as f:
data = json.load(f)
detected_frames = len(data.get('frames', []))
print(f"[Pose] Loaded {detected_frames} detected frames", file=sys.stderr)
# When target_frames is provided (8Hz sampling), skip interpolation
# Swift already outputs at sample_interval=3, matching 8Hz for 24fps
if target_frames is not None:
print(f"[Pose] 8Hz mode: returning {detected_frames} frames without interpolation", file=sys.stderr)
if publisher:
publisher.progress("pose", 100, 100, f"{detected_frames} frames (8Hz, no interpolation)")
return data
# Interpolate keypoints for all frames
interpolated_data = interpolate_pose(data, video_path)
# Write interpolated output
with open(output_path, 'w') as f:
json.dump(interpolated_data, f)
# Delete .json.tmp file so executor.rs won't restore it
if os.path.exists(tmp_path):
os.remove(tmp_path)
print(f"[Pose] Deleted temp file: {tmp_path}", file=sys.stderr)
total_frames = len(interpolated_data.get('frames', []))
print(f"[Pose] Interpolated to {total_frames} frames", file=sys.stderr)
if publisher:
publisher.progress("pose", 100, 100, f"Interpolated {total_frames} frames")
return interpolated_data
swift_bin = SWIFT_POSE_PATH
if not os.path.exists(swift_bin):
swift_bin = SWIFT_POSE_ALT
@@ -81,6 +128,126 @@ def process_pose(
return json.load(f)
def _interpolate_person(before, after, frame_num):
    """Build one person entry for *frame_num* from its neighbouring detections.

    *before* / *after* are ``(frame, person)`` tuples (either may be None).
    With two distinct neighbours, keypoints and bbox are blended linearly;
    otherwise the only available detection is reused as is. Returns None
    when the resulting bbox is missing or has no area.
    """
    if before and after and before[0] != after[0]:
        t0, t1 = before[0], after[0]
        t = (frame_num - t0) / (t1 - t0)

        kp_before = before[1].get('keypoints', [])
        kp_after = after[1].get('keypoints', [])
        keypoints = []
        # Keypoints are paired by position; a keypoint missing on one side is
        # paired with itself and therefore stays constant.
        for i in range(max(len(kp_before), len(kp_after))):
            kp0 = kp_before[i] if i < len(kp_before) else kp_after[i]
            kp1 = kp_after[i] if i < len(kp_after) else kp_before[i]
            c0 = kp0.get('confidence', 1.0)
            c1 = kp1.get('confidence', 1.0)
            keypoints.append({
                'name': kp0['name'],
                'x': kp0['x'] + t * (kp1['x'] - kp0['x']),
                'y': kp0['y'] + t * (kp1['y'] - kp0['y']),
                'confidence': c0 + t * (c1 - c0)
            })

        bbox = None
        bbox_before = before[1].get('bbox', {})
        bbox_after = after[1].get('bbox', {})
        if bbox_before and bbox_after:
            bbox = {
                key: int(bbox_before[key] + t * (bbox_after[key] - bbox_before[key]))
                for key in ('x', 'y', 'width', 'height')
            }
    else:
        # Exact hit, or the frame lies outside the detected range.
        source = (before or after)[1]
        keypoints = source.get('keypoints', [])
        bbox = source.get('bbox', {})

    if bbox and bbox.get('width', 0) > 0 and bbox.get('height', 0) > 0:
        return {'keypoints': keypoints, 'bbox': bbox}
    return None


def interpolate_pose(detected_data: dict, video_path: str) -> dict:
    """Interpolate keypoints for all frames between detected frames.

    Persons are matched across frames by their position in each frame's
    ``persons`` list. The result covers every frame of the video (frame count
    taken from *video_path*); frames before the first or after the last
    detection reuse the nearest detection. Only frames with at least one
    person whose bbox has a positive area are emitted.

    Returns *detected_data* unchanged when it has no frames, otherwise a new
    dict with ``frame_count``, ``fps`` and ``frames``.
    """
    detected_frames = detected_data.get('frames', [])
    if not detected_frames:
        # Nothing to interpolate, so the video does not need to be opened.
        return detected_data

    import cv2
    from bisect import bisect_left, bisect_right

    cap = cv2.VideoCapture(video_path)
    try:
        total_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        cap.release()

    # A missing or zero fps would make the timestamp division fail.
    fps = detected_data.get('fps') or 30.0

    detected_frame_nums = sorted({f['frame'] for f in detected_frames})
    if total_video_frames <= 0:
        # Unreadable video: keep the detections instead of returning nothing.
        total_video_frames = detected_frame_nums[-1] + 1

    print(f"[Pose] Interpolating from {len(detected_frame_nums)} detected frames to {total_video_frames} total frames", file=sys.stderr)

    # Group detections per person index (assumes stable person ordering).
    tracks = {}
    for f in detected_frames:
        for i, p in enumerate(f.get('persons', [])):
            tracks.setdefault(i, []).append((f['frame'], p))

    # Sort each track once so that neighbours can be found by binary search.
    track_frames = {}
    for person_id, entries in tracks.items():
        entries.sort(key=lambda entry: entry[0])
        track_frames[person_id] = [fn for fn, _ in entries]

    interpolated_frames = []
    for frame_num in range(total_video_frames):
        persons_in_frame = []
        for person_id, entries in tracks.items():
            frame_nums = track_frames[person_id]
            lo = bisect_right(frame_nums, frame_num) - 1  # last detection <= frame
            hi = bisect_left(frame_nums, frame_num)       # first detection >= frame
            before = entries[lo] if lo >= 0 else None
            after = entries[hi] if hi < len(entries) else None

            person = _interpolate_person(before, after, frame_num)
            if person is not None:
                persons_in_frame.append(person)

        if persons_in_frame:
            interpolated_frames.append({
                'frame': frame_num,
                'timestamp': frame_num / fps,
                'persons': persons_in_frame
            })

    return {
        'frame_count': len(interpolated_frames),
        'fps': fps,
        'frames': interpolated_frames
    }
def _fallback(video_path, output_path, uuid, sample_interval):
"""Fallback to YOLOv8 Pose"""
from ultralytics import YOLO
@@ -135,14 +302,21 @@ if __name__ == "__main__":
parser.add_argument("output_path")
parser.add_argument("--uuid", "-u", default="")
parser.add_argument("--sample-interval", type=int, default=3) # Changed from 30 to match Face
parser.add_argument("--frames", type=str, default=None,
help="Comma-separated frame numbers for 8Hz sampling")
args = parser.parse_args()
target_frames = None
if args.frames:
target_frames = [int(f) for f in args.frames.split(",") if f.strip()]
print(f"[Pose] 8Hz target frames: {len(target_frames)} frames", file=sys.stderr)
publisher = RedisPublisher(args.uuid) if args.uuid else None
if publisher:
publisher.info("pose", "POSE_START")
result = process_pose(args.video_path, args.output_path, args.uuid,
args.sample_interval, publisher)
args.sample_interval, publisher, target_frames)
with open(args.output_path, "w") as f:
json.dump(result, f, indent=2)
print(f"Pose: {len(result.get('frames', []))} frames with poses")
+27 -82
View File
@@ -21,8 +21,6 @@ import json
import argparse
from collections import defaultdict
import numpy as np
import psycopg2
import psycopg2.extras
from datetime import datetime
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
@@ -30,13 +28,8 @@ sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "uti
from qdrant_faces import update_trace_ids
# Config
DB_URL = os.environ.get("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")
SCHEMA = os.environ.get("MOMENTRY_DB_SCHEMA", "dev")
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output_dev")
def get_conn():
return psycopg2.connect(DB_URL)
SCHEMA = os.environ.get("DATABASE_SCHEMA", "public")
def merge_traces_within_cuts(face_data: dict, cut_scenes: list) -> dict:
@@ -146,67 +139,17 @@ def run_face_tracker(
def store_traced_faces(file_uuid: str, traced_json_path: str, schema: str = SCHEMA):
"""Insert traced face detections into face_detections table with trace_id"""
conn = get_conn()
cur = conn.cursor()
"""Update Qdrant _faces collection with trace_id after face tracking.
face_detections table is deprecated — trace_id is stored only in Qdrant _faces payload.
"""
with open(traced_json_path) as f:
data = json.load(f)
frames = data.get("frames", {})
total_stored = 0
for frame_num_str, frame_data in sorted(frames.items(), key=lambda x: int(x[0])):
frame_num = int(frame_num_str)
faces = frame_data.get("faces", [])
for face in faces:
trace_id = face.get("trace_id")
if trace_id is None:
continue
x = face.get("x", 0)
y = face.get("y", 0)
w = face.get("width", 0)
h = face.get("height", 0)
confidence = face.get("confidence", 0.0)
face_id = face.get("face_id")
if face_id is None:
face_id = f"face_{trace_id}"
attributes = face.get("attributes")
bbox = json.dumps({"x": x, "y": y, "width": w, "height": h})
try:
cur.execute(
f"""
UPDATE {schema}.face_detections
SET trace_id = %s, face_id = %s
WHERE file_uuid = %s AND frame_number = %s
AND x = %s AND y = %s AND width = %s AND height = %s
""",
(
trace_id,
face_id,
file_uuid,
frame_num,
x,
y,
w,
h,
),
)
if cur.rowcount > 0:
total_stored += 1
except Exception as e:
print(f"[TRACE] Error storing face at frame {frame_num}: {e}")
conn.rollback()
continue
conn.commit()
# Build trace_mapping for Qdrant update
trace_mapping = {} # {frame: {bbox_key: trace_id}}
# Build trace_mapping for Qdrant update: {frame: {bbox_key: trace_id}}
trace_mapping = {}
for frame_num_str, frame_data in sorted(frames.items(), key=lambda x: int(x[0])):
frame_num = int(frame_num_str)
trace_mapping[frame_num] = {}
@@ -224,22 +167,26 @@ def store_traced_faces(file_uuid: str, traced_json_path: str, schema: str = SCHE
print(f"[TRACE] Warning: Qdrant trace_id update failed: {e}")
qdrant_updated = 0
# Log trace summary
cur.execute(
f"SELECT COUNT(DISTINCT trace_id) FROM {schema}.face_detections WHERE file_uuid = %s AND trace_id IS NOT NULL",
(file_uuid,),
)
db_trace_count = cur.fetchone()[0]
# Count unique traces from Qdrant
try:
from qdrant_faces import get_file_faces
points = get_file_faces(file_uuid)
trace_ids = set()
for p in points:
tid = p.get("payload", {}).get("trace_id")
if tid is not None and tid > 0:
trace_ids.add(tid)
qdrant_trace_count = len(trace_ids)
except Exception as e:
print(f"[TRACE] Warning: Qdrant trace count failed: {e}")
qdrant_trace_count = 0
cur.close()
conn.close()
print(
f"[TRACE] Stored {total_stored} face detections, {db_trace_count} unique traces in DB"
total_faces = sum(
1 for fd in frames.values() for f in fd.get("faces", []) if f.get("trace_id") is not None
)
if qdrant_updated > 0:
print(f"[TRACE] Updated {qdrant_updated} Qdrant points with trace_id")
return total_stored, db_trace_count
print(f"[TRACE] Updated {qdrant_updated} Qdrant points with trace_id, {qdrant_trace_count} unique traces")
return total_faces, qdrant_trace_count
def main():
@@ -248,8 +195,6 @@ def main():
parser.add_argument("--face-json", help="Path to face.json (default: auto-detect)")
parser.add_argument("--schema", default=SCHEMA, help="DB schema name")
parser.add_argument("--uuid", help="UUID for Redis tracking (accepted by executor)")
parser.add_argument(
"--filter-eyes",
@@ -270,8 +215,8 @@ def main():
# Step 1: Run face tracker
run_face_tracker(face_json, traced_json, filter_eyes=args.filter_eyes)
# Step 2: Store in DB with trace_id
total, traces = store_traced_faces(args.file_uuid, traced_json, args.schema)
# Step 2: Store in Qdrant with trace_id
total, traces = store_traced_faces(args.file_uuid, traced_json)
print(f"[TRACE] Done: {total} detections, {traces} traces")
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,409 @@
import Foundation
import Vision
import ArgumentParser
import AVFoundation
/// Swift Face+Pose Processor - one pass, two outputs
/// Runs VNDetectFaceRectanglesRequest, VNDetectFaceLandmarksRequest,
/// and VNDetectHumanBodyPoseRequest on each sampled frame.
/// Uses AVAssetReader sequential read (frame-based), matching cv2 behavior.
///
/// Vision reports normalized coordinates with a bottom-left origin; every
/// coordinate written to the JSON outputs is converted to pixels with a
/// top-left origin.
@main
struct SwiftFacePose: ParsableCommand {
    @Argument(help: "Video file path")
    var inputPath: String

    @Argument(help: "Output JSON path for face detection")
    var faceOutput: String

    @Argument(help: "Output JSON path for pose detection")
    var poseOutput: String

    @Option(name: .long, help: "Sample interval (frames, default=30)")
    var sampleInterval: Int = 30

    @Option(name: .long, help: "UUID for logging")
    var uuid: String = ""

    /// Detections below this confidence are dropped.
    private static let minConfidence = 0.6
    /// Faces narrower or lower than this many pixels are dropped.
    private static let minFaceSize = 20

    /// Body joints to export, paired with the name written to the JSON.
    /// Naming the joints here avoids depending on Vision's internal raw keys.
    private static let bodyJoints: [(joint: VNHumanBodyPoseObservation.JointName, name: String)] = [
        (.nose, "nose"),
        (.leftEye, "left_eye"), (.rightEye, "right_eye"),
        (.leftEar, "left_ear"), (.rightEar, "right_ear"),
        (.neck, "neck"), (.root, "root"),
        (.leftShoulder, "left_shoulder"), (.rightShoulder, "right_shoulder"),
        (.leftElbow, "left_elbow"), (.rightElbow, "right_elbow"),
        (.leftWrist, "left_wrist"), (.rightWrist, "right_wrist"),
        (.leftHip, "left_hip"), (.rightHip, "right_hip"),
        (.leftKnee, "left_knee"), (.rightKnee, "right_knee"),
        (.leftAnkle, "left_ankle"), (.rightAnkle, "right_ankle"),
    ]

    /// Rejects a sample interval below 1, which would trap on the modulo in `run()`.
    func validate() throws {
        guard sampleInterval >= 1 else {
            throw ValidationError("--sample-interval must be at least 1")
        }
    }

    /// Converts Vision image points (bottom-left origin) to `[x, y]` pairs with a top-left origin.
    private static func topLeftPoints(_ points: [CGPoint], imgH: CGFloat) -> [[Double]] {
        return points.map { [Double($0.x), Double(imgH - $0.y)] }
    }

    /// Returns the head orientation of `obs`, or nil when yaw or roll is unavailable.
    private static func headPose(of obs: VNFaceObservation) -> [String: Any]? {
        guard let yaw = obs.yaw?.doubleValue,
              let roll = obs.roll?.doubleValue else { return nil }
        var info: [String: Any] = ["roll": roll, "yaw": yaw]
        if let pitch = obs.pitch?.doubleValue {
            info["pitch"] = pitch
        }
        return info
    }

    /// Builds the bbox/confidence/pose part of a face entry.
    /// Returns nil when the face is smaller than `minFaceSize` in either dimension.
    private static func baseFaceEntry(for obs: VNFaceObservation, confidence: Double,
                                      imgW: CGFloat, imgH: CGFloat) -> [String: Any]? {
        let bb = obs.boundingBox
        let faceW = Int(bb.size.width * imgW)
        let faceH = Int(bb.size.height * imgH)
        if faceW < minFaceSize || faceH < minFaceSize { return nil }
        let faceX = Int(bb.origin.x * imgW)
        // Flip the vertical axis: Vision's origin is the bottom-left corner.
        let faceY = Int((1.0 - bb.origin.y - bb.size.height) * imgH)
        var faceData: [String: Any] = [
            "bbox": ["x": max(0, faceX), "y": max(0, faceY),
                     "width": faceW, "height": faceH],
            "confidence": confidence,
        ]
        if let pose = headPose(of: obs) {
            faceData["pose"] = pose
        }
        return faceData
    }

    /// Builds a face entry (bbox, pose, eye/nose landmarks, lips) from a landmark observation.
    /// Returns nil when the observation fails the confidence or size filter.
    private static func landmarkFace(_ lmObs: VNFaceObservation,
                                     imgW: CGFloat, imgH: CGFloat) -> [String: Any]? {
        let lmConf = Double(lmObs.confidence)
        if lmConf < minConfidence { return nil }
        guard var faceData = baseFaceEntry(for: lmObs, confidence: lmConf,
                                           imgW: imgW, imgH: imgH) else { return nil }
        guard let lms = lmObs.landmarks else { return faceData }

        let imgSize = CGSize(width: imgW, height: imgH)
        var lm: [String: [[Double]]] = [:]
        let regions: [(String, VNFaceLandmarkRegion2D?)] = [
            ("left_eye", lms.leftEye), ("right_eye", lms.rightEye), ("nose", lms.nose),
        ]
        for (name, region) in regions {
            let points = region?.pointsInImage(imageSize: imgSize) ?? []
            if !points.isEmpty {
                lm[name] = topLeftPoints(points, imgH: imgH)
            }
        }
        if !lm.isEmpty {
            faceData["landmarks"] = lm
        }

        let outer = lms.outerLips?.pointsInImage(imageSize: imgSize) ?? []
        let inner = lms.innerLips?.pointsInImage(imageSize: imgSize) ?? []
        if !outer.isEmpty || !inner.isEmpty {
            faceData["lips"] = [
                "outer_lips": topLeftPoints(outer, imgH: imgH),
                "inner_lips": topLeftPoints(inner, imgH: imgH),
            ]
        }
        return faceData
    }

    /// Returns true when `rect` overlaps any landmark observation with IoU above 0.3.
    private static func overlapsLandmarkFace(_ rect: CGRect,
                                             _ landmarkObservations: [VNFaceObservation]) -> Bool {
        for lmObs in landmarkObservations {
            let lBB = lmObs.boundingBox
            let ix = max(rect.origin.x, lBB.origin.x)
            let iy = max(rect.origin.y, lBB.origin.y)
            let iw = min(rect.maxX, lBB.maxX) - ix
            let ih = min(rect.maxY, lBB.maxY) - iy
            if iw <= 0 || ih <= 0 { continue }
            let intersection = iw * ih
            let union = rect.width * rect.height + lBB.width * lBB.height - intersection
            if intersection / union > 0.3 {
                return true
            }
        }
        return false
    }

    /// Builds a pose "person" from the nose and eye landmarks of a face, so that a
    /// frame with face landmarks also yields pose keypoints. The bbox is left empty.
    private static func landmarkPerson(_ lmObs: VNFaceObservation,
                                       imgW: CGFloat, imgH: CGFloat) -> [String: Any]? {
        let lmConf = Double(lmObs.confidence)
        if lmConf < minConfidence { return nil }
        guard let lms = lmObs.landmarks else { return nil }

        let imgSize = CGSize(width: imgW, height: imgH)
        let regions: [(String, VNFaceLandmarkRegion2D?)] = [
            ("nose", lms.nose), ("left_eye", lms.leftEye), ("right_eye", lms.rightEye),
        ]
        var keypoints: [[String: Any]] = []
        for (name, region) in regions {
            guard let points = region?.pointsInImage(imageSize: imgSize) else { continue }
            for pt in points {
                keypoints.append([
                    "name": name,
                    "x": Double(pt.x),
                    "y": Double(imgH - pt.y),
                    "confidence": lmConf,
                ])
            }
        }
        if keypoints.isEmpty { return nil }
        return [
            "keypoints": keypoints,
            "bbox": ["x": 0, "y": 0, "width": 0, "height": 0],
        ]
    }

    /// Builds a pose "person" from a body-pose observation. The bbox encloses all
    /// joints whose confidence exceeds 0.1 and stays all-zero when there are none.
    private static func bodyPerson(_ pose: VNHumanBodyPoseObservation,
                                   imgW: CGFloat, imgH: CGFloat) -> [String: Any] {
        var keypoints: [[String: Any]] = []
        var minX = CGFloat.greatestFiniteMagnitude
        var minY = CGFloat.greatestFiniteMagnitude
        var maxX: CGFloat = 0
        var maxY: CGFloat = 0

        for (joint, name) in bodyJoints {
            guard let point = try? pose.recognizedPoint(joint) else { continue }
            let px = point.location.x * imgW
            let py = imgH - point.location.y * imgH
            keypoints.append([
                "name": name,
                "x": px,
                "y": py,
                "confidence": point.confidence,
            ])
            if point.confidence > 0.1 {
                minX = min(minX, px)
                minY = min(minY, py)
                maxX = max(maxX, px)
                maxY = max(maxY, py)
            }
        }

        var bbox: [String: Any] = ["x": 0, "y": 0, "width": 0, "height": 0]
        if maxX > minX {
            bbox = [
                "x": Int(minX),
                "y": Int(minY),
                "width": Int(maxX - minX),
                "height": Int(maxY - minY),
            ]
        }
        return ["keypoints": keypoints, "bbox": bbox]
    }

    /// Reads the video sequentially, runs the three Vision requests on every
    /// `sampleInterval`-th frame and writes the face and pose JSON files.
    mutating func run() throws {
        let startTime = Date()
        print("[SwiftFacePose] Vision face+pose detection: \(inputPath)")

        let url = URL(fileURLWithPath: inputPath)
        let asset = AVAsset(url: url)
        guard let videoTrack = asset.tracks(withMediaType: .video).first else {
            print("[SwiftFacePose] No video track found")
            return
        }

        let fps = videoTrack.nominalFrameRate
        let duration = CMTimeGetSeconds(asset.duration)
        // The duration can be NaN or infinite; Int(_:) would trap on such values.
        let estimatedFrames = duration * Double(fps)
        let totalFrames = estimatedFrames.isFinite ? Int(estimatedFrames) : 0
        print("[SwiftFacePose] Video: \(Int(videoTrack.naturalSize.width))x\(Int(videoTrack.naturalSize.height)), \(String(format: "%.1f", fps))fps, \(totalFrames) frames, interval=\(sampleInterval)")

        // read sequentially, matching cv2 frame-by-frame behavior
        let reader = try AVAssetReader(asset: asset)
        let outputSettings: [String: Any] = [
            kCVPixelBufferPixelFormatTypeKey as String: kCVPixelFormatType_32BGRA
        ]
        let trackOutput = AVAssetReaderTrackOutput(track: videoTrack, outputSettings: outputSettings)
        trackOutput.alwaysCopiesSampleData = false
        reader.add(trackOutput)
        guard reader.startReading() else {
            print("[SwiftFacePose] Failed to start AVAssetReader: \(reader.error?.localizedDescription ?? "unknown")")
            return
        }

        var faceFrames: [[String: Any]] = []
        var poseFrames: [[String: Any]] = []
        var processedCount = 0
        var failedFrames = 0
        var frameIndex = 0

        while let sampleBuffer = trackOutput.copyNextSampleBuffer() {
            defer { frameIndex += 1 }
            if frameIndex % sampleInterval != 0 {
                continue
            }
            guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else {
                continue
            }

            let imgW = CGFloat(CVPixelBufferGetWidth(pixelBuffer))
            let imgH = CGFloat(CVPixelBufferGetHeight(pixelBuffer))
            // A zero frame rate would give an infinite timestamp, which JSON cannot hold.
            let seconds = fps > 0 ? Double(frameIndex) / Double(fps) : 0

            let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:])
            let faceReq = VNDetectFaceRectanglesRequest()
            let lmReq = VNDetectFaceLandmarksRequest()
            let bodyReq = VNDetectHumanBodyPoseRequest()
            do {
                try handler.perform([faceReq, lmReq, bodyReq])
            } catch {
                // Best effort: skip the frame, but keep count so failures are visible.
                failedFrames += 1
                continue
            }

            // ── Face output ──
            let faceObservations = faceReq.results ?? []
            let landmarkObservations = lmReq.results ?? []
            // True as soon as Vision reported any face, even if all are filtered out below.
            let hasFace = !faceObservations.isEmpty || !landmarkObservations.isEmpty

            var faces: [[String: Any]] = []
            for lmObs in landmarkObservations {
                if let face = Self.landmarkFace(lmObs, imgW: imgW, imgH: imgH) {
                    faces.append(face)
                }
            }
            // Add rectangle-only faces that no landmark observation already covers.
            for faceObs in faceObservations {
                if Self.overlapsLandmarkFace(faceObs.boundingBox, landmarkObservations) { continue }
                let faceConf = Double(faceObs.faceCaptureQuality ?? faceObs.confidence)
                if faceConf < Self.minConfidence { continue }
                if let face = Self.baseFaceEntry(for: faceObs, confidence: faceConf,
                                                 imgW: imgW, imgH: imgH) {
                    faces.append(face)
                }
            }
            if !faces.isEmpty {
                faceFrames.append([
                    "frame": frameIndex,
                    "timestamp": seconds,
                    "faces": faces,
                ])
            }

            // ── Pose output ──
            // Rule: Face ≤ Pose - every face frame must have pose frame
            // Face landmarks (nose, leftEye, rightEye) ARE pose keypoints
            var persons: [[String: Any]] = []
            for lmObs in landmarkObservations {
                if let person = Self.landmarkPerson(lmObs, imgW: imgW, imgH: imgH) {
                    persons.append(person)
                }
            }
            // Also process body pose detections (may add more keypoints)
            for pose in bodyReq.results ?? [] {
                persons.append(Self.bodyPerson(pose, imgW: imgW, imgH: imgH))
            }

            // Rule: Face ≤ Pose - always add pose frame if has face
            if hasFace || !persons.isEmpty {
                poseFrames.append([
                    "frame": frameIndex,
                    "timestamp": seconds,
                    "persons": persons,
                ])
            }

            processedCount += 1
            if processedCount % 100 == 0 {
                let elapsed = Date().timeIntervalSince(startTime)
                // The frame estimate can be 0 or too low; keep the percentage well defined.
                let totalSamples = max(1, totalFrames / sampleInterval)
                let pct = min(100, processedCount * 100 / totalSamples)
                print("[SwiftFacePose] \(faceFrames.count) face frames, \(poseFrames.count) pose frames, \(pct)% complete, \(Int(elapsed))s elapsed")
                fflush(stdout)
            }
        }

        if reader.status == .failed {
            print("[SwiftFacePose] Reader stopped early: \(reader.error?.localizedDescription ?? "unknown")")
        }
        reader.cancelReading()
        if failedFrames > 0 {
            print("[SwiftFacePose] Vision requests failed on \(failedFrames) sampled frames")
        }

        let faceOutputDict: [String: Any] = [
            "frame_count": faceFrames.count,
            "fps": Double(fps),
            "frames": faceFrames,
        ]
        do {
            let faceJson = try JSONSerialization.data(withJSONObject: faceOutputDict, options: [])
            try faceJson.write(to: URL(fileURLWithPath: faceOutput))
            print("[SwiftFacePose] Face output written: \(faceOutput)")
            // Verify file exists
            if FileManager.default.fileExists(atPath: faceOutput) {
                print("[SwiftFacePose] Verified: file exists at \(faceOutput)")
            } else {
                print("[SwiftFacePose] ERROR: file not found after write!")
            }
        } catch {
            print("[SwiftFacePose] ERROR writing face output: \(error)")
        }

        let poseOutputDict: [String: Any] = [
            "frame_count": poseFrames.count,
            "fps": Double(fps),
            "frames": poseFrames,
        ]
        // Propagate serialization errors instead of silently skipping the file.
        let poseJson = try JSONSerialization.data(withJSONObject: poseOutputDict, options: [.prettyPrinted])
        try poseJson.write(to: URL(fileURLWithPath: poseOutput))

        let elapsed = Date().timeIntervalSince(startTime)
        print("[SwiftFacePose] Done: \(faceFrames.count) face frames, \(poseFrames.count) pose frames, \(String(format: "%.1f", elapsed))s")
    }
}