fix: ASRX duplication, TKG edges, trace ingest, and add pipeline progress publishing

- ASRX handler no longer stores duplicate 'asr' pre_chunks
- Pre_chunks storage made idempotent (delete-before-insert)
- Rule 1 + trace_ingest changed to query 'asrx' not 'asr'
- Trace chunks removed (dynamic from TKG/Qdrant)
- TKG scroll_face_points fixed: trace_id >= 1 (not == 1)
- TKG AsrxSegmentEntry: start/end -> start_time/end_time (match ASRX JSON)
- Unregister error handling: log instead of silent discard
- Add publish_pipeline_progress calls at each pipeline stage (processors, rule1, face_trace, identity_agent, TKG, rule2, completion)
2026-07-02 10:43:46 +08:00
parent d791d138f2
commit 3eabd45882
65 changed files with 9477 additions and 3852 deletions
@@ -1,15 +1,17 @@
 #!/opt/homebrew/bin/python3.11
 """
-Appearance Processor - HSV color feature extraction for person tracking
+Appearance Processor - Body part color extraction using pose keypoints

 Input:
  - video_path: source video
-  - pose_json: pose.json with frame bboxes
+  - pose_json: pose.json with keypoints and bbox
  - output_path: output JSON

-Output: appearance.json with HSV histogram per person per frame
+Output: appearance.json with per-person per-frame body part colors

-Depends on pose.json (bbox). Same 0-based frame numbering as face/pose/mediapipe.
+Regions: head, neck, front_upper_body, front_lower_body,
+         back_upper_body, back_lower_body, left_hand, right_hand,
+         left_foot, right_foot
 """

 import sys
@@ -20,82 +22,223 @@ import cv2
 import numpy as np


-def extract_appearance(frame, bbox):
-    x, y, w, h = bbox["x"], bbox["y"], bbox["width"], bbox["height"]
-    if w <= 0 or h <= 0:
-        return None
+def get_kp(keypoints, name):
+    """Look up a pose keypoint by name.
+
+    Returns an ``(x, y, confidence)`` tuple for the first entry whose
+    ``"name"`` equals *name* (confidence defaults to 1.0 when the entry
+    carries none), or ``None`` when no entry matches.
+    """
+    match = next((entry for entry in keypoints if entry.get("name") == name), None)
+    if match is None:
+        return None
+    return (match["x"], match["y"], match.get("confidence", 1.0))

-    x1, y1 = max(0, x), max(0, y)
-    x2 = min(frame.shape[1], x + w)
-    y2 = min(frame.shape[0], y + h)
-    if x2 <= x1 or y2 <= y1:
-        return None

-    person_roi = frame[y1:y2, x1:x2]
-    hsv = cv2.cvtColor(person_roi, cv2.COLOR_BGR2HSV)
+def determine_facing(keypoints):
+    """Classify a person's orientation from nose and shoulder confidences.
+
+    Returns "front" when the nose confidence exceeds 0.5, "back" when both
+    shoulders exceed 0.5 while the nose is missing or below 0.2, "profile"
+    when at least one shoulder exceeds 0.5, and "unknown" otherwise.
+    """
+    nose_kp = get_kp(keypoints, "nose")
+    shoulders = (
+        get_kp(keypoints, "left_shoulder"),
+        get_kp(keypoints, "right_shoulder"),
+    )
+
+    if nose_kp and nose_kp[2] > 0.5:
+        return "front"
+
+    visible_shoulders = len([kp for kp in shoulders if kp and kp[2] > 0.5])
+    nose_absent = (not nose_kp) or nose_kp[2] < 0.2
+
+    if visible_shoulders >= 2 and nose_absent:
+        return "back"
+    if visible_shoulders >= 1:
+        return "profile"
+    return "unknown"
+
+
+def extract_color(roi_bgr):
+    """Extract HSV histogram and dominant colors from an ROI"""
+    if roi_bgr is None or roi_bgr.size == 0:
+        return None
+    if roi_bgr.shape[0] < 2 or roi_bgr.shape[1] < 2:
+        return None
+    hsv = cv2.cvtColor(roi_bgr, cv2.COLOR_BGR2HSV)
    pixels = hsv.reshape(-1, 3).astype(np.float32)

-    # HSV histograms
    h_hist = cv2.calcHist([hsv], [0], None, [30], [0, 180]).flatten()
    s_hist = cv2.calcHist([hsv], [1], None, [32], [0, 256]).flatten()
    v_hist = cv2.calcHist([hsv], [2], None, [32], [0, 256]).flatten()
-    h_sum = h_hist.sum() or 1
-    s_sum = s_hist.sum() or 1
-    v_sum = v_hist.sum() or 1
+    hs = h_hist.sum() or 1
+    ss = s_hist.sum() or 1
+    vs = v_hist.sum() or 1

-    # Dominant colors via k-means
    dominant = []
    if len(pixels) >= 5:
        criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
-        _, labels, centers = cv2.kmeans(
-            pixels, 5, None, criteria, 10, cv2.KMEANS_RANDOM_CENTERS
-        )
+        _, labels, centers = cv2.kmeans(pixels, 5, None, criteria, 10, cv2.KMEANS_RANDOM_CENTERS)
        counts = np.bincount(labels.flatten())
        dominant = centers[np.argsort(-counts)[:5]].tolist()
    elif len(pixels) > 0:
        dominant = [pixels.mean(axis=0).tolist()]

-    # Upper / lower body split
-    mid_y = y1 + (y2 - y1) // 2
-
-    def roi_hist(roi):
-        if roi is None or roi.size == 0:
-            return None
-        hsv_r = cv2.cvtColor(roi, cv2.COLOR_BGR2HSV)
-        hh = cv2.calcHist([hsv_r], [0], None, [30], [0, 180]).flatten()
-        sh = cv2.calcHist([hsv_r], [1], None, [32], [0, 256]).flatten()
-        vh = cv2.calcHist([hsv_r], [2], None, [32], [0, 256]).flatten()
-        hs = hh.sum() or 1
-        ss = sh.sum() or 1
-        vs = vh.sum() or 1
-        return [(hh / hs).tolist(), (sh / ss).tolist(), (vh / vs).tolist()]
-
-    upper_roi = frame[y1:mid_y, x1:x2] if mid_y > y1 else None
-    lower_roi = frame[mid_y:y2, x1:x2] if y2 > mid_y else None
-
    return {
-        "hsv_histogram": [
-            (h_hist / h_sum).tolist(),
-            (s_hist / s_sum).tolist(),
-            (v_hist / v_sum).tolist(),
-        ],
+        "hsv_histogram": [(h_hist / hs).tolist(), (s_hist / ss).tolist(), (v_hist / vs).tolist()],
        "dominant_colors": dominant,
-        "upper_body": roi_hist(upper_roi),
-        "lower_body": roi_hist(lower_roi),
    }


+def safe_roi(frame, x, y, w, h):
+    """Crop ``frame`` to the box (x, y, w, h), clamped to the image bounds.
+
+    Returns the sliced region, or None when the box has non-positive size
+    or no part of it overlaps the frame.
+    """
+    if w <= 0 or h <= 0:
+        return None
+
+    frame_h, frame_w = frame.shape[0], frame.shape[1]
+    left, top = max(0, int(x)), max(0, int(y))
+    right, bottom = min(frame_w, int(x + w)), min(frame_h, int(y + h))
+
+    if right > left and bottom > top:
+        return frame[top:bottom, left:right]
+    return None
+
+
+def compute_body_regions(keypoints, face_bbox, frame_shape):
+    """Derive per-body-part bounding boxes for one person.
+
+    Box sizes are multiples of the face bbox width/height; box positions come
+    from pose keypoints, with face-relative fallbacks when a keypoint is
+    missing.
+
+    Args:
+        keypoints: list of keypoint dicts, looked up by name via get_kp().
+        face_bbox: dict with "x", "y", "width" and "height".
+        frame_shape: image shape; only its first two entries are read.
+
+    Returns:
+        dict mapping region name (head, neck, front/back upper and lower
+        body, left/right hand, left/right foot) to an
+        {"x", "y", "width", "height"} dict, already grown by a per-region
+        margin. Boxes are not clamped to the frame here.
+    """
+    # NOTE(review): h and w are unpacked but never used below.
+    h, w = frame_shape[:2]
+
+    fx, fy, fw, fh = face_bbox["x"], face_bbox["y"], face_bbox["width"], face_bbox["height"]
+    face_cx = fx + fw / 2
+
+    # Each lookup yields (x, y, confidence) or None when the keypoint is absent.
+    nose = get_kp(keypoints, "nose")
+    ls = get_kp(keypoints, "left_shoulder")
+    rs = get_kp(keypoints, "right_shoulder")
+    lw = get_kp(keypoints, "left_wrist")
+    rw = get_kp(keypoints, "right_wrist")
+    lh = get_kp(keypoints, "left_hip")
+    rh = get_kp(keypoints, "right_hip")
+    la = get_kp(keypoints, "left_ankle")
+    ra = get_kp(keypoints, "right_ankle")
+
+    # Anchor points. Nose/shoulder/hip keypoints are used whenever present
+    # (no confidence threshold); otherwise positions are estimated from the
+    # face bbox.
+    kp_nose = (nose[0], nose[1]) if nose else (face_cx, fy + fh * 0.5)
+    kp_sh_l = ls[0] if ls else (face_cx - fw * 1.5)
+    kp_sh_r = rs[0] if rs else (face_cx + fw * 1.5)
+    kp_sh_mid_x = (kp_sh_l + kp_sh_r) / 2
+    # Vertical anchors need BOTH sides; a single shoulder/hip falls back.
+    kp_sh_mid_y = ((ls[1] + rs[1]) / 2) if (ls and rs) else (fy + fh + fh * 0.3)
+    kp_hip_y = ((lh[1] + rh[1]) / 2) if (lh and rh) else (kp_sh_mid_y + fw * 2.0)
+    kp_hip_l = lh[0] if lh else (kp_sh_mid_x - fw * 1.2)
+    kp_hip_r = rh[0] if rh else (kp_sh_mid_x + fw * 1.2)
+
+    regions = {}
+
+    # head: centred on the nose, sized 1.6x face width by 1.5x face height
+    head_w = fw * 1.6
+    head_h = fh * 1.5
+    regions["head"] = {
+        "x": kp_nose[0] - head_w / 2,
+        "y": kp_nose[1] - head_h * 0.5,
+        "width": head_w,
+        "height": head_h,
+    }
+
+    # neck: starts 0.4 face-heights below the nose and runs to the shoulder
+    # line, with a minimum height of 0.3 face-heights
+    neck_w = fw * 1.5
+    regions["neck"] = {
+        "x": kp_sh_mid_x - neck_w / 2,
+        "y": kp_nose[1] + fh * 0.4,
+        "width": neck_w,
+        "height": max(kp_sh_mid_y - kp_nose[1] - fh * 0.4, fh * 0.3),
+    }
+
+    # upper body: starts at the shoulder line; at least 3 face-widths wide
+    ub_w = max(abs(kp_sh_r - kp_sh_l) * 1.3, fw * 3.0)
+    ub_h = fh * 3.0
+    regions["front_upper_body"] = {
+        "x": kp_sh_mid_x - ub_w / 2,
+        "y": kp_sh_mid_y,
+        "width": ub_w,
+        "height": ub_h,
+    }
+    # Front and back share the same geometry; each gets its own dict copy.
+    regions["back_upper_body"] = dict(regions["front_upper_body"])
+
+    # lower body: starts at the hip line; width derived from the hip spread
+    # NOTE(review): x is centred on the shoulder midpoint (kp_sh_mid_x), not
+    # on the hip midpoint -- confirm this is intended.
+    lb_w = max(abs(kp_hip_r - kp_hip_l) * 1.3, fw * 3.5)
+    lb_h = fh * 3.0
+    regions["front_lower_body"] = {
+        "x": kp_sh_mid_x - lb_w / 2,
+        "y": kp_hip_y,
+        "width": lb_w,
+        "height": lb_h,
+    }
+    regions["back_lower_body"] = dict(regions["front_lower_body"])
+
+    # hands: face-width squares centred on the wrist when its confidence
+    # exceeds 0.3, otherwise placed beside the shoulder
+    hs = fw * 1.0
+    if lw and lw[2] > 0.3:
+        regions["left_hand"] = {"x": lw[0] - hs / 2, "y": lw[1] - hs / 2, "width": hs, "height": hs}
+    else:
+        regions["left_hand"] = {"x": kp_sh_l - hs, "y": kp_sh_mid_y + fh * 0.5, "width": hs, "height": hs}
+    if rw and rw[2] > 0.3:
+        regions["right_hand"] = {"x": rw[0] - hs / 2, "y": rw[1] - hs / 2, "width": hs, "height": hs}
+    else:
+        regions["right_hand"] = {"x": kp_sh_r, "y": kp_sh_mid_y + fh * 0.5, "width": hs, "height": hs}
+
+    # feet: boxes hang below the ankle when its confidence exceeds 0.3,
+    # otherwise they are estimated 2.5 face-heights below the hip line
+    fs = fw * 1.0
+    if la and la[2] > 0.3:
+        regions["left_foot"] = {"x": la[0] - fs / 2, "y": la[1], "width": fs, "height": fs * 0.75}
+    else:
+        regions["left_foot"] = {"x": kp_sh_mid_x - fw * 1.0, "y": kp_hip_y + fh * 2.5, "width": fs, "height": fs * 0.75}
+    if ra and ra[2] > 0.3:
+        regions["right_foot"] = {"x": ra[0] - fs / 2, "y": ra[1], "width": fs, "height": fs * 0.75}
+    else:
+        regions["right_foot"] = {"x": kp_sh_mid_x + fw * 1.0 - fs, "y": kp_hip_y + fh * 2.5, "width": fs, "height": fs * 0.75}
+
+    # Grow every box outward by a per-region fraction of its own size
+    expanded = {}
+    margins = {
+        "head": 0.10, "neck": 0.15,
+        "front_upper_body": 0.20, "back_upper_body": 0.20,
+        "front_lower_body": 0.15, "back_lower_body": 0.15,
+        "left_hand": 0.25, "right_hand": 0.25,
+        "left_foot": 0.20, "right_foot": 0.20,
+    }
+    for name, rb in regions.items():
+        m = margins.get(name, 0.15)
+        # int() truncates, so the margin is a whole number of pixels per side
+        dx = int(rb["width"] * m)
+        dy = int(rb["height"] * m)
+        expanded[name] = {
+            "x": rb["x"] - dx,
+            "y": rb["y"] - dy,
+            "width": rb["width"] + dx * 2,
+            "height": rb["height"] + dy * 2,
+        }
+    return expanded
+
+
+def filter_by_facing(regions, facing):
+    """Drop the torso regions on the side turned away from the camera.
+
+    Mutates and returns *regions*: "front" removes the back_* torso entries,
+    "back" removes the front_* torso entries, and any other value leaves the
+    dict untouched.
+    """
+    if facing == "front":
+        dropped = ("back_upper_body", "back_lower_body")
+    elif facing == "back":
+        dropped = ("front_upper_body", "front_lower_body")
+    else:
+        dropped = ()
+
+    for key in dropped:
+        regions.pop(key, None)
+    return regions
+
+
 def main():
    parser = argparse.ArgumentParser(description="Appearance Processor")
-    parser.add_argument("video_path", help="Video file path")
-    parser.add_argument("pose_json", help="Pose JSON path (bbox input)")
-    parser.add_argument("output_path", help="Output JSON path")
+    parser.add_argument("video_path")
+    parser.add_argument("pose_json")
+    parser.add_argument("output_path")
    parser.add_argument("--uuid", "-u", default="")
    args = parser.parse_args()

    with open(args.pose_json) as f:
        pose_data = json.load(f)

+    # Load face.json for anchor bbox (same directory as pose_json)
+    face_path = args.pose_json.replace(".pose.json", ".face.json")
+    face_data = {}
+    if os.path.exists(face_path):
+        with open(face_path) as f:
+            face_data = json.load(f)
+    # Build frame -> face bbox lookup
+    face_by_frame = {}
+    for fframe in face_data.get("frames", []):
+        fn = fframe.get("frame")
+        faces = fframe.get("faces", [])
+        if faces:
+            face_by_frame[fn] = faces[0]  # first face bbox
+
    fps = pose_data.get("fps", 30.0)

    cap = cv2.VideoCapture(args.video_path)
@@ -115,38 +258,58 @@ def main():
        if not ret:
            continue

+        # Get face bbox for this frame
+        face_bbox = face_by_frame.get(frame_num, persons[0].get("bbox", {"x": 0, "y": 0, "width": 0, "height": 0}))
+
        frame_persons = []
        for pid, person in enumerate(persons):
+            keypoints = person.get("keypoints", [])
            bbox = person.get("bbox", {})
-            if bbox.get("width", 0) <= 0 or bbox.get("height", 0) <= 0:
+            if not keypoints:
                continue
-            appearance = extract_appearance(frame, bbox)
-            if appearance is None:
-                continue
-            frame_persons.append(
-                {
-                    "person_id": pid,
-                    "bbox": bbox,
-                    **appearance,
-                }
-            )
+
+            facing = determine_facing(keypoints)
+            all_regions = compute_body_regions(keypoints, face_bbox, frame.shape)
+            regions = filter_by_facing(all_regions, facing)
+
+            body_parts = []
+            for name, rb in regions.items():
+                roi = safe_roi(frame, rb["x"], rb["y"], rb["width"], rb["height"])
+                color = extract_color(roi)
+                if color is None:
+                    continue
+                body_parts.append({
+                    "name": name,
+                    "bbox": rb,
+                    "hsv_histogram": color["hsv_histogram"],
+                    "dominant_colors": color["dominant_colors"],
+                })
+
+            # Full bbox reference colors
+            full = None
+            if bbox.get("width", 0) > 0 and bbox.get("height", 0) > 0:
+                full_roi = safe_roi(frame, bbox["x"], bbox["y"], bbox["width"], bbox["height"])
+                full = extract_color(full_roi)
+
+            frame_persons.append({
+                "person_id": pid,
+                "bbox": bbox,
+                "facing": facing,
+                "body_parts": body_parts,
+                "dominant_colors": full["dominant_colors"] if full else [],
+                "hsv_histogram": full["hsv_histogram"] if full else [[], [], []],
+            })

        if frame_persons:
-            frames_out.append(
-                {
-                    "frame": frame_num,
-                    "timestamp": pose_frame.get("timestamp", frame_num / fps),
-                    "persons": frame_persons,
-                }
-            )
+            frames_out.append({
+                "frame": frame_num,
+                "timestamp": pose_frame.get("timestamp", frame_num / fps),
+                "persons": frame_persons,
+            })

    cap.release()

-    output = {
-        "frame_count": len(frames_out),
-        "fps": fps,
-        "frames": frames_out,
-    }
+    output = {"frame_count": len(frames_out), "fps": fps, "frames": frames_out}
    with open(args.output_path, "w") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

@@ -201,7 +201,12 @@ def run_asr(video_path, output_path, uuid: str = "", fps: float = None):
    if not has_audio_stream(video_path):
        if publisher:
            publisher.info("asr", "No audio stream detected, skipping transcription")
-        output = {"language": "", "language_probability": 0.0, "segments": []}
+        output = {
+            "status": "no_audio_track",
+            "language": "",
+            "language_probability": 0.0,
+            "segments": []
+        }
        with open(output_path, "w") as f:
            json.dump(output, f, indent=2)
        if publisher:
@@ -336,16 +341,16 @@ def run_asr(video_path, output_path, uuid: str = "", fps: float = None):
                    seg_start = start_t + segment.start
                    seg_end = start_t + segment.end
                    scene_idx = find_scene_idx((seg_start + seg_end) / 2)
-                scene_segments.append({
-                    "start_time": seg_start,
-                    "end_time": seg_end,
-                    "start_frame": int(round(seg_start * fps)),
-                    "end_frame": int(round(seg_end * fps)),
-                    "text": segment.text.strip(),
-                    "scene_number": scene_idx + 1,
-                    "language": seg_language,
-                })
-                total_segments += 1
+                    scene_segments.append({
+                        "start_time": seg_start,
+                        "end_time": seg_end,
+                        "start_frame": int(round(seg_start * fps)),
+                        "end_frame": int(round(seg_end * fps)),
+                        "text": segment.text.strip(),
+                        "scene_number": scene_idx + 1,
+                        "language": seg_language,
+                    })
+                    total_segments += 1

                # 當前 scene 結果寫入 .asr.tmp
                all_segments.extend(scene_segments)
@@ -365,8 +370,18 @@ def run_asr(video_path, output_path, uuid: str = "", fps: float = None):
        try: os.rmdir(temp_dir)
        except: pass

+        # Determine status for cut_scenes branch
+        if total_segments > 0:
+            status = "has_transcript"
+        else:
+            status = "silent_audio"
+        
        info_language = transcript_language or "unknown"
-        print(f"[ASR] Segmented transcription complete: {total_segments} segments", file=sys.stderr)
+        print(f"[ASR] Segmented transcription complete: {total_segments} segments, status={status}", file=sys.stderr)
+        
+        # Write final output with status
+        with open(tmp_path, "w") as f:
+            json.dump({"status": status, "language": info_language, "segments": all_segments}, f)
    else:
        # 無 CUT 資料，直接轉錄（原有流程）
        segments, info = transcribe_with_fallback(model, video_path, publisher)
@@ -386,8 +401,15 @@ def run_asr(video_path, output_path, uuid: str = "", fps: float = None):
            if total_segments % 100 == 0:
                if publisher:
                    publisher.progress("asr", total_segments, 0, f"Segment {total_segments}")
+        
+        # Determine status for direct transcription branch
+        if total_segments > 0:
+            status = "has_transcript"
+        else:
+            status = "silent_audio"
+        
        with open(tmp_path, "w") as f:
-            json.dump({"language": info_language, "segments": all_segments}, f)
+            json.dump({"status": status, "language": info_language, "segments": all_segments}, f)

    if publisher:
        publisher.info("asr", f"ASR_LANGUAGE:{info_language}")
@@ -396,10 +418,10 @@ def run_asr(video_path, output_path, uuid: str = "", fps: float = None):
    os.rename(tmp_path, output_path)

    if publisher:
-        publisher.complete("asr", f"{len(results)} segments")
+        publisher.complete("asr", f"{total_segments} segments")

    sys.stderr.write(
-        f"ASR: Transcription complete, {len(results)} segments written to {output_path}\n"
+        f"ASR: Transcription complete, {total_segments} segments written to {output_path}\n"
    )
    sys.stderr.flush()
    sys.exit(0)
@@ -126,9 +126,17 @@ def _convert_result(result, output_path):
        except Exception:
            pass

+    segment_count = len(result.get("segments", []))
+    if segment_count > 0:
+        status = "has_transcript"
+    else:
+        status = "silent_audio"
+
    output_result = {
+        "status": status,
        "language": result.get("language"),
        "segments": [],
+        "segment_count": segment_count,
        "n_speakers": result.get("n_speakers", 0),
        "speaker_stats": result.get("speaker_stats", {}),
    }
@@ -172,6 +180,37 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
    if publisher:
        publisher.info("asrx", "ASRX_START")

+    # Check for audio stream first
+    tracks = probe_audio_tracks(video_path)
+    if not tracks:
+        if publisher:
+            publisher.info("asrx", "No audio stream detected")
+        output_result = {"status": "no_audio_track", "language": None, "segments": [], "segment_count": 0}
+        _atomic_write(output_path, output_result)
+        if publisher:
+            publisher.complete("asrx", "0 segments (no audio)")
+        print("[ASRX] No audio stream, skipping", file=sys.stderr)
+        return output_result
+
+    # Check if ASR already determined no audio/silent - skip processing
+    asr_path = output_path.replace(".asrx.json", ".asr.json")
+    if os.path.exists(asr_path):
+        try:
+            with open(asr_path) as f:
+                asr_data = json.load(f)
+            asr_status = asr_data.get("status", "")
+            if asr_status in ("no_audio_track", "silent_audio"):
+                if publisher:
+                    publisher.info("asrx", f"ASR status={asr_status}, skipping ASRX processing")
+                output_result = {"status": asr_status, "language": asr_data.get("language"), "segments": [], "segment_count": 0}
+                _atomic_write(output_path, output_result)
+                if publisher:
+                    publisher.complete("asrx", f"0 segments (ASR: {asr_status})")
+                print(f"[ASRX] ASR status={asr_status}, skipping", file=sys.stderr)
+                return output_result
+        except Exception as e:
+            print(f"[ASRX] Failed to read ASR output: {e}", file=sys.stderr)
+
    checkpoint_path = output_path + ".stage1.json"

    # ── Phase 2: Resume from checkpoint (Steps 4-7 only) ──
@@ -189,7 +228,7 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
            if "error" in result:
                if publisher:
                    publisher.error("asrx", result["error"])
-                output_result = {"language": None, "segments": []}
+                output_result = {"status": "silent_audio", "language": None, "segments": [], "segment_count": 0}
                _atomic_write(output_path, output_result)
                if publisher:
                    publisher.complete("asrx", "0 segments")
@@ -225,7 +264,7 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
                publisher.error("asrx", str(e))
            import traceback
            traceback.print_exc()
-            output_result = {"language": None, "segments": []}
+            output_result = {"status": "silent_audio", "language": None, "segments": [], "segment_count": 0}
            _atomic_write(output_path, output_result)
            if publisher:
                publisher.complete("asrx", "0 segments")
@@ -289,7 +328,7 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
        if "error" in result:
            if publisher:
                publisher.error("asrx", result["error"])
-            output_result = {"language": None, "segments": []}
+            output_result = {"status": "silent_audio", "language": None, "segments": [], "segment_count": 0}
            _atomic_write(output_path, output_result)
            if publisher:
                publisher.complete("asrx", "0 segments")
@@ -320,7 +359,7 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
        import traceback
        traceback.print_exc()

-        output_result = {"language": None, "segments": []}
+        output_result = {"status": "silent_audio", "language": None, "segments": [], "segment_count": 0}
        _atomic_write(output_path, output_result)
        if publisher:
            publisher.complete("asrx", "0 segments")
@@ -216,19 +216,27 @@ class SelfASRXFixed:
            return {"error": "No speech detected", "segments": []}

        # ── Step 2: VAD scan 每個 rough segment 細切 ──
-        print("\n[Step 2] VAD scan for refined segmentation...")
-        t2 = time.time()
-        refined_segments = []
-        for seg in rough_segments:
-            s = seg["start"]
-            e = seg["end"]
-            sub = self._vad_scan_segment(wav, sample_rate, s, e)
-            if sub:
-                refined_segments.extend(sub)
-            else:
-                refined_segments.append((s, e))
-        print(f"  Refined segments: {len(refined_segments)}")
-        print(f"  Step 2 time: {time.time() - t2:.2f}s")
+        # Skip VAD if using ASR segments (preserve all ASR segments)
+        if asr_segments:
+            print("\n[Step 2] Skipping VAD scan, using ASR segments directly...")
+            t2 = time.time()
+            refined_segments = [(seg["start"], seg["end"]) for seg in rough_segments]
+            print(f"  Refined segments: {len(refined_segments)}")
+            print(f"  Step 2 time: {time.time() - t2:.2f}s")
+        else:
+            print("\n[Step 2] VAD scan for refined segmentation...")
+            t2 = time.time()
+            refined_segments = []
+            for seg in rough_segments:
+                s = seg["start"]
+                e = seg["end"]
+                sub = self._vad_scan_segment(wav, sample_rate, s, e)
+                if sub:
+                    refined_segments.extend(sub)
+                else:
+                    refined_segments.append((s, e))
+            print(f"  Refined segments: {len(refined_segments)}")
+            print(f"  Step 2 time: {time.time() - t2:.2f}s")

        if not refined_segments:
            return {"error": "No segments after VAD scan", "segments": []}
@@ -1,91 +1,152 @@
 #!/opt/homebrew/bin/python3.11
 """
-CUT Processor - Scene Detection
-Uses PySceneDetect for scene detection (local)
+CUT Processor - Scene Detection & Video Quality Check
+Uses ffprobe for video analysis. Always produces at least 1 scene.
 """

-import sys
 import json
 import argparse
 import os
+import subprocess
+import sys

 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 from redis_publisher import RedisPublisher


+def _parse_frame_rate(rate) -> float:
+    """Convert an ffprobe rate string such as "30000/1001" or "25" to fps.
+
+    Returns 0.0 for empty, malformed or zero-denominator values ("0/0")
+    instead of raising, so one bad field cannot discard the whole probe.
+    """
+    if not rate:
+        return 0.0
+    numerator, _, denominator = str(rate).partition("/")
+    try:
+        num = float(numerator)
+        den = float(denominator) if denominator else 1.0
+    except ValueError:
+        return 0.0
+    return num / den if den else 0.0
+
+
+def get_video_info(video_path: str) -> dict:
+    """Probe *video_path* with ffprobe and summarise its first video stream.
+
+    Args:
+        video_path: path handed to ffprobe as a single argv entry.
+
+    Returns:
+        dict with "frame_count", "fps", "duration", "width", "height" and
+        "codec". frame_count/fps come from nb_frames + r_frame_rate when the
+        stream reports nb_frames, otherwise from duration * avg_frame_rate.
+        All fields are zero/empty when ffprobe is unavailable, fails, times
+        out, or reports no video stream.
+    """
+    empty = {"frame_count": 0, "fps": 0.0, "duration": 0, "width": 0, "height": 0, "codec": ""}
+    try:
+        result = subprocess.run(
+            ["ffprobe", "-v", "quiet", "-print_format", "json",
+             "-show_format", "-show_streams", video_path],
+            capture_output=True, text=True, timeout=30,
+        )
+        info = json.loads(result.stdout)
+        for stream in info.get("streams", []):
+            if stream.get("codec_type") != "video":
+                continue
+
+            duration = float(stream.get("duration", 0))
+            described = {
+                "duration": duration,
+                "width": int(stream.get("width", 0)),
+                "height": int(stream.get("height", 0)),
+                "codec": stream.get("codec_name", ""),
+            }
+
+            nb_frames = stream.get("nb_frames")
+            if nb_frames:
+                # Rates are parsed arithmetically: text produced by an
+                # external tool must never be passed to eval().
+                return {
+                    "frame_count": int(nb_frames),
+                    "fps": _parse_frame_rate(stream.get("r_frame_rate", "0/1")),
+                    **described,
+                }
+
+            # No exact frame count: estimate it from duration and average rate.
+            avg_fps = _parse_frame_rate(stream.get("avg_frame_rate", "0/1"))
+            if duration > 0 and avg_fps > 0:
+                return {
+                    "frame_count": int(duration * avg_fps),
+                    "fps": avg_fps,
+                    **described,
+                }
+
+            # A video stream exists but its length cannot be determined.
+            return {**empty, "duration": duration}
+        return dict(empty)
+    except Exception:
+        # Best-effort probe: any failure degrades to "unknown video".
+        return dict(empty)
+
+
+def _build_scenes(cut_times, fps: float, duration: float) -> list:
+    """Turn ascending cut timestamps into contiguous, 1-based scene dicts.
+
+    Requires fps > 0. Cuts that do not advance by at least one frame are
+    skipped, and scene numbers stay consecutive regardless of skips.
+    """
+    scenes = []
+    prev_time = 0.0
+    for t in cut_times:
+        start_frame = round(prev_time * fps)
+        end_frame = round(t * fps)
+        if end_frame > start_frame:
+            scenes.append({
+                # Number by position in the output, not by loop index, so a
+                # skipped cut cannot leave a gap or collide with the tail scene.
+                "scene_number": len(scenes) + 1,
+                "start_frame": start_frame,
+                "end_frame": end_frame - 1,
+                "start_time": prev_time,
+                "end_time": t - (1.0 / fps),
+            })
+        prev_time = t
+
+    # Tail scene: from the last cut to the end of the video.
+    last_frame = round(duration * fps)
+    prev_frame = round(prev_time * fps)
+    if last_frame > prev_frame:
+        scenes.append({
+            "scene_number": len(scenes) + 1,
+            "start_frame": prev_frame,
+            "end_frame": last_frame - 1,
+            "start_time": prev_time,
+            "end_time": duration,
+        })
+    return scenes
+
+
+def detect_scenes_ffmpeg(video_path: str, fps: float, duration: float) -> list:
+    """Detect scene changes with ffprobe's lavfi ``select=gt(scene,0.3)`` filter.
+
+    Args:
+        video_path: video to analyse.
+        fps: frame rate used to convert cut times to frame indices.
+        duration: video length in seconds; bounds the final scene.
+
+    Returns:
+        List of scene dicts (scene_number, start_frame, end_frame,
+        start_time, end_time) with consecutive scene numbers starting at 1.
+        Empty when fps is not positive, when nothing can be derived, or on
+        any failure.
+    """
+    if fps <= 0:
+        # Frame indices cannot be derived; skip the (slow) ffprobe pass.
+        return []
+    try:
+        # NOTE(review): video_path is interpolated into the filter graph
+        # unescaped, so paths containing filtergraph metacharacters
+        # (quotes, ':' or ',') will not parse -- confirm callers' paths.
+        result = subprocess.run(
+            ["ffprobe", "-v", "quiet", "-show_entries", "frame=pts_time",
+             "-of", "default=nk=0",
+             "-f", "lavfi",
+             f"movie={video_path},select='gt(scene\\,0.3)',showinfo",
+             "-show_frames"],
+            capture_output=True, text=True, timeout=300,
+        )
+
+        # Both streams are scanned and both "pts_time=" / "pts_time:" forms
+        # are accepted, so one cut may be reported twice; a set drops the
+        # repeats and sorting restores chronological order.
+        cut_times = set()
+        for line in (result.stderr + "\n" + result.stdout).split("\n"):
+            for prefix in ("pts_time=", "pts_time:"):
+                if prefix not in line:
+                    continue
+                fields = line.split(prefix)[1].split()
+                if not fields:
+                    # Prefix with no value: ignore this line only.
+                    continue
+                try:
+                    cut_times.add(float(fields[0]))
+                except ValueError:
+                    pass
+
+        return _build_scenes(sorted(cut_times), fps, duration)
+    except Exception:
+        # Best-effort detection: the caller falls back to a single scene.
+        return []
+
+
 def process_cut(video_path: str, output_path: str, uuid: str = ""):
-    """Process video for scene detection"""
+    """Process video for scene detection and quality verification"""

    publisher = RedisPublisher(uuid) if uuid else None
    if publisher:
        publisher.info("cut", "CUT_START")

-    try:
-        from scenedetect import VideoManager, SceneManager
-        from scenedetect.detectors import ContentDetector
-    except ImportError:
-        if publisher:
-            publisher.error("cut", "scenedetect not installed")
-        result = {"frame_count": 0, "fps": 0.0, "scenes": []}
-        if publisher:
-            publisher.complete("cut", "0 scenes")
-        with open(output_path, "w") as f:
-            json.dump(result, f, indent=2)
-        return result
+    vinfo = get_video_info(video_path)

    if publisher:
-        publisher.info("cut", "CUT_LOADING_VIDEO")
+        publisher.info("cut", f"fps={vinfo['fps']}, frames={vinfo['frame_count']}, codec={vinfo['codec']}")

-    # Create video manager and scene manager
-    video_manager = VideoManager([video_path])
-    scene_manager = SceneManager()
+    total_frames = vinfo["frame_count"]
+    fps = vinfo["fps"]
+    duration = vinfo["duration"]

-    # Add content detector (detects scene cuts based on frame differences)
-    # threshold: sensitivity (lower = more sensitive, default 30)
-    # min_scene_len: minimum frames per scene (default 15)
-    scene_manager.add_detector(ContentDetector(threshold=30.0, min_scene_len=15))
+    # Try ffmpeg scene detection
+    scenes = detect_scenes_ffmpeg(video_path, fps, duration)

-    # Set downscale factor for faster processing
-    video_manager.set_downscale_factor()
-
-    if publisher:
-        publisher.info("cut", "CUT_DETECTING")
-
-    # Start video manager
-    video_manager.start()
-
-    # Detect scenes
-    scene_manager.detect_scenes(frame_source=video_manager)
-
-    # Get scene list
-    scene_list = scene_manager.get_scene_list()
-
-    # Get frame rate
-    fps = video_manager.get_framerate()
-
-    if publisher:
-        publisher.info("cut", f"fps={fps}")
-
-    # Get total frame count
-    frame_count = 0
-    if scene_list:
-        frame_count = scene_list[-1][1].get_frames()
-
-    # Convert scenes to result format
-    scenes = []
-    for i, (start, end) in enumerate(scene_list):
-        scene = {
-            "scene_number": i + 1,
-            "start_frame": start.get_frames(),
-            "end_frame": end.get_frames() - 1,  # end is exclusive
-            "start_time": start.get_seconds(),
-            "end_time": end.get_seconds() - (1.0 / fps) if fps > 0 else 0,
-        }
-        scenes.append(scene)
+    # Always ensure at least 1 scene
+    if not scenes and total_frames > 0:
+        scenes = [{
+            "scene_number": 1,
+            "start_frame": 0,
+            "end_frame": total_frames - 1,
+            "start_time": 0.0,
+            "end_time": duration,
+        }]
        if publisher:
-            publisher.progress("cut", i + 1, len(scene_list), f"Scene {i + 1}")
+            publisher.info("cut", "No scene changes detected, using whole video as single scene")

-    result = {"frame_count": frame_count, "fps": fps, "scenes": scenes}
+    result = {
+        "frame_count": total_frames,
+        "fps": fps,
+        "scenes": scenes,
+    }

    with open(output_path, "w") as f:
        json.dump(result, f, indent=2)
@@ -14,13 +14,9 @@ from sklearn.cluster import AgglomerativeClustering

 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

-try:
-    from deepface import DeepFace
-
-    HAS_DEEPFACE = True
-except ImportError:
-    print("❌ DeepFace not found. Run: pip install deepface")
-    sys.exit(1)
+# Face embeddings are loaded from Qdrant at runtime (see main()), so DeepFace is not required
+HAS_DEEPFACE = False
+print("[FACE_CLUSTER] Using FaceNet embeddings from face.json (DeepFace not required)")

 # 設定
 UUID = os.getenv("UUID", "quick_preview")
@@ -104,53 +100,69 @@ def main():
        print("❌ No frames in JSON.")
        return

-    cap = cv2.VideoCapture(VIDEO_PATH)
+    # Get embeddings from Qdrant
+    print(f"[FACE_CLUSTER] Loading embeddings from Qdrant for {UUID}...")
+    try:
+        import requests
+        qdrant_url = "http://localhost:6333"
+        collection = "_faces"
+        
+        # Query all embeddings for this file_uuid
+        response = requests.post(
+            f"{qdrant_url}/collections/{collection}/points/scroll",
+            json={
+                "filter": {
+                    "must": [
+                        {"key": "file_uuid", "match": {"value": UUID}}
+                    ]
+                },
+                "limit": 10000,
+                "with_vector": True
+            }
+        )
+        
+        if response.status_code == 200:
+            result = response.json()
+            points = result.get("result", {}).get("points", [])
+            print(f"[FACE_CLUSTER] Loaded {len(points)} embeddings from Qdrant")
+            
+            # Build face_id -> embedding map
+            embedding_map = {}
+            for point in points:
+                face_id = point.get("payload", {}).get("face_id")
+                vector = point.get("vector")
+                if face_id and vector:
+                    embedding_map[face_id] = vector
+        else:
+            print(f"[FACE_CLUSTER] Qdrant query failed: {response.status_code}")
+            embedding_map = {}
+    except Exception as e:
+        print(f"[FACE_CLUSTER] Failed to load embeddings from Qdrant: {e}")
+        embedding_map = {}
+
+    # Keep only the Qdrant embeddings whose face_id appears in the face JSON frames
    embeddings = []
    face_refs = []

-    print(f"🔍 Extracting face embeddings from {UUID}...")
+    print(f"🔍 Collecting face embeddings for {UUID}...")

    for frame_idx, frame_obj in enumerate(frames_list):
-        ts = frame_obj.get("timestamp")
        faces = frame_obj.get("faces", [])
        if not faces:
            continue

-        if ts is not None:
-            cap.set(cv2.CAP_PROP_POS_MSEC, ts * 1000)
-
-        ret, frame = cap.read()
-        if not ret:
-            continue
-
        for face_idx, face in enumerate(faces):
-            x, y, w, h = face["x"], face["y"], face["width"], face["height"]
-            margin = 5
-            crop = frame[
-                max(0, y - margin) : y + h + margin, max(0, x - margin) : x + w + margin
-            ]
-
-            if crop is None or crop.size == 0:
-                continue
-
-            try:
-                res = DeepFace.represent(
-                    img_path=crop, model_name="ArcFace", enforce_detection=False
-                )
-                if res and "embedding" in res[0]:
-                    embeddings.append(res[0]["embedding"])
-                    face_refs.append({"frame_idx": frame_idx, "face_idx": face_idx})
-            except Exception:
-                pass
-
-    cap.release()
+            face_id = face.get("face_id")
+            if face_id and face_id in embedding_map:
+                embeddings.append(embedding_map[face_id])
+                face_refs.append({"frame_idx": frame_idx, "face_idx": face_idx, "face_id": face_id})

    if not embeddings:
-        print("❌ No embeddings extracted.")
+        print("❌ No embeddings found in Qdrant.")
        return

    embeddings = np.array(embeddings)
-    print(f"✅ Extracted {len(embeddings)} face embeddings.")
+    print(f"✅ Collected {len(embeddings)} face embeddings from Qdrant.")

    # 2. 聚類
    print(f"🧠 Clustering {len(embeddings)} faces...")
@@ -35,7 +35,7 @@ from redis_publisher import RedisPublisher
 from qdrant_faces import push_face_embeddings_batch

 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
-SWIFT_BIN = os.path.join(SCRIPT_DIR, "swift_processors", ".build", "debug", "swift_face_pose")
+SWIFT_BIN = os.path.join(SCRIPT_DIR, "swift_processors", ".build", "release", "swift_face_pose")
 FACENET_PATH = os.path.join(SCRIPT_DIR, "..", "models", "facenet512.mlpackage")

 # Pose angle classification from roll/yaw
@@ -84,7 +84,12 @@ class FaceProcessorVision:
        self.total_frames = int(self.video.get(cv2.CAP_PROP_FRAME_COUNT))
        self.width = int(self.video.get(cv2.CAP_PROP_FRAME_WIDTH))
        self.height = int(self.video.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        
+        # Calculate 8Hz sample interval based on FPS
+        self.sample_interval = max(1, round(self.fps / 8))
+        
        print(f"[FACE_V2] Video: {self.width}x{self.height}, {self.fps:.1f}fps, {self.total_frames}f")
+        print(f"[FACE_V2] 8Hz sample interval: {self.fps:.1f}/8 = {self.sample_interval}")

    def extract_face_embedding(self, face_img: np.ndarray) -> Optional[list]:
        """Run CoreML FaceNet on cropped face"""
@@ -126,11 +131,15 @@ class FaceProcessorVision:
        output_basename = os.path.basename(self.output_path)
        pose_basename = output_basename.replace("face", "pose")
        swift_pose_out = os.path.join(output_dir, pose_basename)
+        # Appearance output: same directory, but replace "face" with "appearance" in filename
+        appearance_basename = output_basename.replace("face", "appearance")
+        swift_appearance_out = os.path.join(output_dir, appearance_basename)
        cmd = [
            SWIFT_BIN,
            self.video_path,
            swift_face_out,
            swift_pose_out,
+            swift_appearance_out,
            "--sample-interval", str(self.sample_interval),
        ]
        if self.uuid:
@@ -286,17 +295,28 @@ class FaceProcessorVision:

        # Convert dict frames to list for Rust FaceResult format
        frames_list = []
+        total_faces = 0
        for fnum_str, fdata in sorted(face_data["frames"].items(), key=lambda x: int(x[0])):
+            faces = fdata["faces"]
+            total_faces += len(faces)
            frames_list.append({
                "frame": int(fnum_str),
                "timestamp": fdata["time_seconds"],
-                "faces": fdata["faces"],
+                "faces": faces,
            })

+        # Determine status based on face count
+        if total_faces > 0:
+            status = "has_faces"
+        else:
+            status = "no_faces"
+
        output = {
+            "status": status,
            "frame_count": len(frames_list),
            "fps": self.fps,
            "frames": frames_list,
+            "total_faces": total_faces,
        }

        with open(self.output_path, "w") as f:
@@ -339,6 +359,9 @@ def main():
        args.uuid, args.sample_interval, publisher
    )

+    # Open video to get FPS and calculate sample_interval
+    processor.open_video()
+
    # Step 1: Vision detection (bbox + pose via ANE)
    try:
        detection = processor.process_with_swift()
@@ -1,334 +0,0 @@
-#!/opt/homebrew/bin/python3.11
-"""
-Fast Face Clustering Processor (Linear Scan)
-職責：針對長片優化，使用線性讀取取代隨機跳轉，大幅提升速度。
-"""
-
-import cv2
-import json
-import numpy as np
-import os
-import sys
-import psycopg2
-from collections import defaultdict
-
-sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-
-try:
-    from deepface import DeepFace
-
-    HAS_DEEPFACE = True
-except ImportError:
-    print("❌ DeepFace not found.")
-    sys.exit(1)
-
-from sklearn.cluster import AgglomerativeClustering
-
-# 設定
-UUID = os.getenv("UUID", "384b0ff44aaaa1f1")
-OUTPUT_DIR = os.getenv("MOMENTRY_OUTPUT_DIR", "./output")
-VIDEO_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.mp4")
-FACE_JSON_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.face.json")
-OUTPUT_JSON_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.face_clustered.json")
-ASRX_JSON_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.asrx.json")
-DB_URL = os.getenv("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")
-
-
-def main():
-    if not os.path.exists(FACE_JSON_PATH):
-        print(f"❌ Face JSON not found: {FACE_JSON_PATH}")
-        return
-
-    print(f"⚡ 開始執行快速面孔聚類 (Linear Scan Mode) for {UUID}...")
-
-    # 1. 載入並建立索引 (以 frame number 為 key)
-    with open(FACE_JSON_PATH) as f:
-        face_data = json.load(f)
-
-    frames_list = face_data.get("frames", [])
-    if not frames_list:
-        print("❌ No frames in JSON.")
-        return
-
-    # 建立 map: frame_index -> faces
-    # 注意：JSON 中的 frame 是 int，但也許是 float?
-    # face_processor 輸出通常是 int
-    faces_map = defaultdict(list)
-
-    # 為了安全，我們也建立 timestamp map 以防萬一，但優先使用 frame number
-    print(f"📂 Indexing {len(frames_list)} frames with faces...")
-    for frame_obj in frames_list:
-        # JSON 中可能是 'frame' (int) 或 'frame_number'
-        idx = frame_obj.get("frame") or frame_obj.get("frame_number")
-        if idx is not None:
-            faces_map[int(idx)].extend(frame_obj.get("faces", []))
-
-    # 如果沒有 frame number 字段，我們只能依靠 timestamp (比較慢)
-    if not faces_map:
-        print("⚠️ No frame numbers found in JSON. Falling back to timestamp seeking.")
-        # 這裡我們可以呼叫舊的邏輯，但為了簡單，我們假設 face_processor 有寫 frame
-        # 檢查第一個 frame 的 key
-        if frames_list:
-            print(f"   Keys: {frames_list[0].keys()}")
-        return  # 暫時中斷
-
-    total_faces = sum(len(faces) for faces in faces_map.values())
-    print(f"✅ Indexed {len(faces_map)} frames, containing {total_faces} faces.")
-    print("🚀 Starting Linear Video Scan...")
-
-    # 2. 線性掃描
-    video_path = VIDEO_PATH  # 使用區域變數避免 global 問題
-    cap = cv2.VideoCapture(video_path)
-    if not cap.isOpened():
-        # 嘗試找 mov
-        alt_path = video_path.replace(".mp4", ".mov")
-        if os.path.exists(alt_path):
-            video_path = alt_path
-            cap = cv2.VideoCapture(video_path)
-        else:
-            print("❌ Video file not found.")
-            return
-
-    embeddings = []
-    face_refs = []  # 存儲 (frame_index, face_index_in_list)
-
-    # 為了追蹤進度
-    processed_frames = 0
-    current_frame = 0
-
-    # 獲取影片總幀數
-    total_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-
-    while True:
-        ret, frame = cap.read()
-        if not ret:
-            break
-
-        # 檢查這一幀是否有我們需要處理的臉
-        # 使用 round 處理可能的浮點誤差 (雖然 face_processor 應該寫的是 int)
-        # 如果 JSON 的 frame 是 0.0, 1.0...
-        # 這裡我們直接看 current_frame 是否在 faces_map 中
-
-        # 由於 face_processor 可能跳幀，或者時間戳對齊問題
-        # 我們檢查 current_frame 以及 current_frame +/- 1 的容差
-        # 但最好的方式是嚴格匹配 frame number
-
-        if current_frame in faces_map:
-            faces = faces_map[current_frame]
-            for face_idx, face in enumerate(faces):
-                try:
-                    x, y, w, h = face["x"], face["y"], face["width"], face["height"]
-                    margin = 5
-                    crop = frame[
-                        max(0, y - margin) : y + h + margin,
-                        max(0, x - margin) : x + w + margin,
-                    ]
-
-                    if crop is not None and crop.size > 0:
-                        # 使用 Fast Model: VGG-Face 或 OpenFace 比 ArcFace 快，但 ArcFace 準
-                        # 這裡保持 ArcFace 以求準確，但因為是線性讀取，省去了 seek 時間
-                        # 為了速度，我們可以每 2 秒只取 1 幀？
-                        # 不，我們需要標記所有幀。
-                        # DeepFace 提取
-                        res = DeepFace.represent(
-                            img_path=crop, model_name="ArcFace", enforce_detection=False
-                        )
-                        if res and "embedding" in res[0]:
-                            embeddings.append(res[0]["embedding"])
-                            face_refs.append(
-                                {"frame_idx": current_frame, "face_idx": face_idx}
-                            )
-                except Exception:
-                    pass
-
-            processed_frames += 1
-            if processed_frames % 500 == 0:
-                pct = (current_frame / total_video_frames) * 100
-                print(
-                    f"   📊 Progress: Frame {current_frame}/{total_video_frames} ({pct:.1f}%) | Extracted: {len(embeddings)} embeddings"
-                )
-
-        current_frame += 1
-
-    cap.release()
-
-    if not embeddings:
-        print("❌ No embeddings extracted.")
-        return
-
-    embeddings = np.array(embeddings)
-    print(f"✅ Total Embeddings Extracted: {len(embeddings)}")
-
-    # 3. 聚類
-    print(f"🧠 Clustering {len(embeddings)} faces...")
-
-    # 優化：KMeans 或 MiniBatchKMeans 對於大數據集更快
-    # 但 Agglomerative 對於找任意形狀的簇更好。
-    # 25000 個點做層次聚類還是慢。
-    # 我們使用 "Sample -> Cluster -> Assign" 策略
-
-    print("   🚀 Using Sampling Strategy for speed...")
-    sample_size = 5000
-    n_faces = len(embeddings)
-
-    if n_faces > sample_size:
-        indices = np.random.choice(n_faces, sample_size, replace=False)
-        sample_embeddings = embeddings[indices]
-    else:
-        sample_embeddings = embeddings
-        indices = np.arange(n_faces)
-
-    clustering = AgglomerativeClustering(
-        n_clusters=None, distance_threshold=0.45, metric="cosine", linkage="average"
-    )
-    sample_labels = clustering.fit_predict(sample_embeddings)
-
-    # 計算簇中心
-    unique_labels = set(sample_labels)
-    centroids = []
-    for label in unique_labels:
-        mask = sample_labels == label
-        centroids.append(np.mean(sample_embeddings[mask], axis=0))
-    centroids = np.array(centroids)
-
-    # 分配所有數據
-    print("   🏃 Assigning remaining faces to clusters...")
-    from sklearn.metrics.pairwise import cosine_distances
-
-    # 批次計算
-    all_labels = np.zeros(n_faces, dtype=int)
-    batch_size = 10000
-    for i in range(0, n_faces, batch_size):
-        batch = embeddings[i : i + batch_size]
-        dists = cosine_distances(batch, centroids)
-        all_labels[i : i + batch_size] = np.argmin(dists, axis=1)
-
-    print(f"   👥 Detected {len(unique_labels)} unique persons.")
-
-    # 4. 生成標籤
-    label_to_person = {l: f"Person_{i}" for i, l in enumerate(unique_labels)}
-
-    # 5. 寫回 JSON
-    # face_data 是原始結構，我們需要修改它
-    # face_data['frames'] 是一個列表
-    # 我們需要快速找到對應的 frame
-
-    # 建立 map frame_idx -> frame_object reference
-    frame_ref_map = {}
-    for f_obj in face_data.get("frames", []):
-        idx = f_obj.get("frame") or f_obj.get("frame_number")
-        if idx is not None:
-            frame_ref_map[int(idx)] = f_obj
-
-    count = 0
-    for ref, label in zip(face_refs, all_labels):
-        f_idx = ref["frame_idx"]
-        face_idx = ref["face_idx"]  # 這是原始 faces list 中的 index
-
-        person_id = label_to_person[label]
-
-        if f_idx in frame_ref_map:
-            frame_obj = frame_ref_map[f_idx]
-            faces_list = frame_obj.get("faces", [])
-            if face_idx < len(faces_list):
-                faces_list[face_idx]["person_id"] = person_id
-                count += 1
-
-    print(f"   ✅ Tagged {count} faces with Person ID.")
-
-    with open(OUTPUT_JSON_PATH, "w", encoding="utf-8") as f:
-        json.dump(face_data, f, indent=2, ensure_ascii=False)
-    print(f"✅ Saved clustered data to {OUTPUT_JSON_PATH}")
-
-    # 6. 綁定 Speaker
-    auto_bind_speakers()
-
-
-def auto_bind_speakers():
-    if not os.path.exists(OUTPUT_JSON_PATH) or not os.path.exists(ASRX_JSON_PATH):
-        print("⚠️ Missing data for speaker binding.")
-        return
-
-    with open(OUTPUT_JSON_PATH) as f:
-        face_clustered = json.load(f)
-    with open(ASRX_JSON_PATH) as f:
-        asrx_data = json.load(f)
-
-    print("🔗 Auto-binding Speakers to Persons...")
-
-    face_spans = []
-    for frame_obj in face_clustered.get("frames", []):
-        ts = frame_obj.get("timestamp")
-        for face in frame_obj.get("faces", []):
-            person_id = face.get("person_id")
-            if person_id and ts is not None:
-                face_spans.append({"ts": ts, "person_id": person_id})
-
-    speaker_person_counts = {}
-
-    for seg in asrx_data.get("segments", []):
-        start = seg.get("start")
-        end = seg.get("end")
-        speaker = seg.get("speaker_id")
-        if not speaker:
-            continue
-
-        candidates = [f for f in face_spans if start <= f["ts"] <= end]
-        if candidates:
-            person_counts = {}
-            for c in candidates:
-                pid = c["person_id"]
-                person_counts[pid] = person_counts.get(pid, 0) + 1
-
-            if speaker not in speaker_person_counts:
-                speaker_person_counts[speaker] = {}
-
-            best_person = max(person_counts, key=person_counts.get)
-            speaker_person_counts[speaker][best_person] = (
-                speaker_person_counts[speaker].get(best_person, 0) + 1
-            )
-
-    try:
-        conn = psycopg2.connect(DB_URL)
-        cur = conn.cursor()
-
-        for speaker, persons in speaker_person_counts.items():
-            if not persons:
-                continue
-            best_person = max(persons, key=persons.get)
-            print(
-                f"   🎤 {speaker} is likely {best_person} ({persons[best_person]} votes)"
-            )
-
-            cur.execute("SELECT id FROM talents WHERE real_name = %s", (best_person,))
-            row = cur.fetchone()
-
-            if row:
-                talent_id = row[0]
-            else:
-                cur.execute(
-                    "INSERT INTO talents (real_name) VALUES (%s) RETURNING id",
-                    (best_person,),
-                )
-                talent_id = cur.fetchone()[0]
-                print(f"   ✨ Created Talent #{talent_id} ({best_person})")
-
-            cur.execute(
-                """
-                INSERT INTO identity_bindings (talent_id, binding_type, binding_value, source, confidence)
-                VALUES (%s, 'speaker', %s, 'auto_cluster', 0.8)
-                ON CONFLICT (binding_type, binding_value) DO UPDATE SET talent_id = EXCLUDED.talent_id
-            """,
-                (talent_id, speaker),
-            )
-            print(f"   ✅ Bound {speaker} -> {best_person}")
-
-        conn.commit()
-        cur.close()
-        conn.close()
-    except Exception as e:
-        print(f"   ❌ DB Error: {e}")
-
-
-if __name__ == "__main__":
-    main()
@@ -0,0 +1 @@
+face_clustering_processor.py
@@ -33,7 +33,54 @@ def process_pose(
    uuid: str = "",
    sample_interval: int = 3,  # Changed from 30 to match Face
    publisher: RedisPublisher = None,
+    target_frames: list = None,
 ) -> dict:
+    # Check if pose.json or pose.json.tmp already exists (from swift_face_pose)
+    # executor.rs renames output to .json.tmp before running Python script
+    tmp_path = output_path.replace('.json', '.json.tmp')
+    
+    source_path = None
+    if os.path.exists(output_path):
+        source_path = output_path
+        print(f"[Pose] Output exists from swift_face_pose: {output_path}", file=sys.stderr)
+    elif os.path.exists(tmp_path):
+        source_path = tmp_path
+        print(f"[Pose] Temp output exists from swift_face_pose: {tmp_path}", file=sys.stderr)
+    
+    if source_path:
+        with open(source_path) as f:
+            data = json.load(f)
+        
+        detected_frames = len(data.get('frames', []))
+        print(f"[Pose] Loaded {detected_frames} detected frames", file=sys.stderr)
+        
+        # When target_frames is provided (8Hz sampling), skip interpolation
+        # Swift already outputs at sample_interval=3, matching 8Hz for 24fps
+        if target_frames is not None:
+            print(f"[Pose] 8Hz mode: returning {detected_frames} frames without interpolation", file=sys.stderr)
+            if publisher:
+                publisher.progress("pose", 100, 100, f"{detected_frames} frames (8Hz, no interpolation)")
+            return data
+        
+        # Interpolate keypoints for all frames
+        interpolated_data = interpolate_pose(data, video_path)
+        
+        # Write interpolated output
+        with open(output_path, 'w') as f:
+            json.dump(interpolated_data, f)
+        
+        # Delete .json.tmp file so executor.rs won't restore it
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)
+            print(f"[Pose] Deleted temp file: {tmp_path}", file=sys.stderr)
+        
+        total_frames = len(interpolated_data.get('frames', []))
+        print(f"[Pose] Interpolated to {total_frames} frames", file=sys.stderr)
+        
+        if publisher:
+            publisher.progress("pose", 100, 100, f"Interpolated {total_frames} frames")
+        return interpolated_data
+
    swift_bin = SWIFT_POSE_PATH
    if not os.path.exists(swift_bin):
        swift_bin = SWIFT_POSE_ALT
@@ -81,6 +128,126 @@ def process_pose(
        return json.load(f)


+def interpolate_pose(detected_data: dict, video_path: str) -> dict:
+    """Expand sparse pose detections into one entry per video frame.
+
+    Keypoints and bboxes are linearly interpolated between the two nearest
+    detections of a person; outside a person's first/last detection the
+    nearest detection is held unchanged, so every tracked person spans the
+    whole video.
+
+    Persons are matched across frames purely by their position in each
+    frame's 'persons' list; no identity tracking is performed here.
+
+    Args:
+        detected_data: pose result with a 'frames' list (each item holding
+            'frame' and 'persons') and, optionally, 'fps'.
+        video_path: source video; only its frame count and FPS are read.
+
+    Returns:
+        Dict with 'frame_count', 'fps' and 'frames'. Each frame carries
+        'frame', 'timestamp' and 'persons' (each 'keypoints' + 'bbox').
+        Frames without any person are omitted. detected_data itself is
+        returned when it contains no frames.
+    """
+    import cv2
+    from bisect import bisect_left, bisect_right
+
+    detected_frames = detected_data.get('frames', [])
+    if not detected_frames:
+        return detected_data
+
+    # Only metadata is read, so release the capture right away (even on error).
+    cap = cv2.VideoCapture(video_path)
+    try:
+        total_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        video_fps = cap.get(cv2.CAP_PROP_FPS)
+    finally:
+        cap.release()
+
+    # A missing or zero fps would crash the timestamp division below.
+    fps = detected_data.get('fps') or video_fps or 30.0
+
+    # Sort so that "previous"/"next" detection lookups are well defined.
+    detected_frames = sorted(detected_frames, key=lambda f: f['frame'])
+    detected_frame_nums = sorted({f['frame'] for f in detected_frames})
+
+    # An unreadable video reports 0 frames; still cover all detected frames.
+    total_frames = max(total_video_frames, detected_frame_nums[-1] + 1)
+
+    print(f"[Pose] Interpolating from {len(detected_frame_nums)} detected frames to {total_frames} total frames", file=sys.stderr)
+
+    # Per person index: parallel, frame-ordered lists of frame numbers and
+    # the person dicts detected on those frames.
+    tracks = {}
+    for f in detected_frames:
+        for idx, person in enumerate(f.get('persons', [])):
+            nums, poses = tracks.setdefault(idx, ([], []))
+            nums.append(f['frame'])
+            poses.append(person)
+
+    interpolated_frames = []
+    for frame_num in range(total_frames):
+        persons_in_frame = []
+        for nums, poses in tracks.values():
+            # Binary search instead of rescanning the whole track per frame.
+            lo = bisect_right(nums, frame_num) - 1  # last detection <= frame
+            hi = bisect_left(nums, frame_num)  # first detection >= frame
+            before = (nums[lo], poses[lo]) if lo >= 0 else None
+            after = (nums[hi], poses[hi]) if hi < len(nums) else None
+
+            if before and after and before[0] != after[0]:
+                # Strictly between two detections: linear interpolation.
+                t = (frame_num - before[0]) / (after[0] - before[0])
+                kp_before = before[1].get('keypoints', [])
+                kp_after = after[1].get('keypoints', [])
+                keypoints = []
+                for i in range(max(len(kp_before), len(kp_after))):
+                    # A keypoint present on one side only is held constant.
+                    kp0 = kp_before[i] if i < len(kp_before) else kp_after[i]
+                    kp1 = kp_after[i] if i < len(kp_after) else kp_before[i]
+                    # Confidence may be absent; treat it as fully confident.
+                    c0 = kp0.get('confidence', 1.0)
+                    c1 = kp1.get('confidence', 1.0)
+                    keypoints.append({
+                        'name': kp0['name'],
+                        'x': kp0['x'] + t * (kp1['x'] - kp0['x']),
+                        'y': kp0['y'] + t * (kp1['y'] - kp0['y']),
+                        'confidence': c0 + t * (c1 - c0),
+                    })
+                # The bbox is interpolated only when both ends have one;
+                # otherwise the person is dropped for this frame.
+                bbox_before = before[1].get('bbox', {})
+                bbox_after = after[1].get('bbox', {})
+                bbox = None
+                if bbox_before and bbox_after:
+                    bbox = {
+                        key: int(bbox_before[key] + t * (bbox_after[key] - bbox_before[key]))
+                        for key in ('x', 'y', 'width', 'height')
+                    }
+            else:
+                # Exactly on a detection, or outside the track: hold nearest.
+                nearest = before or after
+                keypoints = nearest[1].get('keypoints', [])
+                bbox = nearest[1].get('bbox', {})
+
+            if bbox and bbox.get('width', 0) > 0 and bbox.get('height', 0) > 0:
+                persons_in_frame.append({'keypoints': keypoints, 'bbox': bbox})
+
+        if persons_in_frame:
+            interpolated_frames.append({
+                'frame': frame_num,
+                'timestamp': frame_num / fps,
+                'persons': persons_in_frame,
+            })
+
+    return {
+        'frame_count': len(interpolated_frames),
+        'fps': fps,
+        'frames': interpolated_frames,
+    }
+
+
 def _fallback(video_path, output_path, uuid, sample_interval):
    """Fallback to YOLOv8 Pose"""
    from ultralytics import YOLO
@@ -135,14 +302,21 @@ if __name__ == "__main__":
    parser.add_argument("output_path")
    parser.add_argument("--uuid", "-u", default="")
    parser.add_argument("--sample-interval", type=int, default=3)  # Changed from 30 to match Face
+    parser.add_argument("--frames", type=str, default=None,
+                        help="Comma-separated frame numbers for 8Hz sampling")
    args = parser.parse_args()

+    target_frames = None
+    if args.frames:
+        target_frames = [int(f) for f in args.frames.split(",") if f.strip()]
+        print(f"[Pose] 8Hz target frames: {len(target_frames)} frames", file=sys.stderr)
+
    publisher = RedisPublisher(args.uuid) if args.uuid else None
    if publisher:
        publisher.info("pose", "POSE_START")

    result = process_pose(args.video_path, args.output_path, args.uuid,
-                          args.sample_interval, publisher)
+                          args.sample_interval, publisher, target_frames)
    with open(args.output_path, "w") as f:
        json.dump(result, f, indent=2)
    print(f"Pose: {len(result.get('frames', []))} frames with poses")
@@ -21,8 +21,6 @@ import json
 import argparse
 from collections import defaultdict
 import numpy as np
-import psycopg2
-import psycopg2.extras
 from datetime import datetime

 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
@@ -30,13 +28,8 @@ sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "uti
 from qdrant_faces import update_trace_ids

 # Config
-DB_URL = os.environ.get("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")
-SCHEMA = os.environ.get("MOMENTRY_DB_SCHEMA", "dev")
 OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output_dev")
-
-
-def get_conn():
-    return psycopg2.connect(DB_URL)
+SCHEMA = os.environ.get("DATABASE_SCHEMA", "public")


 def merge_traces_within_cuts(face_data: dict, cut_scenes: list) -> dict:
@@ -146,67 +139,17 @@ def run_face_tracker(


 def store_traced_faces(file_uuid: str, traced_json_path: str, schema: str = SCHEMA):
-    """Insert traced face detections into face_detections table with trace_id"""
-    conn = get_conn()
-    cur = conn.cursor()
-
+    """Update Qdrant _faces collection with trace_id after face tracking.
+    
+    face_detections table is deprecated — trace_id is stored only in Qdrant _faces payload.
+    """
    with open(traced_json_path) as f:
        data = json.load(f)

    frames = data.get("frames", {})
-    total_stored = 0

-    for frame_num_str, frame_data in sorted(frames.items(), key=lambda x: int(x[0])):
-        frame_num = int(frame_num_str)
-        faces = frame_data.get("faces", [])
-
-        for face in faces:
-            trace_id = face.get("trace_id")
-            if trace_id is None:
-                continue
-
-            x = face.get("x", 0)
-            y = face.get("y", 0)
-            w = face.get("width", 0)
-            h = face.get("height", 0)
-            confidence = face.get("confidence", 0.0)
-            face_id = face.get("face_id")
-            if face_id is None:
-                face_id = f"face_{trace_id}"
-            attributes = face.get("attributes")
-
-            bbox = json.dumps({"x": x, "y": y, "width": w, "height": h})
-
-            try:
-                cur.execute(
-                    f"""
-                    UPDATE {schema}.face_detections
-                    SET trace_id = %s, face_id = %s
-                    WHERE file_uuid = %s AND frame_number = %s
-                      AND x = %s AND y = %s AND width = %s AND height = %s
-                    """,
-                    (
-                        trace_id,
-                        face_id,
-                        file_uuid,
-                        frame_num,
-                        x,
-                        y,
-                        w,
-                        h,
-                    ),
-                )
-                if cur.rowcount > 0:
-                    total_stored += 1
-            except Exception as e:
-                print(f"[TRACE] Error storing face at frame {frame_num}: {e}")
-                conn.rollback()
-                continue
-
-    conn.commit()
-
-    # Build trace_mapping for Qdrant update
-    trace_mapping = {}  # {frame: {bbox_key: trace_id}}
+    # Build trace_mapping for Qdrant update: {frame: {bbox_key: trace_id}}
+    trace_mapping = {}
    for frame_num_str, frame_data in sorted(frames.items(), key=lambda x: int(x[0])):
        frame_num = int(frame_num_str)
        trace_mapping[frame_num] = {}
@@ -224,22 +167,26 @@ def store_traced_faces(file_uuid: str, traced_json_path: str, schema: str = SCHE
        print(f"[TRACE] Warning: Qdrant trace_id update failed: {e}")
        qdrant_updated = 0

-    # Log trace summary
-    cur.execute(
-        f"SELECT COUNT(DISTINCT trace_id) FROM {schema}.face_detections WHERE file_uuid = %s AND trace_id IS NOT NULL",
-        (file_uuid,),
-    )
-    db_trace_count = cur.fetchone()[0]
+    # Count unique traces from Qdrant
+    try:
+        from qdrant_faces import get_file_faces
+        points = get_file_faces(file_uuid)
+        trace_ids = set()
+        for p in points:
+            tid = p.get("payload", {}).get("trace_id")
+            if tid is not None and tid > 0:
+                trace_ids.add(tid)
+        qdrant_trace_count = len(trace_ids)
+    except Exception as e:
+        print(f"[TRACE] Warning: Qdrant trace count failed: {e}")
+        qdrant_trace_count = 0

-    cur.close()
-    conn.close()
-
-    print(
-        f"[TRACE] Stored {total_stored} face detections, {db_trace_count} unique traces in DB"
+    total_faces = sum(
+        1 for fd in frames.values() for f in fd.get("faces", []) if f.get("trace_id") is not None
    )
-    if qdrant_updated > 0:
-        print(f"[TRACE] Updated {qdrant_updated} Qdrant points with trace_id")
-    return total_stored, db_trace_count
+
+    print(f"[TRACE] Updated {qdrant_updated} Qdrant points with trace_id, {qdrant_trace_count} unique traces")
+    return total_faces, qdrant_trace_count


 def main():
@@ -248,8 +195,6 @@ def main():

    parser.add_argument("--face-json", help="Path to face.json (default: auto-detect)")

-    parser.add_argument("--schema", default=SCHEMA, help="DB schema name")
-
    parser.add_argument("--uuid", help="UUID for Redis tracking (accepted by executor)")
    parser.add_argument(
        "--filter-eyes",
@@ -270,8 +215,8 @@ def main():
    # Step 1: Run face tracker
    run_face_tracker(face_json, traced_json, filter_eyes=args.filter_eyes)

-    # Step 2: Store in DB with trace_id
-    total, traces = store_traced_faces(args.file_uuid, traced_json, args.schema)
+    # Step 2: Store in Qdrant with trace_id
+    total, traces = store_traced_faces(args.file_uuid, traced_json)
    print(f"[TRACE] Done: {total} detections, {traces} traces")


@@ -0,0 +1,409 @@
+import Foundation
+import Vision
+import ArgumentParser
+import AVFoundation
+
+/// Swift Face+Pose Processor - one pass, two outputs.
+///
+/// Runs VNDetectFaceRectanglesRequest, VNDetectFaceLandmarksRequest and
+/// VNDetectHumanBodyPoseRequest on every `sampleInterval`-th frame.
+/// Frames are read sequentially with AVAssetReader (frame-based, matching
+/// cv2 behavior), so `frame` in the output is a 0-based decode index.
+///
+/// Output (both files): `{"frame_count", "fps", "frames": [...]}` where a face
+/// frame carries `faces` and a pose frame carries `persons`. All pixel
+/// coordinates use a top-left origin (Vision's bottom-left origin is flipped).
+@main
+struct SwiftFacePose: ParsableCommand {
+    @Argument(help: "Video file path")
+    var inputPath: String
+
+    @Argument(help: "Output JSON path for face detection")
+    var faceOutput: String
+
+    @Argument(help: "Output JSON path for pose detection")
+    var poseOutput: String
+
+    @Option(name: .long, help: "Sample interval (frames, default=30)")
+    var sampleInterval: Int = 30
+
+    @Option(name: .long, help: "UUID for logging")
+    var uuid: String = ""
+
+    // MARK: - Tunables
+
+    /// Face observations below this confidence are dropped.
+    private static let minConfidence = 0.6
+    /// Faces narrower or shorter than this many pixels are dropped.
+    private static let minFaceSize = 20
+    /// A rectangle observation whose IoU with any landmark observation exceeds
+    /// this value is treated as a duplicate of that landmark observation.
+    private static let duplicateIoU: CGFloat = 0.3
+    /// Body joints must exceed this confidence to contribute to the person bbox.
+    private static let bboxJointConfidence: Float = 0.1
+
+    /// Body joints to export, in output order, with their JSON names.
+    /// Keyed on the JointName value itself so the output names do not depend
+    /// on the textual form of Vision's internal raw keys.
+    private static let bodyJoints: [(joint: VNHumanBodyPoseObservation.JointName, name: String)] = [
+        (.nose, "nose"),
+        (.leftEye, "left_eye"), (.rightEye, "right_eye"),
+        (.leftEar, "left_ear"), (.rightEar, "right_ear"),
+        (.neck, "neck"), (.root, "root"),
+        (.leftShoulder, "left_shoulder"), (.rightShoulder, "right_shoulder"),
+        (.leftElbow, "left_elbow"), (.rightElbow, "right_elbow"),
+        (.leftWrist, "left_wrist"), (.rightWrist, "right_wrist"),
+        (.leftHip, "left_hip"), (.rightHip, "right_hip"),
+        (.leftKnee, "left_knee"), (.rightKnee, "right_knee"),
+        (.leftAnkle, "left_ankle"), (.rightAnkle, "right_ankle"),
+    ]
+
+    /// Rejects a non-positive sample interval, which would otherwise trap on
+    /// the modulo / division in `run()`.
+    func validate() throws {
+        guard sampleInterval > 0 else {
+            throw ValidationError("--sample-interval must be a positive number of frames")
+        }
+    }
+
+    /// Decodes the video, runs the three Vision requests on sampled frames and
+    /// writes the face and pose JSON files.
+    ///
+    /// Returns without writing anything (after logging) when the file has no
+    /// video track or the reader cannot start. A face-output write failure is
+    /// logged only; a pose-output failure is logged and rethrown.
+    mutating func run() throws {
+        let startTime = Date()
+        print("[SwiftFacePose] Vision face+pose detection: \(inputPath)")
+
+        let url = URL(fileURLWithPath: inputPath)
+        let asset = AVAsset(url: url)
+
+        guard let videoTrack = asset.tracks(withMediaType: .video).first else {
+            print("[SwiftFacePose] No video track found")
+            return
+        }
+
+        let fps = videoTrack.nominalFrameRate
+        let duration = CMTimeGetSeconds(asset.duration)
+        // Int(_:) traps on NaN/inf, which an indefinite duration produces.
+        let estimatedFrames = duration * Double(fps)
+        let totalFrames = (estimatedFrames.isFinite && estimatedFrames >= 0) ? Int(estimatedFrames) : 0
+        print("[SwiftFacePose] Video: \(Int(videoTrack.naturalSize.width))x\(Int(videoTrack.naturalSize.height)), \(String(format: "%.1f", fps))fps, \(totalFrames) frames, interval=\(sampleInterval)")
+
+        // read sequentially, matching cv2 frame-by-frame behavior
+        let reader = try AVAssetReader(asset: asset)
+        let outputSettings: [String: Any] = [
+            kCVPixelBufferPixelFormatTypeKey as String: kCVPixelFormatType_32BGRA
+        ]
+        let trackOutput = AVAssetReaderTrackOutput(track: videoTrack, outputSettings: outputSettings)
+        trackOutput.alwaysCopiesSampleData = false
+        reader.add(trackOutput)
+        guard reader.startReading() else {
+            print("[SwiftFacePose] Failed to start AVAssetReader: \(reader.error?.localizedDescription ?? "unknown")")
+            return
+        }
+
+        var faceFrames: [[String: Any]] = []
+        var poseFrames: [[String: Any]] = []
+        var processedCount = 0
+        var frameIndex = 0
+        // At least 1 so the progress percentage never divides by zero.
+        let totalSamples = max(1, totalFrames / sampleInterval)
+
+        while let sampleBuffer = trackOutput.copyNextSampleBuffer() {
+            defer { frameIndex += 1 }
+
+            guard frameIndex % sampleInterval == 0,
+                  let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else {
+                continue
+            }
+
+            let imageSize = CGSize(width: CGFloat(CVPixelBufferGetWidth(pixelBuffer)),
+                                   height: CGFloat(CVPixelBufferGetHeight(pixelBuffer)))
+            // A zero frame rate would yield NaN/inf, which JSON cannot encode.
+            let seconds = fps > 0 ? Double(frameIndex) / Double(fps) : 0.0
+
+            let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:])
+            let faceReq = VNDetectFaceRectanglesRequest()
+            let lmReq = VNDetectFaceLandmarksRequest()
+            let bodyReq = VNDetectHumanBodyPoseRequest()
+
+            do {
+                try handler.perform([faceReq, lmReq, bodyReq])
+            } catch {
+                print("[SwiftFacePose] Vision request failed at frame \(frameIndex): \(error)")
+                continue
+            }
+
+            let faceObservations = faceReq.results ?? []
+            let landmarkObservations = lmReq.results ?? []
+            // "Has face" means Vision reported something, even if every
+            // observation is later filtered out by confidence or size.
+            let hasFace = !faceObservations.isEmpty || !landmarkObservations.isEmpty
+
+            // ── Face output ──
+            var faces: [[String: Any]] = []
+            for lmObs in landmarkObservations {
+                guard var entry = Self.faceEntry(for: lmObs,
+                                                 confidence: Double(lmObs.confidence),
+                                                 imageSize: imageSize) else { continue }
+                if let lms = lmObs.landmarks {
+                    for (key, value) in Self.landmarkFields(lms, imageSize: imageSize) {
+                        entry[key] = value
+                    }
+                }
+                faces.append(entry)
+            }
+
+            // Rectangle-only detections: keep those not already covered by a
+            // landmark observation (compared against all of them, filtered or not).
+            let landmarkBoxes = landmarkObservations.map { $0.boundingBox }
+            for faceObs in faceObservations
+            where !Self.isDuplicate(faceObs.boundingBox, of: landmarkBoxes) {
+                let quality = Double(faceObs.faceCaptureQuality ?? faceObs.confidence)
+                if let entry = Self.faceEntry(for: faceObs, confidence: quality, imageSize: imageSize) {
+                    faces.append(entry)
+                }
+            }
+
+            if !faces.isEmpty {
+                faceFrames.append([
+                    "frame": frameIndex,
+                    "timestamp": seconds,
+                    "faces": faces,
+                ])
+            }
+
+            // ── Pose output ──
+            // Rule: Face ≤ Pose - every face frame must have a pose frame.
+            // Face landmarks (nose, leftEye, rightEye) double as pose keypoints.
+            var persons: [[String: Any]] = landmarkObservations.compactMap {
+                Self.landmarkPerson(from: $0, imageSize: imageSize)
+            }
+            // Body pose detections are appended as additional persons.
+            for pose in bodyReq.results ?? [] {
+                persons.append(Self.bodyPerson(from: pose, imageSize: imageSize))
+            }
+
+            if hasFace || !persons.isEmpty {
+                poseFrames.append([
+                    "frame": frameIndex,
+                    "timestamp": seconds,
+                    "persons": persons,
+                ])
+            }
+
+            processedCount += 1
+
+            if processedCount % 100 == 0 {
+                let elapsed = Date().timeIntervalSince(startTime)
+                let pct = Int(Double(processedCount) / Double(totalSamples) * 100)
+                print("[SwiftFacePose] \(faceFrames.count) face frames, \(poseFrames.count) pose frames, \(pct)% complete, \(Int(elapsed))s elapsed")
+                fflush(stdout)
+            }
+        }
+
+        // copyNextSampleBuffer() returns nil both at end of stream and on
+        // error; surface the latter so truncated output is not mistaken for success.
+        if reader.status == .failed {
+            print("[SwiftFacePose] WARNING: reader stopped at frame \(frameIndex): \(reader.error?.localizedDescription ?? "unknown")")
+        }
+        reader.cancelReading()
+
+        let faceOutputDict: [String: Any] = [
+            "frame_count": faceFrames.count,
+            "fps": Double(fps),
+            "frames": faceFrames,
+        ]
+        do {
+            try Self.writeJSON(faceOutputDict, to: faceOutput, options: [])
+            print("[SwiftFacePose] Face output written: \(faceOutput)")
+            // Verify file exists
+            if FileManager.default.fileExists(atPath: faceOutput) {
+                print("[SwiftFacePose] Verified: file exists at \(faceOutput)")
+            } else {
+                print("[SwiftFacePose] ERROR: file not found after write!")
+            }
+        } catch {
+            print("[SwiftFacePose] ERROR writing face output: \(error)")
+        }
+
+        let poseOutputDict: [String: Any] = [
+            "frame_count": poseFrames.count,
+            "fps": Double(fps),
+            "frames": poseFrames,
+        ]
+        do {
+            try Self.writeJSON(poseOutputDict, to: poseOutput, options: [.prettyPrinted])
+        } catch {
+            // Log before propagating so a missing pose file is explained in stdout.
+            print("[SwiftFacePose] ERROR writing pose output: \(error)")
+            throw error
+        }
+
+        let elapsed = Date().timeIntervalSince(startTime)
+        print("[SwiftFacePose] Done: \(faceFrames.count) face frames, \(poseFrames.count) pose frames, \(String(format: "%.1f", elapsed))s")
+    }
+
+    // MARK: - Face helpers
+
+    /// Builds the JSON entry (bbox, confidence, optional head pose) for one
+    /// face observation, or nil when it fails the confidence or size filter.
+    private static func faceEntry(for face: VNFaceObservation,
+                                  confidence: Double,
+                                  imageSize: CGSize) -> [String: Any]? {
+        if confidence < minConfidence { return nil }
+
+        let box = face.boundingBox
+        let width = Int(box.size.width * imageSize.width)
+        let height = Int(box.size.height * imageSize.height)
+        if width < minFaceSize || height < minFaceSize { return nil }
+
+        let x = Int(box.origin.x * imageSize.width)
+        // Vision boxes are normalized with a bottom-left origin; flip to top-left.
+        let y = Int((1.0 - box.origin.y - box.size.height) * imageSize.height)
+
+        var entry: [String: Any] = [
+            "bbox": ["x": max(0, x), "y": max(0, y), "width": width, "height": height],
+            "confidence": confidence,
+        ]
+        if let pose = headPose(of: face) {
+            entry["pose"] = pose
+        }
+        return entry
+    }
+
+    /// Returns roll/yaw (and pitch when available) for a face, or nil unless
+    /// both yaw and roll are present.
+    private static func headPose(of face: VNFaceObservation) -> [String: Any]? {
+        guard let yaw = face.yaw?.doubleValue,
+              let roll = face.roll?.doubleValue else { return nil }
+        var info: [String: Any] = ["roll": roll, "yaw": yaw]
+        if let pitch = face.pitch?.doubleValue {
+            info["pitch"] = pitch
+        }
+        return info
+    }
+
+    /// Converts image-space points to `[x, y]` pairs with a top-left origin.
+    private static func topLeftPoints(_ points: [CGPoint], imageHeight: CGFloat) -> [[Double]] {
+        points.map { [Double($0.x), Double(imageHeight - $0.y)] }
+    }
+
+    /// Returns the optional `landmarks` (eyes, nose) and `lips` fields for a
+    /// face entry. A field is omitted when all of its regions are empty.
+    private static func landmarkFields(_ lms: VNFaceLandmarks2D, imageSize: CGSize) -> [String: Any] {
+        var fields: [String: Any] = [:]
+
+        var eyesAndNose: [String: [[Double]]] = [:]
+        let regions: [(String, VNFaceLandmarkRegion2D?)] = [
+            ("left_eye", lms.leftEye), ("right_eye", lms.rightEye), ("nose", lms.nose),
+        ]
+        for (name, region) in regions {
+            let points = region?.pointsInImage(imageSize: imageSize) ?? []
+            if !points.isEmpty {
+                eyesAndNose[name] = topLeftPoints(points, imageHeight: imageSize.height)
+            }
+        }
+        if !eyesAndNose.isEmpty {
+            fields["landmarks"] = eyesAndNose
+        }
+
+        let outer = lms.outerLips?.pointsInImage(imageSize: imageSize) ?? []
+        let inner = lms.innerLips?.pointsInImage(imageSize: imageSize) ?? []
+        if !outer.isEmpty || !inner.isEmpty {
+            // Both keys are always written, even if one contour is empty.
+            fields["lips"] = [
+                "outer_lips": topLeftPoints(outer, imageHeight: imageSize.height),
+                "inner_lips": topLeftPoints(inner, imageHeight: imageSize.height),
+            ]
+        }
+        return fields
+    }
+
+    /// True when `box` overlaps any of `others` with IoU above `duplicateIoU`.
+    private static func isDuplicate(_ box: CGRect, of others: [CGRect]) -> Bool {
+        others.contains { other in
+            let ix = max(box.origin.x, other.origin.x)
+            let iy = max(box.origin.y, other.origin.y)
+            let iw = min(box.maxX, other.maxX) - ix
+            let ih = min(box.maxY, other.maxY) - iy
+            if iw <= 0 || ih <= 0 { return false }
+            let intersection = iw * ih
+            let union = box.width * box.height + other.width * other.height - intersection
+            return intersection / union > duplicateIoU
+        }
+    }
+
+    // MARK: - Pose helpers
+
+    /// Builds a pose "person" from a face's nose and eye landmarks so that a
+    /// frame with a confident face always has pose keypoints. The bbox is a
+    /// zero placeholder. Returns nil when the face is below the confidence
+    /// threshold or has no usable landmark points.
+    private static func landmarkPerson(from face: VNFaceObservation, imageSize: CGSize) -> [String: Any]? {
+        let confidence = Double(face.confidence)
+        if confidence < minConfidence { return nil }
+        guard let lms = face.landmarks else { return nil }
+
+        let regions: [(String, VNFaceLandmarkRegion2D?)] = [
+            ("nose", lms.nose), ("left_eye", lms.leftEye), ("right_eye", lms.rightEye),
+        ]
+        var keypoints: [[String: Any]] = []
+        for (name, region) in regions {
+            guard let region = region else { continue }
+            for pt in region.pointsInImage(imageSize: imageSize) {
+                keypoints.append([
+                    "name": name,
+                    "x": Double(pt.x),
+                    "y": Double(imageSize.height - pt.y),
+                    "confidence": confidence,
+                ])
+            }
+        }
+        if keypoints.isEmpty { return nil }
+        return [
+            "keypoints": keypoints,
+            "bbox": ["x": 0, "y": 0, "width": 0, "height": 0],
+        ]
+    }
+
+    /// Builds a pose "person" from a body-pose observation: every joint Vision
+    /// returns is exported, and the bbox encloses joints whose confidence
+    /// exceeds `bboxJointConfidence` (zero bbox when there are none).
+    private static func bodyPerson(from pose: VNHumanBodyPoseObservation, imageSize: CGSize) -> [String: Any] {
+        var keypoints: [[String: Any]] = []
+        var minX = CGFloat.greatestFiniteMagnitude
+        var minY = CGFloat.greatestFiniteMagnitude
+        var maxX: CGFloat = 0
+        var maxY: CGFloat = 0
+
+        for (joint, name) in bodyJoints {
+            guard let point = try? pose.recognizedPoint(joint) else { continue }
+            let px = point.location.x * imageSize.width
+            // Flip Vision's bottom-left origin to top-left.
+            let py = imageSize.height - point.location.y * imageSize.height
+            keypoints.append([
+                "name": name,
+                "x": px,
+                "y": py,
+                "confidence": point.confidence,
+            ])
+            if point.confidence > bboxJointConfidence {
+                minX = min(minX, px)
+                minY = min(minY, py)
+                maxX = max(maxX, px)
+                maxY = max(maxY, py)
+            }
+        }
+
+        var bbox: [String: Any] = ["x": 0, "y": 0, "width": 0, "height": 0]
+        if maxX > minX {
+            bbox = [
+                "x": Int(minX),
+                "y": Int(minY),
+                "width": Int(maxX - minX),
+                "height": Int(maxY - minY),
+            ]
+        }
+        return ["keypoints": keypoints, "bbox": bbox]
+    }
+
+    // MARK: - Output
+
+    /// Serializes `object` as JSON and writes it to `path`.
+    private static func writeJSON(_ object: [String: Any],
+                                  to path: String,
+                                  options: JSONSerialization.WritingOptions) throws {
+        let data = try JSONSerialization.data(withJSONObject: object, options: options)
+        try data.write(to: URL(fileURLWithPath: path))
+    }
+}