fix: ASRX duplication, TKG edges, trace ingest, and add pipeline progress publishing
- ASRX handler no longer stores duplicate 'asr' pre_chunks
- Pre_chunks storage made idempotent (delete-before-insert)
- Rule 1 + trace_ingest changed to query 'asrx', not 'asr'
- Trace chunks removed (dynamic from TKG/Qdrant)
- TKG scroll_face_points fixed: trace_id >= 1 (not == 1)
- TKG AsrxSegmentEntry: start/end -> start_time/end_time (to match ASRX JSON)
- Unregister error handling: log errors instead of silently discarding them
- Add publish_pipeline_progress calls at each pipeline stage (processors, rule1, face_trace, identity_agent, TKG, rule2, completion)
This commit is contained in:
+237
-74
@@ -1,15 +1,17 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Appearance Processor - HSV color feature extraction for person tracking
|
||||
Appearance Processor - Body part color extraction using pose keypoints
|
||||
|
||||
Input:
|
||||
- video_path: source video
|
||||
- pose_json: pose.json with frame bboxes
|
||||
- pose_json: pose.json with keypoints and bbox
|
||||
- output_path: output JSON
|
||||
|
||||
Output: appearance.json with HSV histogram per person per frame
|
||||
Output: appearance.json with per-person per-frame body part colors
|
||||
|
||||
Depends on pose.json (bbox). Same 0-based frame numbering as face/pose/mediapipe.
|
||||
Regions: head, neck, front_upper_body, front_lower_body,
|
||||
back_upper_body, back_lower_body, left_hand, right_hand,
|
||||
left_foot, right_foot
|
||||
"""
|
||||
|
||||
import sys
|
||||
@@ -20,82 +22,223 @@ import cv2
|
||||
import numpy as np
|
||||
|
||||
|
||||
def extract_appearance(frame, bbox):
|
||||
x, y, w, h = bbox["x"], bbox["y"], bbox["width"], bbox["height"]
|
||||
if w <= 0 or h <= 0:
|
||||
return None
|
||||
def get_kp(keypoints, name):
    """Return the first keypoint called *name* as ``(x, y, confidence)``.

    Args:
        keypoints: Iterable of keypoint dicts. Each dict is looked up by its
            "name" key and must carry "x" and "y"; "confidence" is optional
            and defaults to 1.0. ``None`` is treated as an empty list.
        name: Keypoint name to search for (e.g. "nose").

    Returns:
        A ``(x, y, confidence)`` tuple for the first match, or ``None`` when
        no keypoint has that name.
    """
    # `or ()` lets callers pass None for "no pose detected" without a TypeError.
    for kp in keypoints or ():
        if kp.get("name") == name:
            return (kp["x"], kp["y"], kp.get("confidence", 1.0))
    return None
|
||||
|
||||
x1, y1 = max(0, x), max(0, y)
|
||||
x2 = min(frame.shape[1], x + w)
|
||||
y2 = min(frame.shape[0], y + h)
|
||||
if x2 <= x1 or y2 <= y1:
|
||||
return None
|
||||
|
||||
person_roi = frame[y1:y2, x1:x2]
|
||||
hsv = cv2.cvtColor(person_roi, cv2.COLOR_BGR2HSV)
|
||||
def determine_facing(keypoints):
    """Classify a person's facing direction from keypoint confidences.

    The decision uses only the confidences of the "nose", "left_shoulder"
    and "right_shoulder" keypoints (looked up with ``get_kp``):

    * nose confidence > 0.5                               -> "front"
    * both shoulders > 0.5 and nose missing or < 0.2      -> "back"
    * at least one shoulder > 0.5                         -> "profile"
    * otherwise                                           -> "unknown"

    Args:
        keypoints: Iterable of keypoint dicts as accepted by ``get_kp``.

    Returns:
        One of "front", "back", "profile" or "unknown".
    """
    nose = get_kp(keypoints, "nose")
    if nose and nose[2] > 0.5:
        return "front"

    # Shoulders only matter once a confident nose has been ruled out.
    shoulders = (
        get_kp(keypoints, "left_shoulder"),
        get_kp(keypoints, "right_shoulder"),
    )
    sh_vis = sum(1 for s in shoulders if s and s[2] > 0.5)

    if sh_vis >= 2 and (not nose or nose[2] < 0.2):
        return "back"
    if sh_vis >= 1:
        return "profile"
    return "unknown"
|
||||
|
||||
|
||||
def extract_color(roi_bgr):
|
||||
"""Extract HSV histogram and dominant colors from an ROI"""
|
||||
if roi_bgr is None or roi_bgr.size == 0:
|
||||
return None
|
||||
if roi_bgr.shape[0] < 2 or roi_bgr.shape[1] < 2:
|
||||
return None
|
||||
hsv = cv2.cvtColor(roi_bgr, cv2.COLOR_BGR2HSV)
|
||||
pixels = hsv.reshape(-1, 3).astype(np.float32)
|
||||
|
||||
# HSV histograms
|
||||
h_hist = cv2.calcHist([hsv], [0], None, [30], [0, 180]).flatten()
|
||||
s_hist = cv2.calcHist([hsv], [1], None, [32], [0, 256]).flatten()
|
||||
v_hist = cv2.calcHist([hsv], [2], None, [32], [0, 256]).flatten()
|
||||
h_sum = h_hist.sum() or 1
|
||||
s_sum = s_hist.sum() or 1
|
||||
v_sum = v_hist.sum() or 1
|
||||
hs = h_hist.sum() or 1
|
||||
ss = s_hist.sum() or 1
|
||||
vs = v_hist.sum() or 1
|
||||
|
||||
# Dominant colors via k-means
|
||||
dominant = []
|
||||
if len(pixels) >= 5:
|
||||
criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
|
||||
_, labels, centers = cv2.kmeans(
|
||||
pixels, 5, None, criteria, 10, cv2.KMEANS_RANDOM_CENTERS
|
||||
)
|
||||
_, labels, centers = cv2.kmeans(pixels, 5, None, criteria, 10, cv2.KMEANS_RANDOM_CENTERS)
|
||||
counts = np.bincount(labels.flatten())
|
||||
dominant = centers[np.argsort(-counts)[:5]].tolist()
|
||||
elif len(pixels) > 0:
|
||||
dominant = [pixels.mean(axis=0).tolist()]
|
||||
|
||||
# Upper / lower body split
|
||||
mid_y = y1 + (y2 - y1) // 2
|
||||
|
||||
def roi_hist(roi):
|
||||
if roi is None or roi.size == 0:
|
||||
return None
|
||||
hsv_r = cv2.cvtColor(roi, cv2.COLOR_BGR2HSV)
|
||||
hh = cv2.calcHist([hsv_r], [0], None, [30], [0, 180]).flatten()
|
||||
sh = cv2.calcHist([hsv_r], [1], None, [32], [0, 256]).flatten()
|
||||
vh = cv2.calcHist([hsv_r], [2], None, [32], [0, 256]).flatten()
|
||||
hs = hh.sum() or 1
|
||||
ss = sh.sum() or 1
|
||||
vs = vh.sum() or 1
|
||||
return [(hh / hs).tolist(), (sh / ss).tolist(), (vh / vs).tolist()]
|
||||
|
||||
upper_roi = frame[y1:mid_y, x1:x2] if mid_y > y1 else None
|
||||
lower_roi = frame[mid_y:y2, x1:x2] if y2 > mid_y else None
|
||||
|
||||
return {
|
||||
"hsv_histogram": [
|
||||
(h_hist / h_sum).tolist(),
|
||||
(s_hist / s_sum).tolist(),
|
||||
(v_hist / v_sum).tolist(),
|
||||
],
|
||||
"hsv_histogram": [(h_hist / hs).tolist(), (s_hist / ss).tolist(), (v_hist / vs).tolist()],
|
||||
"dominant_colors": dominant,
|
||||
"upper_body": roi_hist(upper_roi),
|
||||
"lower_body": roi_hist(lower_roi),
|
||||
}
|
||||
|
||||
|
||||
def safe_roi(frame, x, y, w, h):
    """Return the part of *frame* covered by a box, clipped to the frame.

    Args:
        frame: Image array indexed as ``frame[row, col]``; ``frame.shape[0]``
            is used as the height and ``frame.shape[1]`` as the width.
        x, y: Top-left corner of the box (may be negative or fractional).
        w, h: Box width and height.

    Returns:
        The slice ``frame[y1:y2, x1:x2]`` (a view, not a copy, for NumPy
        arrays), or ``None`` when the frame is missing, the box has a
        non-positive size, or nothing of it lies inside the frame.
    """
    if frame is None or w <= 0 or h <= 0:
        return None

    # Coordinates are truncated to ints, then clamped to the frame bounds.
    x1 = max(0, int(x))
    y1 = max(0, int(y))
    x2 = min(frame.shape[1], int(x + w))
    y2 = min(frame.shape[0], int(y + h))

    if x2 <= x1 or y2 <= y1:
        return None
    return frame[y1:y2, x1:x2]
|
||||
|
||||
|
||||
def compute_body_regions(keypoints, face_bbox, frame_shape):
    """Derive body-part boxes from a face box and pose keypoints.

    The face box supplies the scale (every region size is a multiple of the
    face width/height); keypoints supply the positions, with face-relative
    fallbacks when a keypoint is absent. Each raw box is finally grown by a
    per-region margin on every side.

    Args:
        keypoints: Iterable of keypoint dicts as accepted by ``get_kp``.
        face_bbox: Dict with "x", "y", "width" and "height".
        frame_shape: Accepted for interface compatibility; not used here.
            The returned boxes are NOT clipped to the frame.

    Returns:
        Dict mapping region name to ``{"x", "y", "width", "height"}`` for:
        head, neck, front_upper_body, back_upper_body, front_lower_body,
        back_lower_body, left_hand, right_hand, left_foot, right_foot
        (in that insertion order).
    """
    fy = face_bbox["y"]
    fw, fh = face_bbox["width"], face_bbox["height"]
    face_cx = face_bbox["x"] + fw / 2

    def box(x, y, width, height):
        return {"x": x, "y": y, "width": width, "height": height}

    nose = get_kp(keypoints, "nose")
    l_shoulder = get_kp(keypoints, "left_shoulder")
    r_shoulder = get_kp(keypoints, "right_shoulder")
    l_wrist = get_kp(keypoints, "left_wrist")
    r_wrist = get_kp(keypoints, "right_wrist")
    l_hip = get_kp(keypoints, "left_hip")
    r_hip = get_kp(keypoints, "right_hip")
    l_ankle = get_kp(keypoints, "left_ankle")
    r_ankle = get_kp(keypoints, "right_ankle")

    # Anchor points; each falls back to a face-relative estimate.
    # NOTE(review): nose/shoulder/hip confidences are not checked here, only
    # presence; wrists and ankles below do apply a 0.3 threshold.
    kp_nose = (nose[0], nose[1]) if nose else (face_cx, fy + fh * 0.5)
    kp_sh_l = l_shoulder[0] if l_shoulder else (face_cx - fw * 1.5)
    kp_sh_r = r_shoulder[0] if r_shoulder else (face_cx + fw * 1.5)
    kp_sh_mid_x = (kp_sh_l + kp_sh_r) / 2
    if l_shoulder and r_shoulder:
        kp_sh_mid_y = (l_shoulder[1] + r_shoulder[1]) / 2
    else:
        kp_sh_mid_y = fy + fh + fh * 0.3
    if l_hip and r_hip:
        kp_hip_y = (l_hip[1] + r_hip[1]) / 2
    else:
        kp_hip_y = kp_sh_mid_y + fw * 2.0
    kp_hip_l = l_hip[0] if l_hip else (kp_sh_mid_x - fw * 1.2)
    kp_hip_r = r_hip[0] if r_hip else (kp_sh_mid_x + fw * 1.2)

    regions = {}

    # head: nose-aligned, face-proportional
    head_w = fw * 1.6
    head_h = fh * 1.5
    regions["head"] = box(
        kp_nose[0] - head_w / 2, kp_nose[1] - head_h * 0.5, head_w, head_h
    )

    # neck: nose-to-shoulder, face-width
    neck_w = fw * 1.5
    regions["neck"] = box(
        kp_sh_mid_x - neck_w / 2,
        kp_nose[1] + fh * 0.4,
        neck_w,
        max(kp_sh_mid_y - kp_nose[1] - fh * 0.4, fh * 0.3),
    )

    # upper body: shoulder-aligned; front and back share the same geometry
    ub_w = max(abs(kp_sh_r - kp_sh_l) * 1.3, fw * 3.0)
    ub_h = fh * 3.0
    regions["front_upper_body"] = box(
        kp_sh_mid_x - ub_w / 2, kp_sh_mid_y, ub_w, ub_h
    )
    regions["back_upper_body"] = dict(regions["front_upper_body"])

    # lower body: hip-aligned vertically.
    # NOTE(review): horizontally this is centred on the shoulder midpoint,
    # not the hip midpoint — confirm that is intended.
    lb_w = max(abs(kp_hip_r - kp_hip_l) * 1.3, fw * 3.5)
    lb_h = fh * 3.0
    regions["front_lower_body"] = box(
        kp_sh_mid_x - lb_w / 2, kp_hip_y, lb_w, lb_h
    )
    regions["back_lower_body"] = dict(regions["front_lower_body"])

    # hands: wrist-centred when the wrist is confident, else beside shoulders
    hand = fw * 1.0
    if l_wrist and l_wrist[2] > 0.3:
        regions["left_hand"] = box(
            l_wrist[0] - hand / 2, l_wrist[1] - hand / 2, hand, hand
        )
    else:
        regions["left_hand"] = box(
            kp_sh_l - hand, kp_sh_mid_y + fh * 0.5, hand, hand
        )
    if r_wrist and r_wrist[2] > 0.3:
        regions["right_hand"] = box(
            r_wrist[0] - hand / 2, r_wrist[1] - hand / 2, hand, hand
        )
    else:
        regions["right_hand"] = box(
            kp_sh_r, kp_sh_mid_y + fh * 0.5, hand, hand
        )

    # feet: start at the ankle when confident, else estimated below the hips
    foot = fw * 1.0
    if l_ankle and l_ankle[2] > 0.3:
        regions["left_foot"] = box(
            l_ankle[0] - foot / 2, l_ankle[1], foot, foot * 0.75
        )
    else:
        regions["left_foot"] = box(
            kp_sh_mid_x - fw * 1.0, kp_hip_y + fh * 2.5, foot, foot * 0.75
        )
    if r_ankle and r_ankle[2] > 0.3:
        regions["right_foot"] = box(
            r_ankle[0] - foot / 2, r_ankle[1], foot, foot * 0.75
        )
    else:
        regions["right_foot"] = box(
            kp_sh_mid_x + fw * 1.0 - foot, kp_hip_y + fh * 2.5, foot, foot * 0.75
        )

    # Grow every box outward by a per-region fraction of its own size.
    margins = {
        "head": 0.10, "neck": 0.15,
        "front_upper_body": 0.20, "back_upper_body": 0.20,
        "front_lower_body": 0.15, "back_lower_body": 0.15,
        "left_hand": 0.25, "right_hand": 0.25,
        "left_foot": 0.20, "right_foot": 0.20,
    }
    expanded = {}
    for name, rb in regions.items():
        m = margins.get(name, 0.15)
        dx = int(rb["width"] * m)
        dy = int(rb["height"] * m)
        expanded[name] = box(
            rb["x"] - dx,
            rb["y"] - dy,
            rb["width"] + dx * 2,
            rb["height"] + dy * 2,
        )
    return expanded
|
||||
|
||||
|
||||
def filter_by_facing(regions, facing):
    """Drop the torso regions that cannot be seen for the given facing.

    "front" removes the back_* torso regions, "back" removes the front_*
    ones; any other facing value leaves the mapping untouched.

    Args:
        regions: Dict of region name -> box. It is modified IN PLACE.
        facing: Facing label, e.g. "front", "back", "profile", "unknown".

    Returns:
        The same ``regions`` object that was passed in.
    """
    hidden_by_facing = {
        "front": ("back_upper_body", "back_lower_body"),
        "back": ("front_upper_body", "front_lower_body"),
    }
    for key in hidden_by_facing.get(facing, ()):
        regions.pop(key, None)
    return regions
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Appearance Processor")
|
||||
parser.add_argument("video_path", help="Video file path")
|
||||
parser.add_argument("pose_json", help="Pose JSON path (bbox input)")
|
||||
parser.add_argument("output_path", help="Output JSON path")
|
||||
parser.add_argument("video_path")
|
||||
parser.add_argument("pose_json")
|
||||
parser.add_argument("output_path")
|
||||
parser.add_argument("--uuid", "-u", default="")
|
||||
args = parser.parse_args()
|
||||
|
||||
with open(args.pose_json) as f:
|
||||
pose_data = json.load(f)
|
||||
|
||||
# Load face.json for anchor bbox (same directory as pose_json)
|
||||
face_path = args.pose_json.replace(".pose.json", ".face.json")
|
||||
face_data = {}
|
||||
if os.path.exists(face_path):
|
||||
with open(face_path) as f:
|
||||
face_data = json.load(f)
|
||||
# Build frame -> face bbox lookup
|
||||
face_by_frame = {}
|
||||
for fframe in face_data.get("frames", []):
|
||||
fn = fframe.get("frame")
|
||||
faces = fframe.get("faces", [])
|
||||
if faces:
|
||||
face_by_frame[fn] = faces[0] # first face bbox
|
||||
|
||||
fps = pose_data.get("fps", 30.0)
|
||||
|
||||
cap = cv2.VideoCapture(args.video_path)
|
||||
@@ -115,38 +258,58 @@ def main():
|
||||
if not ret:
|
||||
continue
|
||||
|
||||
# Get face bbox for this frame
|
||||
face_bbox = face_by_frame.get(frame_num, persons[0].get("bbox", {"x": 0, "y": 0, "width": 0, "height": 0}))
|
||||
|
||||
frame_persons = []
|
||||
for pid, person in enumerate(persons):
|
||||
keypoints = person.get("keypoints", [])
|
||||
bbox = person.get("bbox", {})
|
||||
if bbox.get("width", 0) <= 0 or bbox.get("height", 0) <= 0:
|
||||
if not keypoints:
|
||||
continue
|
||||
appearance = extract_appearance(frame, bbox)
|
||||
if appearance is None:
|
||||
continue
|
||||
frame_persons.append(
|
||||
{
|
||||
"person_id": pid,
|
||||
"bbox": bbox,
|
||||
**appearance,
|
||||
}
|
||||
)
|
||||
|
||||
facing = determine_facing(keypoints)
|
||||
all_regions = compute_body_regions(keypoints, face_bbox, frame.shape)
|
||||
regions = filter_by_facing(all_regions, facing)
|
||||
|
||||
body_parts = []
|
||||
for name, rb in regions.items():
|
||||
roi = safe_roi(frame, rb["x"], rb["y"], rb["width"], rb["height"])
|
||||
color = extract_color(roi)
|
||||
if color is None:
|
||||
continue
|
||||
body_parts.append({
|
||||
"name": name,
|
||||
"bbox": rb,
|
||||
"hsv_histogram": color["hsv_histogram"],
|
||||
"dominant_colors": color["dominant_colors"],
|
||||
})
|
||||
|
||||
# Full bbox reference colors
|
||||
full = None
|
||||
if bbox.get("width", 0) > 0 and bbox.get("height", 0) > 0:
|
||||
full_roi = safe_roi(frame, bbox["x"], bbox["y"], bbox["width"], bbox["height"])
|
||||
full = extract_color(full_roi)
|
||||
|
||||
frame_persons.append({
|
||||
"person_id": pid,
|
||||
"bbox": bbox,
|
||||
"facing": facing,
|
||||
"body_parts": body_parts,
|
||||
"dominant_colors": full["dominant_colors"] if full else [],
|
||||
"hsv_histogram": full["hsv_histogram"] if full else [[], [], []],
|
||||
})
|
||||
|
||||
if frame_persons:
|
||||
frames_out.append(
|
||||
{
|
||||
"frame": frame_num,
|
||||
"timestamp": pose_frame.get("timestamp", frame_num / fps),
|
||||
"persons": frame_persons,
|
||||
}
|
||||
)
|
||||
frames_out.append({
|
||||
"frame": frame_num,
|
||||
"timestamp": pose_frame.get("timestamp", frame_num / fps),
|
||||
"persons": frame_persons,
|
||||
})
|
||||
|
||||
cap.release()
|
||||
|
||||
output = {
|
||||
"frame_count": len(frames_out),
|
||||
"fps": fps,
|
||||
"frames": frames_out,
|
||||
}
|
||||
output = {"frame_count": len(frames_out), "fps": fps, "frames": frames_out}
|
||||
with open(args.output_path, "w") as f:
|
||||
json.dump(output, f, indent=2, ensure_ascii=False)
|
||||
|
||||
|
||||
+37
-15
@@ -201,7 +201,12 @@ def run_asr(video_path, output_path, uuid: str = "", fps: float = None):
|
||||
if not has_audio_stream(video_path):
|
||||
if publisher:
|
||||
publisher.info("asr", "No audio stream detected, skipping transcription")
|
||||
output = {"language": "", "language_probability": 0.0, "segments": []}
|
||||
output = {
|
||||
"status": "no_audio_track",
|
||||
"language": "",
|
||||
"language_probability": 0.0,
|
||||
"segments": []
|
||||
}
|
||||
with open(output_path, "w") as f:
|
||||
json.dump(output, f, indent=2)
|
||||
if publisher:
|
||||
@@ -336,16 +341,16 @@ def run_asr(video_path, output_path, uuid: str = "", fps: float = None):
|
||||
seg_start = start_t + segment.start
|
||||
seg_end = start_t + segment.end
|
||||
scene_idx = find_scene_idx((seg_start + seg_end) / 2)
|
||||
scene_segments.append({
|
||||
"start_time": seg_start,
|
||||
"end_time": seg_end,
|
||||
"start_frame": int(round(seg_start * fps)),
|
||||
"end_frame": int(round(seg_end * fps)),
|
||||
"text": segment.text.strip(),
|
||||
"scene_number": scene_idx + 1,
|
||||
"language": seg_language,
|
||||
})
|
||||
total_segments += 1
|
||||
scene_segments.append({
|
||||
"start_time": seg_start,
|
||||
"end_time": seg_end,
|
||||
"start_frame": int(round(seg_start * fps)),
|
||||
"end_frame": int(round(seg_end * fps)),
|
||||
"text": segment.text.strip(),
|
||||
"scene_number": scene_idx + 1,
|
||||
"language": seg_language,
|
||||
})
|
||||
total_segments += 1
|
||||
|
||||
# 當前 scene 結果寫入 .asr.tmp
|
||||
all_segments.extend(scene_segments)
|
||||
@@ -365,8 +370,18 @@ def run_asr(video_path, output_path, uuid: str = "", fps: float = None):
|
||||
try: os.rmdir(temp_dir)
|
||||
except: pass
|
||||
|
||||
# Determine status for cut_scenes branch
|
||||
if total_segments > 0:
|
||||
status = "has_transcript"
|
||||
else:
|
||||
status = "silent_audio"
|
||||
|
||||
info_language = transcript_language or "unknown"
|
||||
print(f"[ASR] Segmented transcription complete: {total_segments} segments", file=sys.stderr)
|
||||
print(f"[ASR] Segmented transcription complete: {total_segments} segments, status={status}", file=sys.stderr)
|
||||
|
||||
# Write final output with status
|
||||
with open(tmp_path, "w") as f:
|
||||
json.dump({"status": status, "language": info_language, "segments": all_segments}, f)
|
||||
else:
|
||||
# 無 CUT 資料,直接轉錄(原有流程)
|
||||
segments, info = transcribe_with_fallback(model, video_path, publisher)
|
||||
@@ -386,8 +401,15 @@ def run_asr(video_path, output_path, uuid: str = "", fps: float = None):
|
||||
if total_segments % 100 == 0:
|
||||
if publisher:
|
||||
publisher.progress("asr", total_segments, 0, f"Segment {total_segments}")
|
||||
|
||||
# Determine status for direct transcription branch
|
||||
if total_segments > 0:
|
||||
status = "has_transcript"
|
||||
else:
|
||||
status = "silent_audio"
|
||||
|
||||
with open(tmp_path, "w") as f:
|
||||
json.dump({"language": info_language, "segments": all_segments}, f)
|
||||
json.dump({"status": status, "language": info_language, "segments": all_segments}, f)
|
||||
|
||||
if publisher:
|
||||
publisher.info("asr", f"ASR_LANGUAGE:{info_language}")
|
||||
@@ -396,10 +418,10 @@ def run_asr(video_path, output_path, uuid: str = "", fps: float = None):
|
||||
os.rename(tmp_path, output_path)
|
||||
|
||||
if publisher:
|
||||
publisher.complete("asr", f"{len(results)} segments")
|
||||
publisher.complete("asr", f"{total_segments} segments")
|
||||
|
||||
sys.stderr.write(
|
||||
f"ASR: Transcription complete, {len(results)} segments written to {output_path}\n"
|
||||
f"ASR: Transcription complete, {total_segments} segments written to {output_path}\n"
|
||||
)
|
||||
sys.stderr.flush()
|
||||
sys.exit(0)
|
||||
|
||||
@@ -126,9 +126,17 @@ def _convert_result(result, output_path):
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
segment_count = len(result.get("segments", []))
|
||||
if segment_count > 0:
|
||||
status = "has_transcript"
|
||||
else:
|
||||
status = "silent_audio"
|
||||
|
||||
output_result = {
|
||||
"status": status,
|
||||
"language": result.get("language"),
|
||||
"segments": [],
|
||||
"segment_count": segment_count,
|
||||
"n_speakers": result.get("n_speakers", 0),
|
||||
"speaker_stats": result.get("speaker_stats", {}),
|
||||
}
|
||||
@@ -172,6 +180,37 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
|
||||
if publisher:
|
||||
publisher.info("asrx", "ASRX_START")
|
||||
|
||||
# Check for audio stream first
|
||||
tracks = probe_audio_tracks(video_path)
|
||||
if not tracks:
|
||||
if publisher:
|
||||
publisher.info("asrx", "No audio stream detected")
|
||||
output_result = {"status": "no_audio_track", "language": None, "segments": [], "segment_count": 0}
|
||||
_atomic_write(output_path, output_result)
|
||||
if publisher:
|
||||
publisher.complete("asrx", "0 segments (no audio)")
|
||||
print("[ASRX] No audio stream, skipping", file=sys.stderr)
|
||||
return output_result
|
||||
|
||||
# Check if ASR already determined no audio/silent - skip processing
|
||||
asr_path = output_path.replace(".asrx.json", ".asr.json")
|
||||
if os.path.exists(asr_path):
|
||||
try:
|
||||
with open(asr_path) as f:
|
||||
asr_data = json.load(f)
|
||||
asr_status = asr_data.get("status", "")
|
||||
if asr_status in ("no_audio_track", "silent_audio"):
|
||||
if publisher:
|
||||
publisher.info("asrx", f"ASR status={asr_status}, skipping ASRX processing")
|
||||
output_result = {"status": asr_status, "language": asr_data.get("language"), "segments": [], "segment_count": 0}
|
||||
_atomic_write(output_path, output_result)
|
||||
if publisher:
|
||||
publisher.complete("asrx", f"0 segments (ASR: {asr_status})")
|
||||
print(f"[ASRX] ASR status={asr_status}, skipping", file=sys.stderr)
|
||||
return output_result
|
||||
except Exception as e:
|
||||
print(f"[ASRX] Failed to read ASR output: {e}", file=sys.stderr)
|
||||
|
||||
checkpoint_path = output_path + ".stage1.json"
|
||||
|
||||
# ── Phase 2: Resume from checkpoint (Steps 4-7 only) ──
|
||||
@@ -189,7 +228,7 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
|
||||
if "error" in result:
|
||||
if publisher:
|
||||
publisher.error("asrx", result["error"])
|
||||
output_result = {"language": None, "segments": []}
|
||||
output_result = {"status": "silent_audio", "language": None, "segments": [], "segment_count": 0}
|
||||
_atomic_write(output_path, output_result)
|
||||
if publisher:
|
||||
publisher.complete("asrx", "0 segments")
|
||||
@@ -225,7 +264,7 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
|
||||
publisher.error("asrx", str(e))
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
output_result = {"language": None, "segments": []}
|
||||
output_result = {"status": "silent_audio", "language": None, "segments": [], "segment_count": 0}
|
||||
_atomic_write(output_path, output_result)
|
||||
if publisher:
|
||||
publisher.complete("asrx", "0 segments")
|
||||
@@ -289,7 +328,7 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
|
||||
if "error" in result:
|
||||
if publisher:
|
||||
publisher.error("asrx", result["error"])
|
||||
output_result = {"language": None, "segments": []}
|
||||
output_result = {"status": "silent_audio", "language": None, "segments": [], "segment_count": 0}
|
||||
_atomic_write(output_path, output_result)
|
||||
if publisher:
|
||||
publisher.complete("asrx", "0 segments")
|
||||
@@ -320,7 +359,7 @@ def process_asrx(video_path: str, output_path: str, uuid: str = "",
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
output_result = {"language": None, "segments": []}
|
||||
output_result = {"status": "silent_audio", "language": None, "segments": [], "segment_count": 0}
|
||||
_atomic_write(output_path, output_result)
|
||||
if publisher:
|
||||
publisher.complete("asrx", "0 segments")
|
||||
|
||||
@@ -216,19 +216,27 @@ class SelfASRXFixed:
|
||||
return {"error": "No speech detected", "segments": []}
|
||||
|
||||
# ── Step 2: VAD scan 每個 rough segment 細切 ──
|
||||
print("\n[Step 2] VAD scan for refined segmentation...")
|
||||
t2 = time.time()
|
||||
refined_segments = []
|
||||
for seg in rough_segments:
|
||||
s = seg["start"]
|
||||
e = seg["end"]
|
||||
sub = self._vad_scan_segment(wav, sample_rate, s, e)
|
||||
if sub:
|
||||
refined_segments.extend(sub)
|
||||
else:
|
||||
refined_segments.append((s, e))
|
||||
print(f" Refined segments: {len(refined_segments)}")
|
||||
print(f" Step 2 time: {time.time() - t2:.2f}s")
|
||||
# Skip VAD if using ASR segments (preserve all ASR segments)
|
||||
if asr_segments:
|
||||
print("\n[Step 2] Skipping VAD scan, using ASR segments directly...")
|
||||
t2 = time.time()
|
||||
refined_segments = [(seg["start"], seg["end"]) for seg in rough_segments]
|
||||
print(f" Refined segments: {len(refined_segments)}")
|
||||
print(f" Step 2 time: {time.time() - t2:.2f}s")
|
||||
else:
|
||||
print("\n[Step 2] VAD scan for refined segmentation...")
|
||||
t2 = time.time()
|
||||
refined_segments = []
|
||||
for seg in rough_segments:
|
||||
s = seg["start"]
|
||||
e = seg["end"]
|
||||
sub = self._vad_scan_segment(wav, sample_rate, s, e)
|
||||
if sub:
|
||||
refined_segments.extend(sub)
|
||||
else:
|
||||
refined_segments.append((s, e))
|
||||
print(f" Refined segments: {len(refined_segments)}")
|
||||
print(f" Step 2 time: {time.time() - t2:.2f}s")
|
||||
|
||||
if not refined_segments:
|
||||
return {"error": "No segments after VAD scan", "segments": []}
|
||||
|
||||
+124
-63
@@ -1,91 +1,152 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
CUT Processor - Scene Detection
|
||||
Uses PySceneDetect for scene detection (local)
|
||||
CUT Processor - Scene Detection & Video Quality Check
|
||||
Uses ffprobe for video analysis. Always produces at least 1 scene.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from redis_publisher import RedisPublisher
|
||||
|
||||
|
||||
def _parse_frame_rate(text) -> float:
    """Convert an ffprobe rate such as "30000/1001" or "25" to a float; 0.0 if unusable."""
    try:
        numerator, slash, denominator = str(text).partition("/")
        if slash:
            return float(numerator) / float(denominator)
        return float(numerator)
    except (ValueError, ZeroDivisionError):
        return 0.0


def _to_float(value, default: float = 0.0) -> float:
    """Best-effort float conversion for ffprobe fields that may be missing or "N/A"."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _to_int(value, default: int = 0) -> int:
    """Best-effort int conversion for ffprobe fields that may be missing or "N/A"."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def get_video_info(video_path: str) -> dict:
    """Get basic properties of the first video stream via ffprobe.

    Args:
        video_path: Path of the video file handed to ffprobe.

    Returns:
        Dict with "frame_count", "fps", "duration", "width", "height" and
        "codec". When ffprobe reports a positive ``nb_frames`` it is used
        with ``r_frame_rate``; otherwise the frame count is estimated as
        ``duration * avg_frame_rate``. If the stream carries no duration,
        the container ("format") duration is used. On any failure (ffprobe
        missing, timeout, unparsable output, no video stream) every field
        is zero/empty; this function never raises.
    """
    empty = {"frame_count": 0, "fps": 0.0, "duration": 0, "width": 0, "height": 0, "codec": ""}
    try:
        result = subprocess.run(
            ["ffprobe", "-v", "quiet", "-print_format", "json",
             "-show_format", "-show_streams", video_path],
            capture_output=True, text=True, timeout=30,
        )
        info = json.loads(result.stdout)

        video = next(
            (s for s in info.get("streams", []) if s.get("codec_type") == "video"),
            None,
        )
        if video is None:
            return empty

        # Some containers only report the duration at the format level.
        duration = _to_float(video.get("duration"))
        if duration <= 0:
            duration = _to_float((info.get("format") or {}).get("duration"))

        width = _to_int(video.get("width"))
        height = _to_int(video.get("height"))
        codec = video.get("codec_name", "")

        nb_frames = _to_int(video.get("nb_frames"))
        if nb_frames > 0:
            return {
                "frame_count": nb_frames,
                "fps": _parse_frame_rate(video.get("r_frame_rate", "0/1")),
                "duration": duration,
                "width": width,
                "height": height,
                "codec": codec,
            }

        # No usable frame count: estimate it from duration and average rate.
        avg_fps = _parse_frame_rate(video.get("avg_frame_rate", "0/1"))
        if duration > 0 and avg_fps > 0:
            return {
                "frame_count": int(duration * avg_fps),
                "fps": avg_fps,
                "duration": duration,
                "width": width,
                "height": height,
                "codec": codec,
            }
        return {
            "frame_count": 0, "fps": 0.0, "duration": duration,
            "width": 0, "height": 0, "codec": "",
        }
    except Exception:
        # Deliberate best-effort boundary: callers expect a dict, never an error.
        return empty
|
||||
|
||||
|
||||
def _parse_pts_times(text: str) -> list:
    """Collect the distinct ``pts_time=`` / ``pts_time:`` values in *text*, sorted ascending."""
    times = set()
    for line in text.splitlines():
        for prefix in ("pts_time=", "pts_time:"):
            if prefix not in line:
                continue
            fields = line.partition(prefix)[2].split()
            if not fields:
                continue  # prefix with nothing after it
            try:
                value = float(fields[0])
            except ValueError:
                continue  # e.g. "N/A"
            # Also rejects NaN (comparisons are False) and infinities.
            if 0 <= value < float("inf"):
                times.add(value)
    return sorted(times)


def _scenes_from_cuts(cut_times, fps: float, duration: float) -> list:
    """Turn ascending cut timestamps into consecutively numbered scene dicts."""
    scenes = []
    prev_time = 0.0
    for t in cut_times:
        start_frame = round(prev_time * fps)
        end_frame = round(t * fps)
        # A cut that lands on the same frame as the previous one adds no scene.
        if end_frame > start_frame:
            scenes.append({
                # Numbered by position so skipped cuts leave no gaps and the
                # trailing scene below can never reuse a number.
                "scene_number": len(scenes) + 1,
                "start_frame": start_frame,
                "end_frame": end_frame - 1,
                "start_time": prev_time,
                "end_time": t - (1.0 / fps),
            })
        prev_time = t

    # Whatever remains after the last cut forms the final scene.
    last_frame = round(duration * fps)
    prev_frame = round(prev_time * fps)
    if last_frame > prev_frame:
        scenes.append({
            "scene_number": len(scenes) + 1,
            "start_frame": prev_frame,
            "end_frame": last_frame - 1,
            "start_time": prev_time,
            "end_time": duration,
        })
    return scenes


def detect_scenes_ffmpeg(video_path: str, fps: float, duration: float) -> list:
    """Detect scene changes using the ffmpeg scene filter (run through ffprobe).

    Args:
        video_path: Path of the video; it is embedded into a lavfi
            ``movie=`` filter string.
        fps: Frame rate used to convert cut times to frame numbers.
        duration: Video duration in seconds; closes the final scene.

    Returns:
        List of scene dicts ("scene_number", "start_frame", "end_frame",
        "start_time", "end_time") with inclusive frame ranges, numbered
        consecutively from 1. Returns ``[]`` when fps is not positive or
        when anything fails; this function never raises.
    """
    # Without a positive frame rate no frame range can be formed, so skip
    # the (potentially minutes-long) ffprobe run.
    if not fps or fps <= 0:
        return []
    try:
        # NOTE(review): video_path is interpolated unescaped into the lavfi
        # graph; paths containing characters such as ':' ',' or quotes are
        # likely to break the filter string — confirm and escape if needed.
        result = subprocess.run(
            ["ffprobe", "-v", "quiet", "-show_entries", "frame=pts_time",
             "-of", "default=nk=0",
             "-f", "lavfi",
             f"movie={video_path},select='gt(scene\\,0.3)',showinfo",
             "-show_frames"],
            capture_output=True, text=True, timeout=300,
        )
        cut_times = _parse_pts_times(result.stderr + "\n" + result.stdout)
        return _scenes_from_cuts(cut_times, fps, duration)
    except Exception:
        # Deliberate best-effort boundary: callers treat [] as "no cuts found".
        return []
|
||||
|
||||
|
||||
def process_cut(video_path: str, output_path: str, uuid: str = ""):
|
||||
"""Process video for scene detection"""
|
||||
"""Process video for scene detection and quality verification"""
|
||||
|
||||
publisher = RedisPublisher(uuid) if uuid else None
|
||||
if publisher:
|
||||
publisher.info("cut", "CUT_START")
|
||||
|
||||
try:
|
||||
from scenedetect import VideoManager, SceneManager
|
||||
from scenedetect.detectors import ContentDetector
|
||||
except ImportError:
|
||||
if publisher:
|
||||
publisher.error("cut", "scenedetect not installed")
|
||||
result = {"frame_count": 0, "fps": 0.0, "scenes": []}
|
||||
if publisher:
|
||||
publisher.complete("cut", "0 scenes")
|
||||
with open(output_path, "w") as f:
|
||||
json.dump(result, f, indent=2)
|
||||
return result
|
||||
vinfo = get_video_info(video_path)
|
||||
|
||||
if publisher:
|
||||
publisher.info("cut", "CUT_LOADING_VIDEO")
|
||||
publisher.info("cut", f"fps={vinfo['fps']}, frames={vinfo['frame_count']}, codec={vinfo['codec']}")
|
||||
|
||||
# Create video manager and scene manager
|
||||
video_manager = VideoManager([video_path])
|
||||
scene_manager = SceneManager()
|
||||
total_frames = vinfo["frame_count"]
|
||||
fps = vinfo["fps"]
|
||||
duration = vinfo["duration"]
|
||||
|
||||
# Add content detector (detects scene cuts based on frame differences)
|
||||
# threshold: sensitivity (lower = more sensitive, default 30)
|
||||
# min_scene_len: minimum frames per scene (default 15)
|
||||
scene_manager.add_detector(ContentDetector(threshold=30.0, min_scene_len=15))
|
||||
# Try ffmpeg scene detection
|
||||
scenes = detect_scenes_ffmpeg(video_path, fps, duration)
|
||||
|
||||
# Set downscale factor for faster processing
|
||||
video_manager.set_downscale_factor()
|
||||
|
||||
if publisher:
|
||||
publisher.info("cut", "CUT_DETECTING")
|
||||
|
||||
# Start video manager
|
||||
video_manager.start()
|
||||
|
||||
# Detect scenes
|
||||
scene_manager.detect_scenes(frame_source=video_manager)
|
||||
|
||||
# Get scene list
|
||||
scene_list = scene_manager.get_scene_list()
|
||||
|
||||
# Get frame rate
|
||||
fps = video_manager.get_framerate()
|
||||
|
||||
if publisher:
|
||||
publisher.info("cut", f"fps={fps}")
|
||||
|
||||
# Get total frame count
|
||||
frame_count = 0
|
||||
if scene_list:
|
||||
frame_count = scene_list[-1][1].get_frames()
|
||||
|
||||
# Convert scenes to result format
|
||||
scenes = []
|
||||
for i, (start, end) in enumerate(scene_list):
|
||||
scene = {
|
||||
"scene_number": i + 1,
|
||||
"start_frame": start.get_frames(),
|
||||
"end_frame": end.get_frames() - 1, # end is exclusive
|
||||
"start_time": start.get_seconds(),
|
||||
"end_time": end.get_seconds() - (1.0 / fps) if fps > 0 else 0,
|
||||
}
|
||||
scenes.append(scene)
|
||||
# Always ensure at least 1 scene
|
||||
if not scenes and total_frames > 0:
|
||||
scenes = [{
|
||||
"scene_number": 1,
|
||||
"start_frame": 0,
|
||||
"end_frame": total_frames - 1,
|
||||
"start_time": 0.0,
|
||||
"end_time": duration,
|
||||
}]
|
||||
if publisher:
|
||||
publisher.progress("cut", i + 1, len(scene_list), f"Scene {i + 1}")
|
||||
publisher.info("cut", "No scene changes detected, using whole video as single scene")
|
||||
|
||||
result = {"frame_count": frame_count, "fps": fps, "scenes": scenes}
|
||||
result = {
|
||||
"frame_count": total_frames,
|
||||
"fps": fps,
|
||||
"scenes": scenes,
|
||||
}
|
||||
|
||||
with open(output_path, "w") as f:
|
||||
json.dump(result, f, indent=2)
|
||||
|
||||
@@ -14,13 +14,9 @@ from sklearn.cluster import AgglomerativeClustering
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
try:
|
||||
from deepface import DeepFace
|
||||
|
||||
HAS_DEEPFACE = True
|
||||
except ImportError:
|
||||
print("❌ DeepFace not found. Run: pip install deepface")
|
||||
sys.exit(1)
|
||||
# Use FaceNet embeddings from face.json instead of DeepFace
|
||||
HAS_DEEPFACE = False
|
||||
print("[FACE_CLUSTER] Using FaceNet embeddings from face.json (DeepFace not required)")
|
||||
|
||||
# 設定
|
||||
UUID = os.getenv("UUID", "quick_preview")
|
||||
@@ -104,53 +100,69 @@ def main():
|
||||
print("❌ No frames in JSON.")
|
||||
return
|
||||
|
||||
cap = cv2.VideoCapture(VIDEO_PATH)
|
||||
# Get embeddings from Qdrant
|
||||
print(f"[FACE_CLUSTER] Loading embeddings from Qdrant for {UUID}...")
|
||||
try:
|
||||
import requests
|
||||
qdrant_url = "http://localhost:6333"
|
||||
collection = "_faces"
|
||||
|
||||
# Query all embeddings for this file_uuid
|
||||
response = requests.post(
|
||||
f"{qdrant_url}/collections/{collection}/points/scroll",
|
||||
json={
|
||||
"filter": {
|
||||
"must": [
|
||||
{"key": "file_uuid", "match": {"value": UUID}}
|
||||
]
|
||||
},
|
||||
"limit": 10000,
|
||||
"with_vector": True
|
||||
}
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
result = response.json()
|
||||
points = result.get("result", {}).get("points", [])
|
||||
print(f"[FACE_CLUSTER] Loaded {len(points)} embeddings from Qdrant")
|
||||
|
||||
# Build face_id -> embedding map
|
||||
embedding_map = {}
|
||||
for point in points:
|
||||
face_id = point.get("payload", {}).get("face_id")
|
||||
vector = point.get("vector")
|
||||
if face_id and vector:
|
||||
embedding_map[face_id] = vector
|
||||
else:
|
||||
print(f"[FACE_CLUSTER] Qdrant query failed: {response.status_code}")
|
||||
embedding_map = {}
|
||||
except Exception as e:
|
||||
print(f"[FACE_CLUSTER] Failed to load embeddings from Qdrant: {e}")
|
||||
embedding_map = {}
|
||||
|
||||
# Use embeddings from Qdrant or face.json
|
||||
embeddings = []
|
||||
face_refs = []
|
||||
|
||||
print(f"🔍 Extracting face embeddings from {UUID}...")
|
||||
print(f"🔍 Collecting face embeddings for {UUID}...")
|
||||
|
||||
for frame_idx, frame_obj in enumerate(frames_list):
|
||||
ts = frame_obj.get("timestamp")
|
||||
faces = frame_obj.get("faces", [])
|
||||
if not faces:
|
||||
continue
|
||||
|
||||
if ts is not None:
|
||||
cap.set(cv2.CAP_PROP_POS_MSEC, ts * 1000)
|
||||
|
||||
ret, frame = cap.read()
|
||||
if not ret:
|
||||
continue
|
||||
|
||||
for face_idx, face in enumerate(faces):
|
||||
x, y, w, h = face["x"], face["y"], face["width"], face["height"]
|
||||
margin = 5
|
||||
crop = frame[
|
||||
max(0, y - margin) : y + h + margin, max(0, x - margin) : x + w + margin
|
||||
]
|
||||
|
||||
if crop is None or crop.size == 0:
|
||||
continue
|
||||
|
||||
try:
|
||||
res = DeepFace.represent(
|
||||
img_path=crop, model_name="ArcFace", enforce_detection=False
|
||||
)
|
||||
if res and "embedding" in res[0]:
|
||||
embeddings.append(res[0]["embedding"])
|
||||
face_refs.append({"frame_idx": frame_idx, "face_idx": face_idx})
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
cap.release()
|
||||
face_id = face.get("face_id")
|
||||
if face_id and face_id in embedding_map:
|
||||
embeddings.append(embedding_map[face_id])
|
||||
face_refs.append({"frame_idx": frame_idx, "face_idx": face_idx, "face_id": face_id})
|
||||
|
||||
if not embeddings:
|
||||
print("❌ No embeddings extracted.")
|
||||
print("❌ No embeddings found in Qdrant.")
|
||||
return
|
||||
|
||||
embeddings = np.array(embeddings)
|
||||
print(f"✅ Extracted {len(embeddings)} face embeddings.")
|
||||
print(f"✅ Collected {len(embeddings)} face embeddings from Qdrant.")
|
||||
|
||||
# 2. 聚類
|
||||
print(f"🧠 Clustering {len(embeddings)} faces...")
|
||||
|
||||
@@ -35,7 +35,7 @@ from redis_publisher import RedisPublisher
|
||||
from qdrant_faces import push_face_embeddings_batch
|
||||
|
||||
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
SWIFT_BIN = os.path.join(SCRIPT_DIR, "swift_processors", ".build", "debug", "swift_face_pose")
|
||||
SWIFT_BIN = os.path.join(SCRIPT_DIR, "swift_processors", ".build", "release", "swift_face_pose")
|
||||
FACENET_PATH = os.path.join(SCRIPT_DIR, "..", "models", "facenet512.mlpackage")
|
||||
|
||||
# Pose angle classification from roll/yaw
|
||||
@@ -84,7 +84,12 @@ class FaceProcessorVision:
|
||||
self.total_frames = int(self.video.get(cv2.CAP_PROP_FRAME_COUNT))
|
||||
self.width = int(self.video.get(cv2.CAP_PROP_FRAME_WIDTH))
|
||||
self.height = int(self.video.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
||||
|
||||
# Calculate 8Hz sample interval based on FPS
|
||||
self.sample_interval = max(1, round(self.fps / 8))
|
||||
|
||||
print(f"[FACE_V2] Video: {self.width}x{self.height}, {self.fps:.1f}fps, {self.total_frames}f")
|
||||
print(f"[FACE_V2] 8Hz sample interval: {self.fps:.1f}/8 = {self.sample_interval}")
|
||||
|
||||
def extract_face_embedding(self, face_img: np.ndarray) -> Optional[list]:
|
||||
"""Run CoreML FaceNet on cropped face"""
|
||||
@@ -126,11 +131,15 @@ class FaceProcessorVision:
|
||||
output_basename = os.path.basename(self.output_path)
|
||||
pose_basename = output_basename.replace("face", "pose")
|
||||
swift_pose_out = os.path.join(output_dir, pose_basename)
|
||||
# Appearance output: same directory, but replace "face" with "appearance" in filename
|
||||
appearance_basename = output_basename.replace("face", "appearance")
|
||||
swift_appearance_out = os.path.join(output_dir, appearance_basename)
|
||||
cmd = [
|
||||
SWIFT_BIN,
|
||||
self.video_path,
|
||||
swift_face_out,
|
||||
swift_pose_out,
|
||||
swift_appearance_out,
|
||||
"--sample-interval", str(self.sample_interval),
|
||||
]
|
||||
if self.uuid:
|
||||
@@ -286,17 +295,28 @@ class FaceProcessorVision:
|
||||
|
||||
# Convert dict frames to list for Rust FaceResult format
|
||||
frames_list = []
|
||||
total_faces = 0
|
||||
for fnum_str, fdata in sorted(face_data["frames"].items(), key=lambda x: int(x[0])):
|
||||
faces = fdata["faces"]
|
||||
total_faces += len(faces)
|
||||
frames_list.append({
|
||||
"frame": int(fnum_str),
|
||||
"timestamp": fdata["time_seconds"],
|
||||
"faces": fdata["faces"],
|
||||
"faces": faces,
|
||||
})
|
||||
|
||||
# Determine status based on face count
|
||||
if total_faces > 0:
|
||||
status = "has_faces"
|
||||
else:
|
||||
status = "no_faces"
|
||||
|
||||
output = {
|
||||
"status": status,
|
||||
"frame_count": len(frames_list),
|
||||
"fps": self.fps,
|
||||
"frames": frames_list,
|
||||
"total_faces": total_faces,
|
||||
}
|
||||
|
||||
with open(self.output_path, "w") as f:
|
||||
@@ -339,6 +359,9 @@ def main():
|
||||
args.uuid, args.sample_interval, publisher
|
||||
)
|
||||
|
||||
# Open video to get FPS and calculate sample_interval
|
||||
processor.open_video()
|
||||
|
||||
# Step 1: Vision detection (bbox + pose via ANE)
|
||||
try:
|
||||
detection = processor.process_with_swift()
|
||||
|
||||
@@ -1,334 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Fast Face Clustering Processor (Linear Scan)
|
||||
職責:針對長片優化,使用線性讀取取代隨機跳轉,大幅提升速度。
|
||||
"""
|
||||
|
||||
import cv2
|
||||
import json
|
||||
import numpy as np
|
||||
import os
|
||||
import sys
|
||||
import psycopg2
|
||||
from collections import defaultdict
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
try:
|
||||
from deepface import DeepFace
|
||||
|
||||
HAS_DEEPFACE = True
|
||||
except ImportError:
|
||||
print("❌ DeepFace not found.")
|
||||
sys.exit(1)
|
||||
|
||||
from sklearn.cluster import AgglomerativeClustering
|
||||
|
||||
# 設定
|
||||
UUID = os.getenv("UUID", "384b0ff44aaaa1f1")
|
||||
OUTPUT_DIR = os.getenv("MOMENTRY_OUTPUT_DIR", "./output")
|
||||
VIDEO_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.mp4")
|
||||
FACE_JSON_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.face.json")
|
||||
OUTPUT_JSON_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.face_clustered.json")
|
||||
ASRX_JSON_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.asrx.json")
|
||||
DB_URL = os.getenv("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")
|
||||
|
||||
|
||||
def _frame_index(frame_obj):
    """Return the frame number of a face.json frame entry, or None if absent.

    Checks "frame" first, then "frame_number". Uses explicit ``is not None``
    tests so that frame 0 (a falsy value) is not mistaken for "missing".
    """
    for key in ("frame", "frame_number"):
        value = frame_obj.get(key)
        if value is not None:
            return int(value)
    return None


def _face_embedding(frame, face, margin=5):
    """Crop *face* (plus a small margin) out of *frame* and embed it with ArcFace.

    Returns the embedding list, or None when the crop is empty or DeepFace
    returns no embedding. Exceptions from DeepFace or from missing bbox keys
    propagate to the caller, which counts them.
    """
    x, y, w, h = face["x"], face["y"], face["width"], face["height"]
    crop = frame[
        max(0, y - margin) : y + h + margin,
        max(0, x - margin) : x + w + margin,
    ]
    if crop is None or crop.size == 0:
        return None

    res = DeepFace.represent(
        img_path=crop, model_name="ArcFace", enforce_detection=False
    )
    if res and "embedding" in res[0]:
        return res[0]["embedding"]
    return None


def main():
    """Cluster the faces of one video into persons using a single linear scan.

    Steps:
      1. Load face.json and index its faces by frame number.
      2. Read the video sequentially (no seeking) and extract an ArcFace
         embedding for every indexed face.
      3. Cluster a random sample of the embeddings, then assign every
         embedding to the nearest cluster centroid.
      4. Write ``person_id`` onto each face and save the clustered JSON.
      5. Call auto_bind_speakers() to link speakers to persons.

    Reads the module-level FACE_JSON_PATH / VIDEO_PATH / OUTPUT_JSON_PATH
    settings. Returns None; all failures are reported with print() and an
    early return.
    """
    if not os.path.exists(FACE_JSON_PATH):
        print(f"❌ Face JSON not found: {FACE_JSON_PATH}")
        return

    print(f"⚡ 開始執行快速面孔聚類 (Linear Scan Mode) for {UUID}...")

    # 1. Load the detections and index them by frame number.
    with open(FACE_JSON_PATH) as f:
        face_data = json.load(f)

    frames_list = face_data.get("frames", [])
    if not frames_list:
        print("❌ No frames in JSON.")
        return

    faces_map = defaultdict(list)
    print(f"📂 Indexing {len(frames_list)} frames with faces...")
    for frame_obj in frames_list:
        idx = _frame_index(frame_obj)
        if idx is not None:
            faces_map[idx].extend(frame_obj.get("faces", []))

    # Without frame numbers a linear scan cannot line up faces with frames.
    if not faces_map:
        print("⚠️ No frame numbers found in JSON. Falling back to timestamp seeking.")
        print(f" Keys: {frames_list[0].keys()}")
        return

    total_faces = sum(len(faces) for faces in faces_map.values())
    print(f"✅ Indexed {len(faces_map)} frames, containing {total_faces} faces.")
    print("🚀 Starting Linear Video Scan...")

    # 2. Linear scan. Try the .mov sibling if the .mp4 cannot be opened.
    video_path = VIDEO_PATH
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        alt_path = video_path.replace(".mp4", ".mov")
        if os.path.exists(alt_path):
            video_path = alt_path
            cap = cv2.VideoCapture(video_path)
        else:
            print("❌ Video file not found.")
            return

    embeddings = []
    # Face dicts parallel to `embeddings`. They are the same objects that live
    # inside face_data, so tagging them later updates the output directly.
    embedded_faces = []
    failed_faces = 0
    processed_frames = 0
    current_frame = 0

    try:
        total_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            if current_frame in faces_map:
                for face in faces_map[current_frame]:
                    try:
                        embedding = _face_embedding(frame, face)
                    except Exception as exc:
                        # Best effort: one bad face must not abort the scan,
                        # but the failure is no longer silent.
                        failed_faces += 1
                        if failed_faces == 1:
                            print(f" ⚠️ Embedding failed at frame {current_frame}: {exc}")
                        continue
                    if embedding is not None:
                        embeddings.append(embedding)
                        embedded_faces.append(face)

                processed_frames += 1
                if processed_frames % 500 == 0:
                    # Some containers report a frame count of 0.
                    pct = (
                        (current_frame / total_video_frames) * 100
                        if total_video_frames > 0
                        else 0.0
                    )
                    print(
                        f" 📊 Progress: Frame {current_frame}/{total_video_frames} ({pct:.1f}%) | Extracted: {len(embeddings)} embeddings"
                    )

            current_frame += 1
    finally:
        cap.release()

    if failed_faces:
        print(f" ⚠️ {failed_faces} faces skipped due to embedding errors.")

    if not embeddings:
        print("❌ No embeddings extracted.")
        return

    embeddings = np.array(embeddings)
    print(f"✅ Total Embeddings Extracted: {len(embeddings)}")

    # 3. Clustering: sample -> cluster -> assign, because agglomerative
    # clustering over tens of thousands of points is too slow.
    print(f"🧠 Clustering {len(embeddings)} faces...")
    print(" 🚀 Using Sampling Strategy for speed...")
    sample_size = 5000
    n_faces = len(embeddings)

    if n_faces > sample_size:
        sample_idx = np.random.choice(n_faces, sample_size, replace=False)
        sample_embeddings = embeddings[sample_idx]
    else:
        sample_embeddings = embeddings

    if len(sample_embeddings) < 2:
        # AgglomerativeClustering needs at least two samples.
        sample_labels = np.zeros(len(sample_embeddings), dtype=int)
    else:
        clustering = AgglomerativeClustering(
            n_clusters=None, distance_threshold=0.45, metric="cosine", linkage="average"
        )
        sample_labels = clustering.fit_predict(sample_embeddings)

    # Centroids are stored in sorted label order so that a centroid's row
    # index is a stable person number.
    unique_labels = sorted(set(sample_labels.tolist()))
    centroids = np.array(
        [np.mean(sample_embeddings[sample_labels == label], axis=0) for label in unique_labels]
    )

    print(" 🏃 Assigning remaining faces to clusters...")
    from sklearn.metrics.pairwise import cosine_distances

    # Nearest-centroid assignment in batches to bound memory use.
    all_labels = np.zeros(n_faces, dtype=int)
    batch_size = 10000
    for i in range(0, n_faces, batch_size):
        batch = embeddings[i : i + batch_size]
        dists = cosine_distances(batch, centroids)
        all_labels[i : i + batch_size] = np.argmin(dists, axis=1)

    print(f" 👥 Detected {len(unique_labels)} unique persons.")

    # 4./5. Tag each embedded face with the person of its nearest centroid.
    for face, centroid_idx in zip(embedded_faces, all_labels):
        face["person_id"] = f"Person_{int(centroid_idx)}"
    count = len(embedded_faces)

    print(f" ✅ Tagged {count} faces with Person ID.")

    with open(OUTPUT_JSON_PATH, "w", encoding="utf-8") as f:
        json.dump(face_data, f, indent=2, ensure_ascii=False)
    print(f"✅ Saved clustered data to {OUTPUT_JSON_PATH}")

    # 6. Bind speakers to the persons just created.
    auto_bind_speakers()
|
||||
|
||||
|
||||
def auto_bind_speakers():
    """Bind each ASRX speaker to the clustered person seen most while they talk.

    For every ASRX segment, the person appearing in the most face detections
    inside the segment's time range gets one vote for that segment's speaker.
    The top-voted person per speaker is looked up (or created) in ``talents``
    and upserted into ``identity_bindings``.

    Reads OUTPUT_JSON_PATH, ASRX_JSON_PATH and DB_URL from module scope.
    Returns None. Database errors are printed, not raised, so a failed
    binding does not fail the clustering run.
    """
    from collections import Counter

    if not os.path.exists(OUTPUT_JSON_PATH) or not os.path.exists(ASRX_JSON_PATH):
        print("⚠️ Missing data for speaker binding.")
        return

    with open(OUTPUT_JSON_PATH) as f:
        face_clustered = json.load(f)
    with open(ASRX_JSON_PATH) as f:
        asrx_data = json.load(f)

    print("🔗 Auto-binding Speakers to Persons...")

    # (timestamp, person_id) for every face that was assigned a person.
    face_spans = []
    for frame_obj in face_clustered.get("frames", []):
        ts = frame_obj.get("timestamp")
        if ts is None:
            continue
        for face in frame_obj.get("faces", []):
            person_id = face.get("person_id")
            if person_id:
                face_spans.append((ts, person_id))

    speaker_person_counts = {}

    for seg in asrx_data.get("segments", []):
        speaker = seg.get("speaker_id")
        if not speaker:
            continue

        # Accept both key spellings: older files use start/end, current ASRX
        # JSON uses start_time/end_time.
        start = seg.get("start")
        if start is None:
            start = seg.get("start_time")
        end = seg.get("end")
        if end is None:
            end = seg.get("end_time")
        if start is None or end is None:
            # Without both bounds the range test below would raise TypeError.
            continue

        person_counts = Counter(pid for ts, pid in face_spans if start <= ts <= end)
        if not person_counts:
            continue

        # One vote per segment for the person most visible during it.
        best_person = max(person_counts, key=person_counts.get)
        speaker_person_counts.setdefault(speaker, Counter())[best_person] += 1

    if not speaker_person_counts:
        # Nothing to bind, so no database connection is needed.
        return

    conn = None
    try:
        conn = psycopg2.connect(DB_URL)
        cur = conn.cursor()

        for speaker, persons in speaker_person_counts.items():
            if not persons:
                continue
            best_person = max(persons, key=persons.get)
            print(
                f" 🎤 {speaker} is likely {best_person} ({persons[best_person]} votes)"
            )

            cur.execute("SELECT id FROM talents WHERE real_name = %s", (best_person,))
            row = cur.fetchone()

            if row:
                talent_id = row[0]
            else:
                cur.execute(
                    "INSERT INTO talents (real_name) VALUES (%s) RETURNING id",
                    (best_person,),
                )
                talent_id = cur.fetchone()[0]
                print(f" ✨ Created Talent #{talent_id} ({best_person})")

            cur.execute(
                """
                INSERT INTO identity_bindings (talent_id, binding_type, binding_value, source, confidence)
                VALUES (%s, 'speaker', %s, 'auto_cluster', 0.8)
                ON CONFLICT (binding_type, binding_value) DO UPDATE SET talent_id = EXCLUDED.talent_id
                """,
                (talent_id, speaker),
            )
            print(f" ✅ Bound {speaker} -> {best_person}")

        conn.commit()
        cur.close()
    except Exception as e:
        print(f" ❌ DB Error: {e}")
    finally:
        # Close even on failure. Closing without commit discards the partial
        # transaction instead of leaking the connection.
        if conn is not None:
            conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1 @@
|
||||
face_clustering_processor.py
|
||||
+175
-1
@@ -33,7 +33,54 @@ def process_pose(
|
||||
uuid: str = "",
|
||||
sample_interval: int = 3, # Changed from 30 to match Face
|
||||
publisher: RedisPublisher = None,
|
||||
target_frames: list = None,
|
||||
) -> dict:
|
||||
# Check if pose.json or pose.json.tmp already exists (from swift_face_pose)
|
||||
# executor.rs renames output to .json.tmp before running Python script
|
||||
tmp_path = output_path.replace('.json', '.json.tmp')
|
||||
|
||||
source_path = None
|
||||
if os.path.exists(output_path):
|
||||
source_path = output_path
|
||||
print(f"[Pose] Output exists from swift_face_pose: {output_path}", file=sys.stderr)
|
||||
elif os.path.exists(tmp_path):
|
||||
source_path = tmp_path
|
||||
print(f"[Pose] Temp output exists from swift_face_pose: {tmp_path}", file=sys.stderr)
|
||||
|
||||
if source_path:
|
||||
with open(source_path) as f:
|
||||
data = json.load(f)
|
||||
|
||||
detected_frames = len(data.get('frames', []))
|
||||
print(f"[Pose] Loaded {detected_frames} detected frames", file=sys.stderr)
|
||||
|
||||
# When target_frames is provided (8Hz sampling), skip interpolation
|
||||
# Swift already outputs at sample_interval=3, matching 8Hz for 24fps
|
||||
if target_frames is not None:
|
||||
print(f"[Pose] 8Hz mode: returning {detected_frames} frames without interpolation", file=sys.stderr)
|
||||
if publisher:
|
||||
publisher.progress("pose", 100, 100, f"{detected_frames} frames (8Hz, no interpolation)")
|
||||
return data
|
||||
|
||||
# Interpolate keypoints for all frames
|
||||
interpolated_data = interpolate_pose(data, video_path)
|
||||
|
||||
# Write interpolated output
|
||||
with open(output_path, 'w') as f:
|
||||
json.dump(interpolated_data, f)
|
||||
|
||||
# Delete .json.tmp file so executor.rs won't restore it
|
||||
if os.path.exists(tmp_path):
|
||||
os.remove(tmp_path)
|
||||
print(f"[Pose] Deleted temp file: {tmp_path}", file=sys.stderr)
|
||||
|
||||
total_frames = len(interpolated_data.get('frames', []))
|
||||
print(f"[Pose] Interpolated to {total_frames} frames", file=sys.stderr)
|
||||
|
||||
if publisher:
|
||||
publisher.progress("pose", 100, 100, f"Interpolated {total_frames} frames")
|
||||
return interpolated_data
|
||||
|
||||
swift_bin = SWIFT_POSE_PATH
|
||||
if not os.path.exists(swift_bin):
|
||||
swift_bin = SWIFT_POSE_ALT
|
||||
@@ -81,6 +128,126 @@ def process_pose(
|
||||
return json.load(f)
|
||||
|
||||
|
||||
def _interpolate_person(before, after, frame_num):
    """Return ``(keypoints, bbox)`` for one person at *frame_num*.

    *before* and *after* are ``(frame_number, person_dict)`` tuples for the
    nearest detections at or before, and at or after, *frame_num*; either may
    be None. When both exist on different frames, keypoints and bbox are
    linearly interpolated. Otherwise the single available detection is
    reused unchanged.
    """
    if before and after and before[0] != after[0]:
        t0, t1 = before[0], after[0]
        t = (frame_num - t0) / (t1 - t0)

        kp_before = before[1].get('keypoints', [])
        kp_after = after[1].get('keypoints', [])
        bbox_before = before[1].get('bbox', {})
        bbox_after = after[1].get('bbox', {})

        keypoints = []
        for i in range(max(len(kp_before), len(kp_after))):
            # If one side has fewer keypoints, reuse the other side's point so
            # the interpolation degenerates to a constant.
            kp0 = kp_before[i] if i < len(kp_before) else kp_after[i]
            kp1 = kp_after[i] if i < len(kp_after) else kp_before[i]
            # Confidence is optional on keypoints; default to 1.0.
            c0 = kp0.get('confidence', 1.0)
            c1 = kp1.get('confidence', 1.0)
            keypoints.append({
                'name': kp0['name'],
                'x': kp0['x'] + t * (kp1['x'] - kp0['x']),
                'y': kp0['y'] + t * (kp1['y'] - kp0['y']),
                'confidence': c0 + t * (c1 - c0),
            })

        bbox = None
        if bbox_before and bbox_after:
            bbox = {
                key: int(bbox_before[key] + t * (bbox_after[key] - bbox_before[key]))
                for key in ('x', 'y', 'width', 'height')
            }
        return keypoints, bbox

    # Exact hit, or only one neighbour exists: reuse that detection.
    source = before if before else after
    return source[1].get('keypoints', []), source[1].get('bbox', {})


def interpolate_pose(detected_data: dict, video_path: str) -> dict:
    """Interpolate keypoints for all frames between detected frames.

    Args:
        detected_data: pose result dict with a ``frames`` list (each entry has
            ``frame`` and ``persons``) and an optional ``fps``.
        video_path: source video; only its frame count is read.

    Returns:
        A dict with ``frame_count``, ``fps`` and ``frames`` covering every
        video frame in which at least one person has a valid bbox. If
        *detected_data* has no frames it is returned unchanged.

    Persons are matched across frames purely by their position in each
    frame's ``persons`` list.
    """
    from bisect import bisect_left, bisect_right
    from operator import itemgetter

    detected_frames = detected_data.get('frames', [])
    if not detected_frames:
        # Nothing to interpolate, so the video is not opened at all.
        return detected_data

    import cv2

    cap = cv2.VideoCapture(video_path)
    try:
        total_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        cap.release()

    if total_video_frames <= 0:
        # Unreadable video or missing metadata: span the detected range
        # instead of silently discarding every detection.
        total_video_frames = max(f['frame'] for f in detected_frames) + 1

    fps = detected_data.get('fps', 30.0)
    if not fps or fps <= 0:
        # Guard the timestamp division below against 0 or None.
        fps = 30.0

    detected_count = len({f['frame'] for f in detected_frames})
    print(f"[Pose] Interpolating from {detected_count} detected frames to {total_video_frames} total frames", file=sys.stderr)

    # Group detections per person slot (index within each frame's list).
    tracks = {}
    for f in detected_frames:
        for slot, person in enumerate(f.get('persons', [])):
            tracks.setdefault(slot, []).append((f['frame'], person))

    # Sort each track once so neighbours can be found by binary search
    # instead of rescanning the whole track for every output frame.
    prepared = []
    for samples in tracks.values():
        samples.sort(key=itemgetter(0))
        prepared.append(([fn for fn, _ in samples], samples))

    interpolated_frames = []
    for frame_num in range(total_video_frames):
        persons_in_frame = []

        for frame_nums, samples in prepared:
            hi = bisect_right(frame_nums, frame_num)
            lo = bisect_left(frame_nums, frame_num)
            before = samples[hi - 1] if hi > 0 else None        # last detection <= frame_num
            after = samples[lo] if lo < len(samples) else None  # first detection >= frame_num

            keypoints, bbox = _interpolate_person(before, after, frame_num)

            # Drop persons without a usable bounding box.
            if bbox and bbox.get('width', 0) > 0 and bbox.get('height', 0) > 0:
                persons_in_frame.append({
                    'keypoints': keypoints,
                    'bbox': bbox
                })

        if persons_in_frame:
            interpolated_frames.append({
                'frame': frame_num,
                'timestamp': frame_num / fps,
                'persons': persons_in_frame
            })

    return {
        'frame_count': len(interpolated_frames),
        'fps': fps,
        'frames': interpolated_frames
    }
|
||||
|
||||
|
||||
def _fallback(video_path, output_path, uuid, sample_interval):
|
||||
"""Fallback to YOLOv8 Pose"""
|
||||
from ultralytics import YOLO
|
||||
@@ -135,14 +302,21 @@ if __name__ == "__main__":
|
||||
parser.add_argument("output_path")
|
||||
parser.add_argument("--uuid", "-u", default="")
|
||||
parser.add_argument("--sample-interval", type=int, default=3) # Changed from 30 to match Face
|
||||
parser.add_argument("--frames", type=str, default=None,
|
||||
help="Comma-separated frame numbers for 8Hz sampling")
|
||||
args = parser.parse_args()
|
||||
|
||||
target_frames = None
|
||||
if args.frames:
|
||||
target_frames = [int(f) for f in args.frames.split(",") if f.strip()]
|
||||
print(f"[Pose] 8Hz target frames: {len(target_frames)} frames", file=sys.stderr)
|
||||
|
||||
publisher = RedisPublisher(args.uuid) if args.uuid else None
|
||||
if publisher:
|
||||
publisher.info("pose", "POSE_START")
|
||||
|
||||
result = process_pose(args.video_path, args.output_path, args.uuid,
|
||||
args.sample_interval, publisher)
|
||||
args.sample_interval, publisher, target_frames)
|
||||
with open(args.output_path, "w") as f:
|
||||
json.dump(result, f, indent=2)
|
||||
print(f"Pose: {len(result.get('frames', []))} frames with poses")
|
||||
|
||||
@@ -21,8 +21,6 @@ import json
|
||||
import argparse
|
||||
from collections import defaultdict
|
||||
import numpy as np
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
from datetime import datetime
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
@@ -30,13 +28,8 @@ sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "uti
|
||||
from qdrant_faces import update_trace_ids
|
||||
|
||||
# Config
|
||||
DB_URL = os.environ.get("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")
|
||||
SCHEMA = os.environ.get("MOMENTRY_DB_SCHEMA", "dev")
|
||||
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output_dev")
|
||||
|
||||
|
||||
def get_conn():
|
||||
return psycopg2.connect(DB_URL)
|
||||
SCHEMA = os.environ.get("DATABASE_SCHEMA", "public")
|
||||
|
||||
|
||||
def merge_traces_within_cuts(face_data: dict, cut_scenes: list) -> dict:
|
||||
@@ -146,67 +139,17 @@ def run_face_tracker(
|
||||
|
||||
|
||||
def store_traced_faces(file_uuid: str, traced_json_path: str, schema: str = SCHEMA):
|
||||
"""Insert traced face detections into face_detections table with trace_id"""
|
||||
conn = get_conn()
|
||||
cur = conn.cursor()
|
||||
|
||||
"""Update Qdrant _faces collection with trace_id after face tracking.
|
||||
|
||||
face_detections table is deprecated — trace_id is stored only in Qdrant _faces payload.
|
||||
"""
|
||||
with open(traced_json_path) as f:
|
||||
data = json.load(f)
|
||||
|
||||
frames = data.get("frames", {})
|
||||
total_stored = 0
|
||||
|
||||
for frame_num_str, frame_data in sorted(frames.items(), key=lambda x: int(x[0])):
|
||||
frame_num = int(frame_num_str)
|
||||
faces = frame_data.get("faces", [])
|
||||
|
||||
for face in faces:
|
||||
trace_id = face.get("trace_id")
|
||||
if trace_id is None:
|
||||
continue
|
||||
|
||||
x = face.get("x", 0)
|
||||
y = face.get("y", 0)
|
||||
w = face.get("width", 0)
|
||||
h = face.get("height", 0)
|
||||
confidence = face.get("confidence", 0.0)
|
||||
face_id = face.get("face_id")
|
||||
if face_id is None:
|
||||
face_id = f"face_{trace_id}"
|
||||
attributes = face.get("attributes")
|
||||
|
||||
bbox = json.dumps({"x": x, "y": y, "width": w, "height": h})
|
||||
|
||||
try:
|
||||
cur.execute(
|
||||
f"""
|
||||
UPDATE {schema}.face_detections
|
||||
SET trace_id = %s, face_id = %s
|
||||
WHERE file_uuid = %s AND frame_number = %s
|
||||
AND x = %s AND y = %s AND width = %s AND height = %s
|
||||
""",
|
||||
(
|
||||
trace_id,
|
||||
face_id,
|
||||
file_uuid,
|
||||
frame_num,
|
||||
x,
|
||||
y,
|
||||
w,
|
||||
h,
|
||||
),
|
||||
)
|
||||
if cur.rowcount > 0:
|
||||
total_stored += 1
|
||||
except Exception as e:
|
||||
print(f"[TRACE] Error storing face at frame {frame_num}: {e}")
|
||||
conn.rollback()
|
||||
continue
|
||||
|
||||
conn.commit()
|
||||
|
||||
# Build trace_mapping for Qdrant update
|
||||
trace_mapping = {} # {frame: {bbox_key: trace_id}}
|
||||
# Build trace_mapping for Qdrant update: {frame: {bbox_key: trace_id}}
|
||||
trace_mapping = {}
|
||||
for frame_num_str, frame_data in sorted(frames.items(), key=lambda x: int(x[0])):
|
||||
frame_num = int(frame_num_str)
|
||||
trace_mapping[frame_num] = {}
|
||||
@@ -224,22 +167,26 @@ def store_traced_faces(file_uuid: str, traced_json_path: str, schema: str = SCHE
|
||||
print(f"[TRACE] Warning: Qdrant trace_id update failed: {e}")
|
||||
qdrant_updated = 0
|
||||
|
||||
# Log trace summary
|
||||
cur.execute(
|
||||
f"SELECT COUNT(DISTINCT trace_id) FROM {schema}.face_detections WHERE file_uuid = %s AND trace_id IS NOT NULL",
|
||||
(file_uuid,),
|
||||
)
|
||||
db_trace_count = cur.fetchone()[0]
|
||||
# Count unique traces from Qdrant
|
||||
try:
|
||||
from qdrant_faces import get_file_faces
|
||||
points = get_file_faces(file_uuid)
|
||||
trace_ids = set()
|
||||
for p in points:
|
||||
tid = p.get("payload", {}).get("trace_id")
|
||||
if tid is not None and tid > 0:
|
||||
trace_ids.add(tid)
|
||||
qdrant_trace_count = len(trace_ids)
|
||||
except Exception as e:
|
||||
print(f"[TRACE] Warning: Qdrant trace count failed: {e}")
|
||||
qdrant_trace_count = 0
|
||||
|
||||
cur.close()
|
||||
conn.close()
|
||||
|
||||
print(
|
||||
f"[TRACE] Stored {total_stored} face detections, {db_trace_count} unique traces in DB"
|
||||
total_faces = sum(
|
||||
1 for fd in frames.values() for f in fd.get("faces", []) if f.get("trace_id") is not None
|
||||
)
|
||||
if qdrant_updated > 0:
|
||||
print(f"[TRACE] Updated {qdrant_updated} Qdrant points with trace_id")
|
||||
return total_stored, db_trace_count
|
||||
|
||||
print(f"[TRACE] Updated {qdrant_updated} Qdrant points with trace_id, {qdrant_trace_count} unique traces")
|
||||
return total_faces, qdrant_trace_count
|
||||
|
||||
|
||||
def main():
|
||||
@@ -248,8 +195,6 @@ def main():
|
||||
|
||||
parser.add_argument("--face-json", help="Path to face.json (default: auto-detect)")
|
||||
|
||||
parser.add_argument("--schema", default=SCHEMA, help="DB schema name")
|
||||
|
||||
parser.add_argument("--uuid", help="UUID for Redis tracking (accepted by executor)")
|
||||
parser.add_argument(
|
||||
"--filter-eyes",
|
||||
@@ -270,8 +215,8 @@ def main():
|
||||
# Step 1: Run face tracker
|
||||
run_face_tracker(face_json, traced_json, filter_eyes=args.filter_eyes)
|
||||
|
||||
# Step 2: Store in DB with trace_id
|
||||
total, traces = store_traced_faces(args.file_uuid, traced_json, args.schema)
|
||||
# Step 2: Store in Qdrant with trace_id
|
||||
total, traces = store_traced_faces(args.file_uuid, traced_json)
|
||||
print(f"[TRACE] Done: {total} detections, {traces} traces")
|
||||
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,409 @@
|
||||
import Foundation
|
||||
import Vision
|
||||
import ArgumentParser
|
||||
import AVFoundation
|
||||
|
||||
/// Swift Face+Pose Processor - one pass, two outputs.
///
/// Runs VNDetectFaceRectanglesRequest, VNDetectFaceLandmarksRequest,
/// and VNDetectHumanBodyPoseRequest on every `sampleInterval`-th frame and
/// writes one JSON file for faces and one for poses.
/// Uses AVAssetReader sequential read (frame-based), matching cv2 behavior.
///
/// All pixel coordinates written to the JSON files use a top-left origin:
/// the y axis of every Vision result is flipped before it is stored.
@main
struct SwiftFacePose: ParsableCommand {
    @Argument(help: "Video file path")
    var inputPath: String

    @Argument(help: "Output JSON path for face detection")
    var faceOutput: String

    @Argument(help: "Output JSON path for pose detection")
    var poseOutput: String

    @Option(name: .long, help: "Sample interval (frames, default=30)")
    var sampleInterval: Int = 30

    @Option(name: .long, help: "UUID for logging")
    var uuid: String = ""

    /// Observations below this confidence are dropped (faces and face-derived keypoints).
    private static let minConfidence = 0.6

    /// Faces narrower or shorter than this many pixels are dropped.
    private static let minFaceSize = 20

    /// A face rectangle overlapping a landmark observation by more than this IoU is a duplicate.
    private static let duplicateIoU: CGFloat = 0.3

    /// Body joints queried from every pose observation, in output order.
    private static let jointNames: [VNHumanBodyPoseObservation.JointName] = [
        .nose, .leftEye, .rightEye, .leftEar, .rightEar,
        .neck, .root,
        .leftShoulder, .rightShoulder,
        .leftElbow, .rightElbow,
        .leftWrist, .rightWrist,
        .leftHip, .rightHip,
        .leftKnee, .rightKnee,
        .leftAnkle, .rightAnkle,
    ]

    /// Maps the raw joint key text to the keypoint name written to pose JSON.
    /// Keys that are not listed are written unchanged.
    /// NOTE(review): the keys are assumed to match Vision's raw key strings;
    /// verify against real output that every joint gets its short name.
    private static let jointNameMap: [String: String] = [
        "head_joint": "nose",
        "left_eye_joint": "left_eye",
        "right_eye_joint": "right_eye",
        "left_ear_joint": "left_ear",
        "right_ear_joint": "right_ear",
        "neck_1_joint": "neck",
        "left_shoulder_1_joint": "left_shoulder",
        "right_shoulder_1_joint": "right_shoulder",
        "left_elbow_1_joint": "left_elbow",
        "right_elbow_1_joint": "right_elbow",
        "left_hand_joint": "left_wrist",
        "right_hand_joint": "right_wrist",
        "left_hip_1_joint": "left_hip",
        "right_hip_1_joint": "right_hip",
        "left_knee_1_joint": "left_knee",
        "right_knee_1_joint": "right_knee",
        "left_ankle_1_joint": "left_ankle",
        "right_ankle_1_joint": "right_ankle",
        "center_hip_joint": "root",
    ]

    /// Rejects a sample interval below 1, which would otherwise trap on the
    /// modulo / division by `sampleInterval` inside `run()`.
    func validate() throws {
        guard sampleInterval >= 1 else {
            throw ValidationError("--sample-interval must be at least 1 (got \(sampleInterval))")
        }
    }

    /// Converts image-space points to `[x, y]` pairs with the y axis flipped.
    private static func flipY(_ points: [CGPoint], imgH: CGFloat) -> [[Double]] {
        return points.map { [Double($0.x), Double(imgH - $0.y)] }
    }

    /// Intersection-over-union of two rectangles; 0 when they do not overlap.
    private static func iou(_ a: CGRect, _ b: CGRect) -> CGFloat {
        let ix = max(a.origin.x, b.origin.x)
        let iy = max(a.origin.y, b.origin.y)
        let iw = min(a.maxX, b.maxX) - ix
        let ih = min(a.maxY, b.maxY) - iy
        if iw <= 0 || ih <= 0 { return 0 }
        let intersection = iw * ih
        let union = a.width * a.height + b.width * b.height - intersection
        return intersection / union
    }

    /// Builds the bbox/confidence/pose entry for one face, or nil when the
    /// face fails the confidence or minimum-size filter.
    private static func faceEntry(for observation: VNFaceObservation,
                                  confidence: Double,
                                  imgW: CGFloat,
                                  imgH: CGFloat) -> [String: Any]? {
        if confidence < minConfidence { return nil }

        let box = observation.boundingBox
        let faceW = Int(box.size.width * imgW)
        let faceH = Int(box.size.height * imgH)
        if faceW < minFaceSize || faceH < minFaceSize { return nil }

        let faceX = Int(box.origin.x * imgW)
        // Flip y so the stored box is measured from the top of the image.
        let faceY = Int((1.0 - box.origin.y - box.size.height) * imgH)

        var entry: [String: Any] = [
            "bbox": ["x": max(0, faceX), "y": max(0, faceY),
                     "width": faceW, "height": faceH],
            "confidence": confidence,
        ]

        // Head pose is only stored when both yaw and roll are available.
        if let yaw = observation.yaw?.doubleValue,
           let roll = observation.roll?.doubleValue {
            var poseInfo: [String: Any] = ["roll": roll, "yaw": yaw]
            if let pitch = observation.pitch?.doubleValue {
                poseInfo["pitch"] = pitch
            }
            entry["pose"] = poseInfo
        }
        return entry
    }

    /// Builds a pose "person" (nose/eye keypoints, zero bbox) from one face
    /// landmark observation, or nil when it is filtered out or has no points.
    private static func personEntry(fromLandmarks observation: VNFaceObservation,
                                    imgW: CGFloat,
                                    imgH: CGFloat) -> [String: Any]? {
        let lmConf = Double(observation.confidence)
        if lmConf < minConfidence { return nil }
        guard let lms = observation.landmarks else { return nil }

        let imgSize = CGSize(width: imgW, height: imgH)
        let regions: [(String, VNFaceLandmarkRegion2D?)] = [
            ("nose", lms.nose),
            ("left_eye", lms.leftEye),
            ("right_eye", lms.rightEye),
        ]

        var keypoints: [[String: Any]] = []
        for (name, region) in regions {
            guard let points = region?.pointsInImage(imageSize: imgSize) else { continue }
            for pt in points {
                keypoints.append([
                    "name": name,
                    "x": Double(pt.x),
                    "y": Double(imgH - pt.y),
                    "confidence": lmConf
                ])
            }
        }

        if keypoints.isEmpty { return nil }
        return [
            "keypoints": keypoints,
            "bbox": ["x": 0, "y": 0, "width": 0, "height": 0]
        ]
    }

    /// Builds a pose "person" (named joints plus a bbox around the joints
    /// whose confidence exceeds 0.1) from one body pose observation.
    private static func personEntry(fromBody pose: VNHumanBodyPoseObservation,
                                    imgW: CGFloat,
                                    imgH: CGFloat) -> [String: Any] {
        var keypoints: [[String: Any]] = []
        var minX = CGFloat.greatestFiniteMagnitude
        var minY = CGFloat.greatestFiniteMagnitude
        var maxX: CGFloat = 0
        var maxY: CGFloat = 0

        for joint in jointNames {
            guard let point = try? pose.recognizedPoint(joint) else { continue }

            // Recover the raw key text from the joint's description, then
            // translate it to the short keypoint name when one is known.
            let desc = String(describing: joint.rawValue)
            var rawName = desc
                .replacingOccurrences(of: "VNRecognizedPointKey(_rawValue: ", with: "")
                .replacingOccurrences(of: ")", with: "")
                .trimmingCharacters(in: .whitespaces)
            if let mapped = jointNameMap[rawName] {
                rawName = mapped
            }

            let px = point.location.x * CGFloat(imgW)
            let py = CGFloat(imgH) - point.location.y * CGFloat(imgH)
            keypoints.append([
                "name": rawName.isEmpty ? "\(joint)" : rawName,
                "x": px,
                "y": py,
                "confidence": point.confidence,
            ])

            // Only reasonably confident joints contribute to the bbox.
            if point.confidence > 0.1 {
                minX = min(minX, px)
                minY = min(minY, py)
                maxX = max(maxX, px)
                maxY = max(maxY, py)
            }
        }

        var bbox: [String: Any] = ["x": 0, "y": 0, "width": 0, "height": 0]
        if maxX > minX {
            bbox = [
                "x": Int(minX),
                "y": Int(minY),
                "width": Int(maxX - minX),
                "height": Int(maxY - minY),
            ]
        }
        return ["keypoints": keypoints, "bbox": bbox]
    }

    /// Reads the video sequentially, runs the three Vision requests on every
    /// sampled frame, and writes the face and pose JSON files.
    ///
    /// Returns without writing anything when the file has no video track or
    /// the reader cannot be started. Throws when the reader cannot be
    /// created or when the pose file cannot be written.
    mutating func run() throws {
        let startTime = Date()
        print("[SwiftFacePose] Vision face+pose detection: \(inputPath)")

        let url = URL(fileURLWithPath: inputPath)
        let asset = AVAsset(url: url)

        guard let videoTrack = asset.tracks(withMediaType: .video).first else {
            print("[SwiftFacePose] No video track found")
            return
        }

        let fps = videoTrack.nominalFrameRate
        let duration = CMTimeGetSeconds(asset.duration)
        // Converting a NaN/infinite product to Int traps, so fall back to 0.
        let estimatedFrames = duration * Double(fps)
        let totalFrames = estimatedFrames.isFinite ? Int(estimatedFrames) : 0
        let totalSamples = totalFrames / sampleInterval
        print("[SwiftFacePose] Video: \(Int(videoTrack.naturalSize.width))x\(Int(videoTrack.naturalSize.height)), \(String(format: "%.1f", fps))fps, \(totalFrames) frames, interval=\(sampleInterval)")

        // read sequentially, matching cv2 frame-by-frame behavior
        let reader = try AVAssetReader(asset: asset)
        let outputSettings: [String: Any] = [
            kCVPixelBufferPixelFormatTypeKey as String: kCVPixelFormatType_32BGRA
        ]
        let trackOutput = AVAssetReaderTrackOutput(track: videoTrack, outputSettings: outputSettings)
        trackOutput.alwaysCopiesSampleData = false
        reader.add(trackOutput)
        guard reader.startReading() else {
            print("[SwiftFacePose] Failed to start AVAssetReader: \(reader.error?.localizedDescription ?? "unknown")")
            return
        }

        var faceFrames: [[String: Any]] = []
        var poseFrames: [[String: Any]] = []
        var processedCount = 0
        var frameIndex = 0

        while let sampleBuffer = trackOutput.copyNextSampleBuffer() {
            defer { frameIndex += 1 }

            if frameIndex % sampleInterval != 0 {
                continue
            }

            guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else {
                continue
            }

            let imgW = CGFloat(CVPixelBufferGetWidth(pixelBuffer))
            let imgH = CGFloat(CVPixelBufferGetHeight(pixelBuffer))
            // A zero frame rate would give a non-finite timestamp, which
            // JSONSerialization cannot encode.
            let seconds = fps > 0 ? Double(frameIndex) / Double(fps) : 0.0

            let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:])
            let faceReq = VNDetectFaceRectanglesRequest()
            let lmReq = VNDetectFaceLandmarksRequest()
            let bodyReq = VNDetectHumanBodyPoseRequest()

            do {
                try handler.perform([faceReq, lmReq, bodyReq])
            } catch {
                // Skip the frame, but leave a trace of why it is missing.
                print("[SwiftFacePose] Vision request failed on frame \(frameIndex): \(error)")
                continue
            }

            // ── Face output ──
            let faceObservations = faceReq.results ?? []
            let landmarkObservations = lmReq.results ?? []
            // True when Vision reported anything, even if every observation
            // is filtered out below; this drives the pose-frame rule.
            let hasFace = !faceObservations.isEmpty || !landmarkObservations.isEmpty

            var faces: [[String: Any]] = []

            // Landmark observations come first and carry eye/nose/lip points.
            for lmObs in landmarkObservations {
                guard var faceData = SwiftFacePose.faceEntry(
                    for: lmObs,
                    confidence: Double(lmObs.confidence),
                    imgW: imgW,
                    imgH: imgH
                ) else { continue }

                if let lms = lmObs.landmarks {
                    let imgSize = CGSize(width: imgW, height: imgH)
                    let leftEye = lms.leftEye?.pointsInImage(imageSize: imgSize) ?? []
                    let rightEye = lms.rightEye?.pointsInImage(imageSize: imgSize) ?? []
                    let nose = lms.nose?.pointsInImage(imageSize: imgSize) ?? []

                    if !leftEye.isEmpty || !rightEye.isEmpty || !nose.isEmpty {
                        var lm: [String: [[Double]]] = [:]
                        if !leftEye.isEmpty {
                            lm["left_eye"] = SwiftFacePose.flipY(leftEye, imgH: imgH)
                        }
                        if !rightEye.isEmpty {
                            lm["right_eye"] = SwiftFacePose.flipY(rightEye, imgH: imgH)
                        }
                        if !nose.isEmpty {
                            lm["nose"] = SwiftFacePose.flipY(nose, imgH: imgH)
                        }
                        faceData["landmarks"] = lm
                    }

                    let outer = lms.outerLips?.pointsInImage(imageSize: imgSize) ?? []
                    let inner = lms.innerLips?.pointsInImage(imageSize: imgSize) ?? []
                    if !outer.isEmpty || !inner.isEmpty {
                        faceData["lips"] = [
                            "outer_lips": SwiftFacePose.flipY(outer, imgH: imgH),
                            "inner_lips": SwiftFacePose.flipY(inner, imgH: imgH)
                        ]
                    }
                }

                faces.append(faceData)
            }

            // Then add face rectangles that no landmark observation covers.
            for faceObs in faceObservations {
                let fBB = faceObs.boundingBox
                let matched = landmarkObservations.contains {
                    SwiftFacePose.iou(fBB, $0.boundingBox) > SwiftFacePose.duplicateIoU
                }
                if matched { continue }

                let faceConf = Double(faceObs.faceCaptureQuality ?? faceObs.confidence)
                if let faceData = SwiftFacePose.faceEntry(
                    for: faceObs,
                    confidence: faceConf,
                    imgW: imgW,
                    imgH: imgH
                ) {
                    faces.append(faceData)
                }
            }

            if !faces.isEmpty {
                faceFrames.append([
                    "frame": frameIndex,
                    "timestamp": seconds,
                    "faces": faces,
                ])
            }

            // ── Pose output ──
            // Rule: Face ≤ Pose - every face frame must have pose frame
            // Face landmarks (nose, leftEye, rightEye) ARE pose keypoints
            let poses = bodyReq.results ?? []
            var persons: [[String: Any]] = []

            // Face landmarks are turned into pose keypoints first, so a frame
            // with a usable face also yields a person entry.
            for lmObs in landmarkObservations {
                if let person = SwiftFacePose.personEntry(fromLandmarks: lmObs, imgW: imgW, imgH: imgH) {
                    persons.append(person)
                }
            }

            // Also process body pose detections (may add more keypoints)
            for pose in poses {
                persons.append(SwiftFacePose.personEntry(fromBody: pose, imgW: imgW, imgH: imgH))
            }

            // Rule: Face ≤ Pose - always add pose frame if has face
            if hasFace || !persons.isEmpty {
                poseFrames.append([
                    "frame": frameIndex,
                    "timestamp": seconds,
                    "persons": persons,
                ])
            }

            processedCount += 1

            if processedCount % 100 == 0 {
                let elapsed = Date().timeIntervalSince(startTime)
                // totalSamples is 0 when the frame count is unknown or smaller
                // than the interval; Int(x / 0) would trap.
                let pct = totalSamples > 0
                    ? Int(Double(processedCount) / Double(totalSamples) * 100)
                    : 0
                print("[SwiftFacePose] \(faceFrames.count) face frames, \(poseFrames.count) pose frames, \(pct)% complete, \(Int(elapsed))s elapsed")
                fflush(stdout)
            }
        }

        // The loop also ends when the reader fails mid-stream; report that
        // instead of silently writing truncated results.
        if reader.status == .failed {
            print("[SwiftFacePose] AVAssetReader failed after \(frameIndex) frames: \(reader.error?.localizedDescription ?? "unknown")")
        }
        reader.cancelReading()

        let faceOutputDict: [String: Any] = [
            "frame_count": faceFrames.count,
            "fps": Double(fps),
            "frames": faceFrames,
        ]
        do {
            let faceJson = try JSONSerialization.data(withJSONObject: faceOutputDict, options: [])
            try faceJson.write(to: URL(fileURLWithPath: faceOutput))
            print("[SwiftFacePose] Face output written: \(faceOutput)")
            // Verify file exists
            if FileManager.default.fileExists(atPath: faceOutput) {
                print("[SwiftFacePose] Verified: file exists at \(faceOutput)")
            } else {
                print("[SwiftFacePose] ERROR: file not found after write!")
            }
        } catch {
            print("[SwiftFacePose] ERROR writing face output: \(error)")
        }

        let poseOutputDict: [String: Any] = [
            "frame_count": poseFrames.count,
            "fps": Double(fps),
            "frames": poseFrames,
        ]
        // A serialization failure is logged rather than silently skipped;
        // a write failure still propagates to the caller as before.
        var poseJson: Data? = nil
        do {
            poseJson = try JSONSerialization.data(withJSONObject: poseOutputDict, options: [.prettyPrinted])
        } catch {
            print("[SwiftFacePose] ERROR serializing pose output: \(error)")
        }
        if let poseJson = poseJson {
            try poseJson.write(to: URL(fileURLWithPath: poseOutput))
        }

        let elapsed = Date().timeIntervalSince(startTime)
        print("[SwiftFacePose] Done: \(faceFrames.count) face frames, \(poseFrames.count) pose frames, \(String(format: "%.1f", elapsed))s")
    }
}
|
||||
Reference in New Issue
Block a user