Files
momentry_core/scripts/find_kids_pose.py
Warren 8f05a7c188 feat: update Python processors and add utility scripts
- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00

170 lines
7.1 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
Find "Kids" in pose data based on Head-to-Body Ratio.
Heuristic: Kids have a larger head relative to their body height (approx 1:5 or 1:6) compared to adults (approx 1:7.5).
"""
import json
import math
import sys
# Configuration
# Pose JSON file that find_kids() opens; the path is relative, so it is
# resolved against the current working directory.
POSE_JSON_PATH = "output/384b0ff44aaaa1f1/384b0ff44aaaa1f1.pose.json"
# Heuristic threshold on (estimated total height / estimated head height).
# Per the module docstring, kids are roughly 5-6 head-heights tall while
# adults are about 7.5. A detection is reported as a potential kid when its
# ratio is strictly below this value, so a smaller threshold is stricter
# (fewer matches, each more "kid-like" relative to head size).
BODY_TO_HEAD_RATIO_THRESHOLD = 5.8
def distance(p1, p2):
    """Return the Euclidean distance between two keypoints.

    Args:
        p1: Mapping with numeric ``'x'`` and ``'y'`` entries.
        p2: Mapping with numeric ``'x'`` and ``'y'`` entries.

    Returns:
        The straight-line distance between the two points, in the same units
        as the input coordinates.

    Raises:
        KeyError: If either point lacks an ``'x'`` or ``'y'`` entry.
    """
    # math.hypot avoids the intermediate overflow/underflow that squaring the
    # deltas by hand can hit, and is the standard-library idiom for this.
    return math.hypot(p1['x'] - p2['x'], p1['y'] - p2['y'])
def get_midpoint(p1, p2):
    """Return the point halfway between *p1* and *p2*.

    Both arguments are mappings with numeric ``'x'`` and ``'y'`` entries; the
    result is a new dict holding only those two keys.
    """
    return {axis: (p1[axis] + p2[axis]) / 2 for axis in ('x', 'y')}
# Empirical scale from the nose-to-shoulder distance to a full head height.
_HEAD_HEIGHT_FACTOR = 1.8
# Head-height estimates below this are treated as too small/noisy to judge.
# NOTE(review): presumably pixels - confirm against the pose processor output;
# with normalised (0..1) coordinates every detection would be rejected.
_MIN_HEAD_HEIGHT = 10


def _body_to_head_ratio(nose, shoulders, left_ankle, right_ankle):
    """Return estimated total-height / head-height, or None if it cannot be judged.

    Args:
        nose: Nose keypoint (mapping with ``'x'`` and ``'y'``).
        shoulders: One or two shoulder keypoints, left first when both exist.
        left_ankle: Left ankle keypoint, or None when absent.
        right_ankle: Right ankle keypoint, or None when absent.

    Returns:
        The ratio as a float, or None when the head estimate is too small,
        either ankle is missing, or the geometry is degenerate.
    """
    if len(shoulders) == 2:
        mid_shoulder = get_midpoint(shoulders[0], shoulders[1])
    else:
        mid_shoulder = shoulders[0]

    head_height_est = distance(nose, mid_shoulder) * _HEAD_HEIGHT_FACTOR
    if head_height_est < _MIN_HEAD_HEIGHT:
        return None

    # Only full-body detections are judged: without both ankles (sitting,
    # crouching, cropped) the total height cannot be estimated reliably.
    if not (left_ankle and right_ankle):
        return None

    # The nose sits inside the head, so add half a head above it to
    # approximate the distance from the top of the head to the ankles.
    mid_ankle = get_midpoint(left_ankle, right_ankle)
    total_height = distance(nose, mid_ankle) + head_height_est / 2

    # Reject degenerate geometry (e.g. ankles detected at head level); this
    # also guarantees a non-zero divisor.
    if total_height <= head_height_est:
        return None
    return total_height / head_height_est


def find_kids(pose_json_path=None, ratio_threshold=None):
    """Scan a pose JSON file and report detections with kid-like proportions.

    The file is expected to hold a top-level ``"frames"`` mapping of
    frame-index strings to ``{"timestamp": ..., "poses": [...]}``, where each
    pose has a ``"keypoints"`` list of ``{"name", "x", "y"}`` entries and an
    optional ``"person_id"``.

    Args:
        pose_json_path: Path of the pose JSON to read. Defaults to
            ``POSE_JSON_PATH``.
        ratio_threshold: Detections whose total-height / head-height ratio is
            strictly below this value are reported. Defaults to
            ``BODY_TO_HEAD_RATIO_THRESHOLD``.

    Returns:
        The de-duplicated detections sorted by timestamp, each a dict with
        ``"frame"``, ``"timestamp"``, ``"ratio"`` and ``"person_id"``. An
        empty list is returned when the file cannot be read or parsed. The
        same information is also printed to stdout.
    """
    if pose_json_path is None:
        pose_json_path = POSE_JSON_PATH
    if ratio_threshold is None:
        ratio_threshold = BODY_TO_HEAD_RATIO_THRESHOLD

    try:
        with open(pose_json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error loading JSON: {e}")
        return []

    frames = data.get("frames", {})
    potential_kids = []
    # Counters for the summary line.
    total_poses = 0
    analyzed_poses = 0

    for frame_idx_str, frame_data in frames.items():
        timestamp = frame_data.get("timestamp", 0)
        for person in frame_data.get("poses", []):
            total_poses += 1

            # Index keypoints by name; entries without a name are skipped
            # instead of aborting the whole scan with a KeyError.
            keypoints = {
                kp['name']: kp
                for kp in person.get("keypoints", [])
                if 'name' in kp
            }
            # NOTE(review): only the presence of a keypoint is checked here,
            # not a confidence/visibility score - confirm whether the
            # processor emits placeholder keypoints for occluded joints.
            nose = keypoints.get('nose')
            shoulders = [
                kp
                for kp in (keypoints.get('left_shoulder'),
                           keypoints.get('right_shoulder'))
                if kp
            ]
            if not nose or not shoulders:
                continue
            # Counts every pose with a nose and at least one shoulder, even
            # if it is later rejected for a tiny head or missing ankles.
            analyzed_poses += 1

            ratio = _body_to_head_ratio(
                nose,
                shoulders,
                keypoints.get('left_ankle'),
                keypoints.get('right_ankle'),
            )
            if ratio is not None and ratio < ratio_threshold:
                potential_kids.append({
                    "frame": frame_idx_str,
                    "timestamp": timestamp,
                    "ratio": round(ratio, 2),
                    "person_id": person.get("person_id", "?"),
                })

    print(f"Analyzed {analyzed_poses} poses out of {total_poses} total detections.")
    print(f"Found {len(potential_kids)} potential 'kids' (Ratio < {ratio_threshold}).")

    # Keep the first detection per 0.1 s bucket so near-simultaneous hits are
    # listed once. NOTE(review): several people at the same timestamp also
    # collapse into a single entry - confirm that this is intended.
    unique_kids = {}
    for kid in potential_kids:
        unique_kids.setdefault(round(kid['timestamp'], 1), kid)

    sorted_kids = sorted(unique_kids.values(), key=lambda kid: kid['timestamp'])
    print("\nUnique potential kid detections (timestamps):")
    for kid in sorted_kids:
        print(f" -> Timestamp: {kid['timestamp']:.2f}s | Ratio: {kid['ratio']}")
    return sorted_kids
# Script entry point: run the scan only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    find_kids()