Files
momentry_core/scripts/utils/body_action_decoder.py
Warren e75c4d6f07 cleanup: remove dead code and duplicate docs
- Remove session-ses_2f27.md (161KB raw session log)
- Remove 49 ROOT_* duplicate files across REFERENCE/
- Remove 14 duplicate files between REFERENCE/ root and history/
- Remove asr_legacy.rs (dead code, replaced by asr.rs)
- Remove src/core/worker/ (duplicate JobWorker)
- Remove src/core/layers/ (empty directory)
- Remove 4 .bak files in src/
- Remove 7 dead private methods in worker/processor.rs
- Remove backup directory from git tracking
2026-05-04 01:31:21 +08:00

875 lines
30 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
Body Action Decoder - Extended pose action analysis with body keypoints
Purpose:
1. Decode face pose actions (existing)
2. Decode body actions (future MediaPipe Holistic)
3. Integrate face + body actions for comprehensive analysis
Body Keypoints (MediaPipe Holistic):
- Face: 468 points (eyes, mouth, nose, etc.)
- Pose: 33 points (shoulders, elbows, hands, hips, knees, feet)
- Hands: 21 points per hand
Action Types:
- Face: turn_left, turn_right, look_up, look_down, shake_head, nod_head
- Eyes: blink, close, wide_open, look_left, look_right
- Mouth: open, close, smile, talk, yawn
- Arms: raise_left, raise_right, cross_arms, wave
- Hands: point, grab, clap, thumbs_up, fist
- Legs: stand, sit, walk, run, jump, kick
- Feet: tap, stomp, cross
Architecture:
┌─────────────────────────────────────────────────────────────────┐
│                       Body Action Decoder                       │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│  ┌───────────────┐   ┌───────────────┐   ┌───────────────┐      │
│  │ Face Actions  │   │ Body Actions  │   │ Hand Actions  │      │
│  │ (InsightFace) │   │ (MediaPipe)   │   │ (MediaPipe)   │      │
│  └───────┬───────┘   └───────┬───────┘   └───────┬───────┘      │
│          │                   │                   │              │
│          └───────────────────┼───────────────────┘              │
│                              │                                  │
│                      ┌───────▼───────┐                          │
│                      │ Action Merger │                          │
│                      └───────┬───────┘                          │
│                              │                                  │
│                      ┌───────▼───────┐                          │
│                      │Action Timeline│                          │
│                      └───────────────┘                          │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘
"""
import argparse
import json
from typing import Dict, List, Optional

import numpy as np
# =============================================================================
# Face Action Definitions (Existing from pose_action_decoder.py)
# =============================================================================
# Maps a (previous_angle, current_angle) yaw transition to the name of the
# head-turn action it represents. Pairs that are absent produce no action.
FACE_TURN_ACTIONS = {
    # Starting from a frontal view
    ("frontal", "three_quarter"): "turn_partial",
    ("frontal", "profile_left"): "turn_left",
    ("frontal", "profile_right"): "turn_right",
    # Starting from a three-quarter view
    ("three_quarter", "frontal"): "return_frontal",
    ("three_quarter", "profile_left"): "turn_left",
    ("three_quarter", "profile_right"): "turn_right",
    # Starting from the left profile
    ("profile_left", "frontal"): "turn_to_frontal",
    ("profile_left", "three_quarter"): "turn_to_three_quarter",
    ("profile_left", "profile_right"): "turn_full",
    # Starting from the right profile
    ("profile_right", "frontal"): "turn_to_frontal",
    ("profile_right", "three_quarter"): "turn_to_three_quarter",
    ("profile_right", "profile_left"): "turn_full",
}
# Maps a (previous_pitch, current_pitch) transition to the name of the
# vertical head action it represents. Pairs that are absent produce no action.
FACE_PITCH_ACTIONS = {
    # Leaving the neutral pitch
    ("neutral", "tilted_up"): "look_up",
    ("neutral", "tilted_down"): "look_down",
    # Coming back to the neutral pitch
    ("tilted_up", "neutral"): "return_neutral",
    ("tilted_down", "neutral"): "return_neutral",
}
# =============================================================================
# Eye Action Definitions
# =============================================================================
# Catalogue of eye actions. Each entry carries a human-readable description,
# a textual detection pattern and, for some entries, frame-count bounds.
EYE_ACTIONS = {
    # Blink
    "blink": {
        "description": "眨眼",
        "pattern": "eye_aspect_ratio drops < 0.2 for 1-3 frames",
        "min_frames": 1,
        "max_frames": 3,
    },
    # Eyes closed
    "close": {
        "description": "闭眼",
        "pattern": "eye_aspect_ratio < 0.15 for > 10 frames",
        "min_frames": 10,
    },
    # Eyes wide open
    "wide_open": {"description": "睁大眼", "pattern": "eye_aspect_ratio > 0.4"},
    # Looking left
    "look_left": {"description": "向左看", "pattern": "iris_position_x < 0.3"},
    # Looking right
    "look_right": {"description": "向右看", "pattern": "iris_position_x > 0.7"},
    # Squint
    "squint": {"description": "眯眼", "pattern": "eye_aspect_ratio 0.15-0.25"},
}
# =============================================================================
# Mouth Action Definitions
# =============================================================================
# Catalogue of mouth actions. Each entry carries a human-readable description,
# a textual detection pattern and, for some entries, a minimum frame count.
MOUTH_ACTIONS = {
    # Mouth open
    "open": {"description": "张嘴", "pattern": "mouth_aspect_ratio > 0.5"},
    # Mouth closed
    "close": {"description": "闭嘴", "pattern": "mouth_aspect_ratio < 0.2"},
    # Smile
    "smile": {"description": "微笑", "pattern": "mouth_corner_distance > threshold"},
    # Talking
    "talk": {
        "description": "说话",
        "pattern": "mouth_aspect_ratio oscillating 0.3-0.6",
        "min_frames": 10,
    },
    # Yawn
    "yawn": {
        "description": "打哈欠",
        "pattern": "mouth_aspect_ratio > 0.7 for > 20 frames",
        "min_frames": 20,
    },
    # Pout
    "pout": {"description": "嘟嘴", "pattern": "lip_distance > threshold"},
}
# =============================================================================
# Arm Action Definitions
# =============================================================================
# Catalogue of arm actions. Each entry carries a human-readable description,
# a textual detection pattern and, for some entries, frame-count bounds.
ARM_ACTIONS = {
    # Raise left hand
    "raise_left": {
        "description": "举起左手",
        "pattern": "left_shoulder_y > left_elbow_y > left_wrist_y",
    },
    # Raise right hand
    "raise_right": {
        "description": "举起右手",
        "pattern": "right_shoulder_y > right_elbow_y > right_wrist_y",
    },
    # Raise both hands
    "raise_both": {"description": "双手举起", "pattern": "both arms raised"},
    # Arms crossed
    # NOTE(review): both clauses of this pattern text state the same
    # inequality, so the second clause adds nothing - confirm the intent.
    "cross_arms": {
        "description": "双手交叉",
        "pattern": "left_wrist_x > right_wrist_x AND right_wrist_x < left_wrist_x",
    },
    # Wave
    "wave": {
        "description": "挥手",
        "pattern": "wrist_y oscillating ±20px for 5-15 frames",
        "min_frames": 5,
        "max_frames": 15,
    },
    # Extend left arm
    "extend_left": {"description": "伸展左臂", "pattern": "left_elbow_angle > 150°"},
    # Extend right arm
    "extend_right": {"description": "伸展右臂", "pattern": "right_elbow_angle > 150°"},
    # Bend left arm
    "fold_left": {"description": "弯曲左臂", "pattern": "left_elbow_angle < 90°"},
    # Bend right arm
    "fold_right": {"description": "弯曲右臂", "pattern": "right_elbow_angle < 90°"},
    # Point
    "point": {
        "description": "指向",
        "pattern": "index_finger extended, other fingers folded",
    },
}
# =============================================================================
# Hand Action Definitions
# =============================================================================
# Catalogue of hand gestures. Each entry carries a human-readable description,
# a textual detection pattern and, for some entries, frame-count bounds.
HAND_ACTIONS = {
    # Grab
    "grab": {"description": "抓取", "pattern": "fingers folded, thumb opposing"},
    # Open hand
    "open": {"description": "张开手", "pattern": "all fingers extended"},
    # Clap
    "clap": {
        "description": "拍手",
        "pattern": "hands together then apart (velocity pattern)",
        "min_frames": 3,
        "max_frames": 10,
    },
    # Thumbs up
    "thumbs_up": {
        "description": "点赞",
        "pattern": "thumb extended upward, other fingers folded",
    },
    # Fist
    "fist": {"description": "握拳", "pattern": "all fingers folded into palm"},
    # Peace / V sign
    "peace": {"description": "剪刀手", "pattern": "index and middle fingers extended"},
    # OK gesture
    "ok": {"description": "OK 手势", "pattern": "thumb and index finger touching"},
    # Touch face
    "touch_face": {"description": "摸脸", "pattern": "hand near face region"},
    # Touch hair
    "touch_hair": {"description": "摸头发", "pattern": "hand above head region"},
    # Left hand in pocket
    "pocket_left": {"description": "左手插兜", "pattern": "left_hand in hip region"},
    # Right hand in pocket
    "pocket_right": {"description": "右手插兜", "pattern": "right_hand in hip region"},
}
# =============================================================================
# Leg Action Definitions
# =============================================================================
# Catalogue of leg actions. Each entry carries a human-readable description,
# a textual detection pattern and, for some entries, frame-count bounds.
LEG_ACTIONS = {
    # Standing
    "stand": {
        "description": "站立",
        "pattern": "hip_y < knee_y < ankle_y, vertical alignment",
    },
    # Sitting
    "sit": {"description": "坐姿", "pattern": "hip_y ≈ knee_y, thigh horizontal"},
    # Walking
    "walk": {
        "description": "行走",
        "pattern": "hip-knee-ankle oscillating, stride pattern",
        "min_frames": 10,
    },
    # Running
    "run": {
        "description": "奔跑",
        "pattern": "fast oscillating, knee_bend > 60°",
        "min_frames": 10,
    },
    # Jumping
    "jump": {
        "description": "跳跃",
        "pattern": "all keypoints moving upward then landing",
        "min_frames": 5,
        "max_frames": 20,
    },
    # Kicking
    "kick": {
        "description": "踢腿",
        "pattern": "one leg extended forward rapidly",
        "min_frames": 3,
        "max_frames": 15,
    },
    # Left leg crossed
    "cross_left": {"description": "左腿交叉", "pattern": "left_ankle_x > right_ankle_x"},
    # Right leg crossed
    "cross_right": {"description": "右腿交叉", "pattern": "right_ankle_x > left_ankle_x"},
    # Bent knee
    "knee_bend": {"description": "弯膝", "pattern": "knee_angle < 120°"},
}
# =============================================================================
# Feet Action Definitions
# =============================================================================
# Catalogue of foot actions. Each entry carries a human-readable description,
# a textual detection pattern and, for some entries, frame-count bounds.
FEET_ACTIONS = {
    # Light tap
    "tap": {
        "description": "轻踏",
        "pattern": "ankle_y oscillating ±10px",
        "min_frames": 3,
        "max_frames": 15,
    },
    # Stomp
    "stomp": {
        "description": "重踏",
        "pattern": "ankle_y large downward movement",
        "min_frames": 3,
    },
    # Crossed feet
    "cross": {"description": "交叉脚", "pattern": "feet_x overlapping"},
    # Left foot forward
    "point_left": {"description": "左脚前伸", "pattern": "left_ankle_y < right_ankle_y"},
    # Right foot forward
    "point_right": {"description": "右脚前伸", "pattern": "right_ankle_y < left_ankle_y"},
}
# =============================================================================
# Combined Actions (Face + Body)
# =============================================================================
# Composite actions. A composite is reported when every name listed in its
# "components" appears among the action names detected for a frame.
# NOTE(review): several component names used here ("touch_face", "open_mouth",
# "nod_head", "shake_head", "frown", "wave", "frontal_stable", "point",
# "raise_both") do not appear to be emitted by any analyzer visible in this
# file, so the corresponding composites may never trigger - verify.
COMBINED_ACTIONS = {
    # Thinking pose
    "thinking": {
        "description": "思考姿势",
        "components": ["touch_face", "look_down"],
        "pattern": "hand near chin + head tilted down",
    },
    # Listening pose
    "listening": {
        "description": "倾听姿势",
        "components": ["turn_partial", "open_mouth"],
        "pattern": "slight turn + mouth slightly open",
    },
    # Nodding in agreement
    "nodding_agreement": {
        "description": "点头同意",
        "components": ["nod_head", "smile"],
        "pattern": "head nod + smile",
    },
    # Shaking the head in disagreement
    "shaking_disagreement": {
        "description": "摇头不同意",
        "components": ["shake_head", "frown"],
        "pattern": "shake head + frown",
    },
    # Waving as a greeting
    "waving_greeting": {
        "description": "挥手打招呼",
        "components": ["wave", "smile"],
        "pattern": "wave hand + smile",
    },
    # Defensive crossed arms
    "crossing_arms_defensive": {
        "description": "双手交叉防御",
        "components": ["cross_arms", "frontal_stable"],
        "pattern": "cross arms + frontal pose",
    },
    # Pointing while explaining
    "pointing_explaining": {
        "description": "指向解释",
        "components": ["point", "turn_partial"],
        "pattern": "pointing + slight turn",
    },
    # Stretching
    "stretching": {
        "description": "伸展",
        "components": ["raise_both", "look_up"],
        "pattern": "raise arms + look up",
    },
    # Relaxed sitting
    "sitting_relaxed": {
        "description": "放松坐姿",
        "components": ["sit", "cross_arms"],
        "pattern": "sit + cross arms",
    },
}
# =============================================================================
# Analysis Functions
# =============================================================================
def analyze_eye_actions(eye_landmarks: List, prev_eye_landmarks: Optional[List] = None) -> List[Dict]:
    """
    Analyze eye actions from landmarks.

    Only the first six landmarks are evaluated; they are treated as the
    left-eye contour p1..p6 and reduced to the eye aspect ratio
    ``EAR = (|p2-p6| + |p3-p5|) / (2 * |p1-p4|)``.

    Args:
        eye_landmarks: Current frame eye landmarks; at least six points.
        prev_eye_landmarks: Previous frame landmarks. Accepted for interface
            compatibility; not used by the current implementation.

    Returns:
        List of detected eye actions. Empty when fewer than six landmarks
        are supplied or when the eye has zero width (EAR undefined).
    """
    actions: List[Dict] = []
    if not eye_landmarks or len(eye_landmarks) < 6:
        return actions

    p1, p2, p3, p4, p5, p6 = (np.asarray(p, dtype=float) for p in eye_landmarks[:6])

    horizontal = float(np.linalg.norm(p1 - p4))
    if horizontal <= 0:
        # A zero-width eye has no meaningful aspect ratio; reporting a
        # fully-confident "closed" eye here would be a false positive.
        return actions

    vertical = float(np.linalg.norm(p2 - p6) + np.linalg.norm(p3 - p5))
    left_ear = vertical / (2.0 * horizontal)

    if left_ear < 0.15:
        actions.append({"action": "close_left", "description": "闭左眼", "confidence": 1.0 - left_ear})
    elif left_ear > 0.4:
        # EAR can exceed 1 for unusual landmark layouts; keep confidence <= 1.
        actions.append({"action": "wide_open_left", "description": "睁大左眼", "confidence": min(1.0, left_ear)})
    return actions
def analyze_mouth_actions(mouth_landmarks: List) -> List[Dict]:
    """
    Analyze mouth actions from landmarks.

    The first four landmarks are read as upper lip, lower lip, left corner
    and right corner. The mouth aspect ratio (MAR) is the lip distance
    divided by the corner-to-corner width.

    Args:
        mouth_landmarks: Mouth region landmarks; at least four points.

    Returns:
        List of detected mouth actions. Empty when fewer than four landmarks
        are supplied or when the mouth has zero width (MAR undefined). All
        numeric fields are plain Python floats so the result can be passed
        to ``json.dump``.
    """
    actions: List[Dict] = []
    if not mouth_landmarks or len(mouth_landmarks) < 4:
        return actions

    # Convert to float up front: integer pixel coordinates would otherwise
    # yield numpy integer scalars, which the json module cannot serialize.
    upper_lip, lower_lip, left_corner, right_corner = (
        np.asarray(p, dtype=float) for p in mouth_landmarks[:4]
    )

    mouth_width = float(np.linalg.norm(left_corner - right_corner))
    if mouth_width <= 0:
        # Degenerate landmarks: no ratio can be formed, so report nothing
        # rather than a spurious "close".
        return actions
    mouth_height = float(np.linalg.norm(upper_lip - lower_lip))
    mar = mouth_height / mouth_width

    if mar > 0.7:
        actions.append({"action": "yawn", "description": "打哈欠", "mar": mar})
    elif mar > 0.5:
        actions.append({"action": "open", "description": "张嘴", "mar": mar})
    elif mar < 0.2:
        actions.append({"action": "close", "description": "闭嘴", "mar": mar})
    else:
        # Smile heuristic: summed vertical offset of both mouth corners
        # relative to the upper lip, compared against a fixed threshold.
        corner_distance = float(
            abs(left_corner[1] - upper_lip[1]) + abs(right_corner[1] - upper_lip[1])
        )
        if corner_distance > 10:
            actions.append({"action": "smile", "description": "微笑", "corner_distance": corner_distance})
    return actions
def _elbow_angle_deg(shoulder, elbow, wrist) -> Optional[float]:
    """Return the interior elbow angle in degrees (180 = straight arm), or None if a segment has zero length."""
    to_shoulder = np.asarray(shoulder, dtype=float) - np.asarray(elbow, dtype=float)
    to_wrist = np.asarray(wrist, dtype=float) - np.asarray(elbow, dtype=float)
    denom = float(np.linalg.norm(to_shoulder) * np.linalg.norm(to_wrist))
    if denom == 0:
        return None
    # Clip guards against rounding pushing the cosine just outside [-1, 1],
    # which would make arccos return NaN.
    cosine = np.clip(np.dot(to_shoulder, to_wrist) / denom, -1.0, 1.0)
    return float(np.degrees(np.arccos(cosine)))


def analyze_arm_actions(pose_keypoints: Dict) -> List[Dict]:
    """
    Analyze arm actions from pose keypoints.

    For each side the interior elbow angle is measured between the
    elbow->shoulder and elbow->wrist vectors, so a straight arm is ~180
    degrees and a tightly bent arm approaches 0 degrees.

    Args:
        pose_keypoints: Dict read with the keys ``left_shoulder``,
            ``left_elbow``, ``left_wrist`` and the ``right_*`` equivalents;
            each value is a point whose index 0 is x and index 1 is y.
            Missing keys are allowed.

    Returns:
        List of detected arm actions (raise / extend / fold per side, plus
        ``cross_arms``).
    """
    actions: List[Dict] = []

    per_side = (
        ("left", "举起左手", "伸展左臂", "弯曲左臂"),
        ("right", "举起右手", "伸展右臂", "弯曲右臂"),
    )
    for side, raise_desc, extend_desc, fold_desc in per_side:
        shoulder = pose_keypoints.get(f"{side}_shoulder")
        elbow = pose_keypoints.get(f"{side}_elbow")
        wrist = pose_keypoints.get(f"{side}_wrist")
        if not (shoulder and elbow and wrist):
            continue

        angle = _elbow_angle_deg(shoulder, elbow, wrist)

        # Raised: wrist above elbow above shoulder (y decreases upward).
        if wrist[1] < elbow[1] < shoulder[1]:
            actions.append({"action": f"raise_{side}", "description": raise_desc, "angle": angle})

        if angle is None:
            # Coincident joints: the angle is undefined, skip extend/fold.
            continue
        if angle > 150:
            actions.append({"action": f"extend_{side}", "description": extend_desc, "angle": angle})
        elif angle < 90:
            actions.append({"action": f"fold_{side}", "description": fold_desc, "angle": angle})

    # Cross arms detection. The rule also reads the left shoulder, so it is
    # only evaluated when that keypoint is present.
    left_wrist = pose_keypoints.get("left_wrist")
    right_wrist = pose_keypoints.get("right_wrist")
    left_shoulder = pose_keypoints.get("left_shoulder")
    if left_wrist and right_wrist and left_shoulder:
        if left_wrist[0] > right_wrist[0] and right_wrist[0] < left_shoulder[0]:
            actions.append({"action": "cross_arms", "description": "双手交叉"})
    return actions
def analyze_hand_actions(
    hand_keypoints: List,
    hand_type: str = "right",
    *,
    open_threshold: float = 50,
    folded_threshold: float = 30,
    extended_threshold: float = 40,
) -> List[Dict]:
    """
    Analyze hand actions from hand keypoints.

    Each finger's "extension" is the distance from its tip to a base joint.
    Gestures are decided by comparing those distances with the thresholds,
    which are expressed in the same units as the keypoint coordinates.

    Args:
        hand_keypoints: 21 hand keypoints (MediaPipe Hand ordering).
        hand_type: "left" or "right"; used as a suffix in action names.
        open_threshold: Mean extension above which the hand counts as open.
        folded_threshold: Extension below which fingers count as folded.
        extended_threshold: Extension above which a finger counts as extended.

    Returns:
        List of detected hand actions. Empty when fewer than 21 keypoints
        are supplied.
    """
    actions: List[Dict] = []
    if not hand_keypoints or len(hand_keypoints) < 21:
        return actions

    # MediaPipe Hand keypoint indices:
    #   0: wrist
    #   1-4: thumb (CMC, MCP, IP, TIP)
    #   5-8: index finger (MCP, PIP, DIP, TIP)
    #   9-12: middle finger
    #   13-16: ring finger
    #   17-20: pinky
    tip_indices = (4, 8, 12, 16, 20)
    # Thumb is measured from its MCP joint (index 2), the others from their MCP.
    base_indices = (2, 5, 9, 13, 17)

    extensions = [
        float(
            np.linalg.norm(
                np.asarray(hand_keypoints[tip], dtype=float)
                - np.asarray(hand_keypoints[base], dtype=float)
            )
        )
        for tip, base in zip(tip_indices, base_indices)
    ]
    thumb, index, middle, ring, pinky = extensions

    avg_extension = float(np.mean(extensions))
    if avg_extension > open_threshold:  # Open hand
        actions.append({"action": f"open_{hand_type}", "description": f"张开{hand_type}"})
    elif avg_extension < folded_threshold:  # Closed / fist
        actions.append({"action": f"fist_{hand_type}", "description": f"{hand_type}手握拳"})

    # Thumbs up: thumb extended, the other four folded on average.
    # NOTE: the thumb's direction is not checked, only its extension.
    if thumb > extended_threshold and np.mean([index, middle, ring, pinky]) < folded_threshold:
        actions.append({"action": f"thumbs_up_{hand_type}", "description": f"{hand_type}手点赞"})

    # Peace sign: index and middle extended, ring and pinky folded on average.
    if index > extended_threshold and middle > extended_threshold and np.mean([ring, pinky]) < folded_threshold:
        actions.append({"action": f"peace_{hand_type}", "description": f"{hand_type}手剪刀手"})

    # Pointing: index extended, the remaining fingers folded on average.
    if index > extended_threshold and np.mean([thumb, middle, ring, pinky]) < folded_threshold:
        actions.append({"action": f"point_{hand_type}", "description": f"{hand_type}手指向"})
    return actions
def _knee_angle_deg(hip, knee, ankle) -> Optional[float]:
    """Return the interior knee angle in degrees (180 = straight leg), or None if a segment has zero length."""
    to_hip = np.asarray(hip, dtype=float) - np.asarray(knee, dtype=float)
    to_ankle = np.asarray(ankle, dtype=float) - np.asarray(knee, dtype=float)
    denom = float(np.linalg.norm(to_hip) * np.linalg.norm(to_ankle))
    if denom == 0:
        return None
    # Clip guards against rounding pushing the cosine just outside [-1, 1],
    # which would make arccos return NaN.
    cosine = np.clip(np.dot(to_hip, to_ankle) / denom, -1.0, 1.0)
    return float(np.degrees(np.arccos(cosine)))


def analyze_leg_actions(pose_keypoints: Dict) -> List[Dict]:
    """
    Analyze leg actions from pose keypoints.

    For each side the interior knee angle is measured between the knee->hip
    and knee->ankle vectors, so a straight leg is ~180 degrees and only a
    genuinely bent knee falls below the 120 degree bend threshold.

    Args:
        pose_keypoints: Dict read with the keys ``left_hip``, ``left_knee``,
            ``left_ankle`` and the ``right_*`` equivalents; each value is a
            point whose index 0 is x and index 1 is y. Missing keys are
            allowed.

    Returns:
        List of detected leg actions (knee bend / stand per side, plus
        ``sit``).
    """
    actions: List[Dict] = []

    per_side = (
        ("left", "弯左膝", "左腿站立"),
        ("right", "弯右膝", "右腿站立"),
    )
    for side, bend_desc, stand_desc in per_side:
        hip = pose_keypoints.get(f"{side}_hip")
        knee = pose_keypoints.get(f"{side}_knee")
        ankle = pose_keypoints.get(f"{side}_ankle")
        if not (hip and knee and ankle):
            continue

        angle = _knee_angle_deg(hip, knee, ankle)
        if angle is not None and angle < 120:
            actions.append({"action": f"knee_bend_{side}", "description": bend_desc, "angle": angle})

        # Standing: hip above knee above ankle (y increases downward).
        if hip[1] < knee[1] < ankle[1]:
            actions.append({"action": f"stand_{side}", "description": stand_desc})

    # Sit detection: average hip height is close to average knee height.
    left_hip = pose_keypoints.get("left_hip")
    left_knee = pose_keypoints.get("left_knee")
    right_hip = pose_keypoints.get("right_hip")
    right_knee = pose_keypoints.get("right_knee")
    if left_hip and left_knee and right_hip and right_knee:
        hip_avg_y = (left_hip[1] + right_hip[1]) / 2
        knee_avg_y = (left_knee[1] + right_knee[1]) / 2
        if abs(hip_avg_y - knee_avg_y) < 30:
            actions.append({"action": "sit", "description": "坐姿"})
    return actions
# =============================================================================
# Main Decoder Function
# =============================================================================
def decode_body_actions(
    pose_data: Optional[Dict],
    face_data: Optional[Dict] = None,
    hand_data: Optional[Dict] = None,
) -> Dict:
    """
    Decode all body actions from multiple data sources.

    Args:
        pose_data: Pose estimation data; its ``"keypoints"`` entry is passed
            to the arm and leg analyzers. May be None.
        face_data: Face data; the keys ``pose_angle`` / ``prev_pose_angle``
            (each read for ``angle`` and ``pitch``), ``eye_landmarks``,
            ``prev_eye_landmarks`` and ``mouth_landmarks`` are read when
            present. May be None.
        hand_data: Hand tracking data; the keys ``left_hand`` and
            ``right_hand`` are read when present. May be None.

    Returns:
        Dict mapping each category ("face", "eyes", "mouth", "arms", "hands",
        "legs", "feet", "combined") to a list of action dicts. The "feet"
        list is never populated by this function.
    """
    all_actions = {
        "face": [],
        "eyes": [],
        "mouth": [],
        "arms": [],
        "hands": [],
        "legs": [],
        "feet": [],
        "combined": [],
    }

    # 1. Face actions: transitions between the previous and current frame.
    if face_data:
        pose_angle = face_data.get("pose_angle", {})
        prev_pose_angle = face_data.get("prev_pose_angle", {})
        if pose_angle and prev_pose_angle:
            angle = pose_angle.get("angle", "unknown")
            prev_angle = prev_pose_angle.get("angle", "unknown")
            turn_key = (prev_angle, angle)
            if turn_key in FACE_TURN_ACTIONS:
                all_actions["face"].append({
                    "action": FACE_TURN_ACTIONS[turn_key],
                    # Separator keeps the two states readable in the report.
                    "description": f"Face: {prev_angle} → {angle}",
                })

            # Pitch actions
            pitch = pose_angle.get("pitch", "neutral")
            prev_pitch = prev_pose_angle.get("pitch", "neutral")
            pitch_key = (prev_pitch, pitch)
            if pitch_key in FACE_PITCH_ACTIONS:
                all_actions["face"].append({
                    "action": FACE_PITCH_ACTIONS[pitch_key],
                    "description": f"Pitch: {prev_pitch} → {pitch}",
                })

    # 2. Eye actions (if eye landmarks available)
    if face_data and face_data.get("eye_landmarks"):
        all_actions["eyes"] = analyze_eye_actions(
            face_data["eye_landmarks"],
            face_data.get("prev_eye_landmarks"),
        )

    # 3. Mouth actions (if mouth landmarks available)
    if face_data and face_data.get("mouth_landmarks"):
        all_actions["mouth"] = analyze_mouth_actions(face_data["mouth_landmarks"])

    # 4. Arm actions / 6. Leg actions (if pose keypoints available)
    keypoints = pose_data.get("keypoints") if pose_data else None
    if keypoints:
        all_actions["arms"] = analyze_arm_actions(keypoints)

    # 5. Hand actions (if hand keypoints available)
    if hand_data:
        for side in ("left", "right"):
            hand = hand_data.get(f"{side}_hand")
            if hand:
                all_actions["hands"].extend(analyze_hand_actions(hand, side))

    if keypoints:
        all_actions["legs"] = analyze_leg_actions(keypoints)

    # 7. Combined actions: a composite fires when all of its component
    # action names were detected in any category above.
    detected_actions = {
        entry["action"] for entries in all_actions.values() for entry in entries
    }
    for combined_name, combined_def in COMBINED_ACTIONS.items():
        components = combined_def["components"]
        if all(comp in detected_actions for comp in components):
            all_actions["combined"].append({
                "action": combined_name,
                "description": combined_def["description"],
                "components": components,
            })
    return all_actions
def print_body_action_report(action_data: Dict) -> None:
    """
    Print a per-category summary of detected actions to stdout.

    Args:
        action_data: Dict mapping category names to lists of action dicts.
            Each action dict must have an "action" key; "description" is
            optional and falls back to the action name.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("Body Action Decoder Report")
    print(rule)

    for section in ("face", "eyes", "mouth", "arms", "hands", "legs", "feet", "combined"):
        entries = action_data.get(section, [])
        if not entries:
            # Categories without detections are left out of the report.
            continue
        print(f"\n{section.upper()} Actions ({len(entries)}):")
        for entry in entries:
            name = entry["action"]
            print(f" - {name}: {entry.get('description', name)}")

    print("\n" + rule)
# =============================================================================
# Main Entry Point
# =============================================================================
def _load_json_file(path: Optional[str]):
    """Load a UTF-8 encoded JSON file, or return None when no path was given."""
    if not path:
        return None
    # JSON interchange is UTF-8; do not depend on the platform default encoding.
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def main():
    """
    Command-line entry point.

    Loads the optional pose / face / hand JSON inputs, decodes the actions,
    prints the report and optionally writes the result to ``--output-json``.
    When no input is given, prints the list of supported action categories.
    """
    parser = argparse.ArgumentParser(description="Decode body actions from pose data")
    parser.add_argument("--pose-json", help="Path to pose.json (MediaPipe output)")
    parser.add_argument("--face-json", help="Path to face.json (InsightFace output)")
    parser.add_argument("--hand-json", help="Path to hand.json (MediaPipe Hand output)")
    parser.add_argument("--output-json", help="Output action data JSON")
    # NOTE(review): --frame is parsed but not used anywhere in this function.
    parser.add_argument("--frame", type=int, help="Analyze specific frame")
    args = parser.parse_args()

    print("=" * 70)
    print("Body Action Decoder")
    print("=" * 70)

    # Load data
    pose_data = _load_json_file(args.pose_json)
    face_data = _load_json_file(args.face_json)
    hand_data = _load_json_file(args.hand_json)

    # Analyze
    if pose_data or face_data or hand_data:
        action_data = decode_body_actions(
            pose_data=pose_data,
            face_data=face_data,
            hand_data=hand_data,
        )
        print_body_action_report(action_data)
        if args.output_json:
            with open(args.output_json, "w", encoding="utf-8") as f:
                json.dump(action_data, f, indent=2)
            print(f"\n✅ Output saved to: {args.output_json}")
    else:
        print("\n⚠️ No input data provided")
        print("\nAction Categories:")
        print(" - Face: turn_left, turn_right, look_up, look_down, shake_head, nod_head")
        print(" - Eyes: blink, close, wide_open, look_left, look_right")
        print(" - Mouth: open, close, smile, talk, yawn")
        print(" - Arms: raise_left, raise_right, cross_arms, wave, point")
        print(" - Hands: grab, open, clap, thumbs_up, fist, peace, ok")
        print(" - Legs: stand, sit, walk, run, jump, kick")
        print(" - Feet: tap, stomp, cross, point")
        print(" - Combined: thinking, listening, nodding_agreement, waving_greeting")


if __name__ == "__main__":
    main()