# momentry_core/scripts/utils/body_action_decoder.py

#!/opt/homebrew/bin/python3.11
"""
Body Action Decoder - Extended pose action analysis with body keypoints

Purpose:
1. Decode face pose actions (existing)
2. Decode body actions (future MediaPipe Holistic)
3. Integrate face + body actions for comprehensive analysis

Body Keypoints (MediaPipe Holistic):
- Face: 468 points (eyes, mouth, nose, etc.)
- Pose: 33 points (shoulders, elbows, hands, hips, knees, feet)
- Hands: 21 points per hand

Action Types:
- Face: turn_left, turn_right, look_up, look_down, shake_head, nod_head
- Eyes: blink, close, wide_open, look_left, look_right
- Mouth: open, close, smile, talk, yawn
- Arms: raise_left, raise_right, cross_arms, wave
- Hands: point, grab, clap, thumbs_up, fist
- Legs: stand, sit, walk, run, jump, kick
- Feet: tap, stomp, cross

Architecture:
┌─────────────────────────────────────────────────────────────────┐
│                     Body Action Decoder                         │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│  ┌───────────────┐  ┌───────────────┐  ┌───────────────┐       │
│  │ Face Actions  │  │ Body Actions  │  │ Hand Actions  │       │
│  │ (InsightFace) │  │ (MediaPipe)   │  │ (MediaPipe)   │       │
│  └───────────────┘  └───────────────┘  └───────────────┘       │
│         │                  │                  │                │
│         └──────────────────┼──────────────────┘                │
│                            │                                    │
│                    ┌───────▼───────┐                            │
│                    │  Action Merger│                            │
│                    └────────────────┘                            │
│                            │                                    │
│                    ┌───────▼───────┐                            │
│                    │ Action Timeline│                            │
│                    └────────────────┘                            │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘
"""

import argparse
import json
from typing import Dict, List, Optional

import numpy as np


# =============================================================================
# Face Action Definitions (Existing from pose_action_decoder.py)
# =============================================================================

# Lookup table for head-turn (yaw) transitions.
# Key:   (previous_angle_label, current_angle_label)
# Value: name of the action reported for that transition.
# decode_body_actions() looks up (prev_angle, angle) here; pairs that are not
# listed (including "no change" pairs such as ("frontal", "frontal")) yield no
# face-turn action.
FACE_TURN_ACTIONS = {
    ("frontal", "three_quarter"): "turn_partial",
    ("frontal", "profile_left"): "turn_left",
    ("frontal", "profile_right"): "turn_right",
    ("three_quarter", "frontal"): "return_frontal",
    ("three_quarter", "profile_left"): "turn_left",
    ("three_quarter", "profile_right"): "turn_right",
    ("profile_left", "frontal"): "turn_to_frontal",
    ("profile_left", "three_quarter"): "turn_to_three_quarter",
    ("profile_left", "profile_right"): "turn_full",
    ("profile_right", "frontal"): "turn_to_frontal",
    ("profile_right", "three_quarter"): "turn_to_three_quarter",
    ("profile_right", "profile_left"): "turn_full",
}

# Lookup table for head-pitch transitions.
# Key:   (previous_pitch_label, current_pitch_label)
# Value: name of the action reported for that transition.
# Only transitions to or from "neutral" are listed; a direct
# "tilted_up" <-> "tilted_down" change has no entry and yields no action.
FACE_PITCH_ACTIONS = {
    ("neutral", "tilted_up"): "look_up",
    ("neutral", "tilted_down"): "look_down",
    ("tilted_up", "neutral"): "return_neutral",
    ("tilted_down", "neutral"): "return_neutral",
}


# =============================================================================
# Eye Action Definitions
# =============================================================================

# Catalogue of eye actions, keyed by action name.
# Each entry holds:
#   "description": display label (Chinese text, emitted as-is)
#   "pattern":     human-readable detection rule (not evaluated by code)
#   "min_frames" / "max_frames": optional duration bounds, in frames
# NOTE(review): nothing in the code visible here reads this table at runtime.
# analyze_eye_actions() hard-codes the 0.15 / 0.4 thresholds, evaluates a
# single frame (no min/max frame handling), and emits names with a "_left"
# suffix ("close_left", "wide_open_left") rather than the keys used here.
# NOTE(review): the "squint" range (0.15-0.25) overlaps the "blink" rule
# (< 0.2) -- confirm how the two are meant to be told apart.
EYE_ACTIONS = {
    "blink": {
        "description": "眨眼",
        "pattern": "eye_aspect_ratio drops < 0.2 for 1-3 frames",
        "min_frames": 1,
        "max_frames": 3,
    },
    "close": {
        "description": "闭眼",
        "pattern": "eye_aspect_ratio < 0.15 for > 10 frames",
        "min_frames": 10,
    },
    "wide_open": {
        "description": "睁大眼",
        "pattern": "eye_aspect_ratio > 0.4",
    },
    "look_left": {
        "description": "向左看",
        "pattern": "iris_position_x < 0.3",
    },
    "look_right": {
        "description": "向右看",
        "pattern": "iris_position_x > 0.7",
    },
    "squint": {
        "description": "眯眼",
        "pattern": "eye_aspect_ratio 0.15-0.25",
    },
}


# =============================================================================
# Mouth Action Definitions
# =============================================================================

# Catalogue of mouth actions, keyed by action name.
# Each entry holds:
#   "description": display label (Chinese text, emitted as-is)
#   "pattern":     human-readable detection rule (not evaluated by code)
#   "min_frames":  optional minimum duration, in frames
# NOTE(review): nothing in the code visible here reads this table at runtime.
# analyze_mouth_actions() hard-codes the 0.7 / 0.5 / 0.2 mouth-aspect-ratio
# thresholds on a single frame, so the frame-count conditions for "talk" and
# "yawn" are not enforced, and "talk" / "pout" are never emitted by it.
MOUTH_ACTIONS = {
    "open": {
        "description": "张嘴",
        "pattern": "mouth_aspect_ratio > 0.5",
    },
    "close": {
        "description": "闭嘴",
        "pattern": "mouth_aspect_ratio < 0.2",
    },
    "smile": {
        "description": "微笑",
        "pattern": "mouth_corner_distance > threshold",
    },
    "talk": {
        "description": "说话",
        "pattern": "mouth_aspect_ratio oscillating 0.3-0.6",
        "min_frames": 10,
    },
    "yawn": {
        "description": "打哈欠",
        "pattern": "mouth_aspect_ratio > 0.7 for > 20 frames",
        "min_frames": 20,
    },
    "pout": {
        "description": "嘟嘴",
        "pattern": "lip_distance > threshold",
    },
}


# =============================================================================
# Arm Action Definitions
# =============================================================================

# Catalogue of arm actions, keyed by action name.
# Each entry holds:
#   "description": display label (Chinese text, emitted as-is)
#   "pattern":     human-readable detection rule (not evaluated by code)
#   "min_frames" / "max_frames": optional duration bounds, in frames
# The y comparisons assume image coordinates, where y grows downward (a raised
# wrist has a smaller y than the shoulder), matching analyze_arm_actions().
# The "cross_arms" pattern mirrors the check implemented in
# analyze_arm_actions(): the right wrist is compared against the left shoulder.
# NOTE(review): "raise_both", "wave" and "point" are catalogued here but are
# not emitted by analyze_arm_actions() as visible in this file.
ARM_ACTIONS = {
    "raise_left": {
        "description": "举起左手",
        "pattern": "left_shoulder_y > left_elbow_y > left_wrist_y",
    },
    "raise_right": {
        "description": "举起右手",
        "pattern": "right_shoulder_y > right_elbow_y > right_wrist_y",
    },
    "raise_both": {
        "description": "双手举起",
        "pattern": "both arms raised",
    },
    "cross_arms": {
        "description": "双手交叉",
        "pattern": "left_wrist_x > right_wrist_x AND right_wrist_x < left_shoulder_x",
    },
    "wave": {
        "description": "挥手",
        "pattern": "wrist_y oscillating ±20px for 5-15 frames",
        "min_frames": 5,
        "max_frames": 15,
    },
    "extend_left": {
        "description": "伸展左臂",
        "pattern": "left_elbow_angle > 150°",
    },
    "extend_right": {
        "description": "伸展右臂",
        "pattern": "right_elbow_angle > 150°",
    },
    "fold_left": {
        "description": "弯曲左臂",
        "pattern": "left_elbow_angle < 90°",
    },
    "fold_right": {
        "description": "弯曲右臂",
        "pattern": "right_elbow_angle < 90°",
    },
    "point": {
        "description": "指向",
        "pattern": "index_finger extended, other fingers folded",
    },
}


# =============================================================================
# Hand Action Definitions
# =============================================================================

# Catalogue of hand gestures, keyed by action name.
# Each entry holds:
#   "description": display label (Chinese text, emitted as-is)
#   "pattern":     human-readable detection rule (not evaluated by code)
#   "min_frames" / "max_frames": optional duration bounds, in frames
# NOTE(review): nothing in the code visible here reads this table at runtime.
# analyze_hand_actions() emits only open / fist / thumbs_up / peace / point,
# each with a "_left" or "_right" suffix; "point" is catalogued under
# ARM_ACTIONS rather than here, and the remaining gestures (grab, clap, ok,
# touch_*, pocket_*) have no detector in this file.
HAND_ACTIONS = {
    "grab": {
        "description": "抓取",
        "pattern": "fingers folded, thumb opposing",
    },
    "open": {
        "description": "张开手",
        "pattern": "all fingers extended",
    },
    "clap": {
        "description": "拍手",
        "pattern": "hands together then apart (velocity pattern)",
        "min_frames": 3,
        "max_frames": 10,
    },
    "thumbs_up": {
        "description": "点赞",
        "pattern": "thumb extended upward, other fingers folded",
    },
    "fist": {
        "description": "握拳",
        "pattern": "all fingers folded into palm",
    },
    "peace": {
        "description": "剪刀手",
        "pattern": "index and middle fingers extended",
    },
    "ok": {
        "description": "OK 手势",
        "pattern": "thumb and index finger touching",
    },
    "touch_face": {
        "description": "摸脸",
        "pattern": "hand near face region",
    },
    "touch_hair": {
        "description": "摸头发",
        "pattern": "hand above head region",
    },
    "pocket_left": {
        "description": "左手插兜",
        "pattern": "left_hand in hip region",
    },
    "pocket_right": {
        "description": "右手插兜",
        "pattern": "right_hand in hip region",
    },
}


# =============================================================================
# Leg Action Definitions
# =============================================================================

# Catalogue of leg actions, keyed by action name.
# Each entry holds:
#   "description": display label (Chinese text, emitted as-is)
#   "pattern":     human-readable detection rule (not evaluated by code)
#   "min_frames" / "max_frames": optional duration bounds, in frames
# The y comparisons assume image coordinates, where y grows downward
# (hip above knee above ankle means hip_y < knee_y < ankle_y).
# NOTE(review): nothing in the code visible here reads this table at runtime.
# analyze_leg_actions() emits only knee_bend_<side>, stand_<side> and "sit".
# NOTE(review): the "cross_left" and "cross_right" patterns are exact
# complements, so one of them holds whenever the ankle x values differ --
# confirm the intended rule.
LEG_ACTIONS = {
    "stand": {
        "description": "站立",
        "pattern": "hip_y < knee_y < ankle_y, vertical alignment",
    },
    "sit": {
        "description": "坐姿",
        "pattern": "hip_y ≈ knee_y, thigh horizontal",
    },
    "walk": {
        "description": "行走",
        "pattern": "hip-knee-ankle oscillating, stride pattern",
        "min_frames": 10,
    },
    "run": {
        "description": "奔跑",
        "pattern": "fast oscillating, knee_bend > 60°",
        "min_frames": 10,
    },
    "jump": {
        "description": "跳跃",
        "pattern": "all keypoints moving upward then landing",
        "min_frames": 5,
        "max_frames": 20,
    },
    "kick": {
        "description": "踢腿",
        "pattern": "one leg extended forward rapidly",
        "min_frames": 3,
        "max_frames": 15,
    },
    "cross_left": {
        "description": "左腿交叉",
        "pattern": "left_ankle_x > right_ankle_x",
    },
    "cross_right": {
        "description": "右腿交叉",
        "pattern": "right_ankle_x > left_ankle_x",
    },
    "knee_bend": {
        "description": "弯膝",
        "pattern": "knee_angle < 120°",
    },
}


# =============================================================================
# Feet Action Definitions
# =============================================================================

# Catalogue of feet actions, keyed by action name.
# Each entry holds:
#   "description": display label (Chinese text, emitted as-is)
#   "pattern":     human-readable detection rule (not evaluated by code)
#   "min_frames" / "max_frames": optional duration bounds, in frames
# NOTE(review): no analyzer in the code visible here produces feet actions;
# decode_body_actions() creates a "feet" list but never fills it.
FEET_ACTIONS = {
    "tap": {
        "description": "轻踏",
        "pattern": "ankle_y oscillating ±10px",
        "min_frames": 3,
        "max_frames": 15,
    },
    "stomp": {
        "description": "重踏",
        "pattern": "ankle_y large downward movement",
        "min_frames": 3,
    },
    "cross": {
        "description": "交叉脚",
        "pattern": "feet_x overlapping",
    },
    "point_left": {
        "description": "左脚前伸",
        "pattern": "left_ankle_y < right_ankle_y",
    },
    "point_right": {
        "description": "右脚前伸",
        "pattern": "right_ankle_y < left_ankle_y",
    },
}


# =============================================================================
# Combined Actions (Face + Body)
# =============================================================================

# Composite (face + body) actions, keyed by composite name.
# Each entry holds:
#   "description": display label (Chinese text, emitted as-is)
#   "components":  action names that must ALL be detected in the same call
#   "pattern":     human-readable summary (not evaluated by code)
# decode_body_actions() reports a composite when every name in "components"
# appears among the action names detected across all categories. Dict order
# determines the order in which composites are reported.
# NOTE(review): several component names ("open_mouth", "frown", "nod_head",
# "shake_head", "frontal_stable", "touch_face", "wave", "raise_both") are not
# emitted by any analyzer visible in this file, and hand gestures are emitted
# with a side suffix (e.g. "point_left"), so "point" never matches as written.
# As far as this file shows, only "sitting_relaxed" can currently trigger --
# confirm against the other producers of action names.
COMBINED_ACTIONS = {
    "thinking": {
        "description": "思考姿势",
        "components": ["touch_face", "look_down"],
        "pattern": "hand near chin + head tilted down",
    },
    "listening": {
        "description": "倾听姿势",
        "components": ["turn_partial", "open_mouth"],
        "pattern": "slight turn + mouth slightly open",
    },
    "nodding_agreement": {
        "description": "点头同意",
        "components": ["nod_head", "smile"],
        "pattern": "head nod + smile",
    },
    "shaking_disagreement": {
        "description": "摇头不同意",
        "components": ["shake_head", "frown"],
        "pattern": "shake head + frown",
    },
    "waving_greeting": {
        "description": "挥手打招呼",
        "components": ["wave", "smile"],
        "pattern": "wave hand + smile",
    },
    "crossing_arms_defensive": {
        "description": "双手交叉防御",
        "components": ["cross_arms", "frontal_stable"],
        "pattern": "cross arms + frontal pose",
    },
    "pointing_explaining": {
        "description": "指向解释",
        "components": ["point", "turn_partial"],
        "pattern": "pointing + slight turn",
    },
    "stretching": {
        "description": "伸展",
        "components": ["raise_both", "look_up"],
        "pattern": "raise arms + look up",
    },
    "sitting_relaxed": {
        "description": "放松坐姿",
        "components": ["sit", "cross_arms"],
        "pattern": "sit + cross arms",
    },
}


# =============================================================================
# Analysis Functions
# =============================================================================

def analyze_eye_actions(eye_landmarks: List, prev_eye_landmarks: Optional[List] = None) -> List[Dict]:
    """
    Detect eye actions for one eye from its six contour landmarks.

    The eye aspect ratio (EAR) is computed from the first six landmarks
    p1..p6 as ``(|p2 - p6| + |p3 - p5|) / (2 * |p1 - p4|)``. An EAR below
    0.15 is reported as a closed eye, above 0.4 as a wide-open eye.

    Args:
        eye_landmarks: Current-frame eye landmarks. Only the first six entries
            are used and they are reported as the left eye; any further
            entries are ignored.
        prev_eye_landmarks: Previous-frame landmarks. Accepted for interface
            compatibility; not used by the current implementation.

    Returns:
        List with at most one action dict (``action``, ``description``,
        ``confidence``). ``confidence`` is always within [0, 1]. The list is
        empty when fewer than six landmarks are given, when the eye has zero
        width (EAR undefined), or when the EAR is in the neutral range.
    """
    actions = []

    if not eye_landmarks or len(eye_landmarks) < 6:
        return actions

    p1, p2, p3, p4, p5, p6 = (np.asarray(pt, dtype=float) for pt in eye_landmarks[:6])

    eye_width = np.linalg.norm(p1 - p4)
    if not eye_width > 0:
        # Degenerate landmarks (both eye corners coincide, or NaN): the EAR is
        # undefined, so report nothing instead of a spurious "closed" eye.
        return actions

    left_ear = float((np.linalg.norm(p2 - p6) + np.linalg.norm(p3 - p5)) / (2 * eye_width))

    if left_ear < 0.15:
        actions.append({"action": "close_left", "description": "闭左眼", "confidence": 1.0 - left_ear})
    elif left_ear > 0.4:
        # The raw EAR is unbounded above; clamp so confidence stays a valid score.
        actions.append({"action": "wide_open_left", "description": "睁大左眼", "confidence": min(1.0, left_ear)})

    return actions


def analyze_mouth_actions(mouth_landmarks: List) -> List[Dict]:
    """
    Classify the mouth state of a single frame.

    The mouth aspect ratio (MAR) is the upper-lip/lower-lip distance divided
    by the corner-to-corner width (0 when the width is zero). The first
    matching rule wins: MAR > 0.7 yawn, > 0.5 open, < 0.2 close; otherwise a
    smile is reported when the summed vertical offset of both mouth corners
    from the upper lip exceeds 10.

    Args:
        mouth_landmarks: Sequence whose first four entries are, in order,
            upper lip, lower lip, left corner and right corner points.

    Returns:
        List with at most one action dict; empty when fewer than four
        landmarks are given or no rule matches.
    """
    if not mouth_landmarks or len(mouth_landmarks) < 4:
        return []

    top_lip = np.array(mouth_landmarks[0])
    bottom_lip = np.array(mouth_landmarks[1])
    corner_l = np.array(mouth_landmarks[2])
    corner_r = np.array(mouth_landmarks[3])

    opening = np.linalg.norm(top_lip - bottom_lip)
    width = np.linalg.norm(corner_l - corner_r)
    mar = opening / width if width > 0 else 0

    # Guard-clause cascade: largest opening first, then the closed mouth.
    if mar > 0.7:
        return [{"action": "yawn", "description": "打哈欠", "mar": mar}]
    if mar > 0.5:
        return [{"action": "open", "description": "张嘴", "mar": mar}]
    if mar < 0.2:
        return [{"action": "close", "description": "闭嘴", "mar": mar}]

    # Mid-range opening: use how far the corners sit from the upper lip
    # (index 1 of each point) as the smile cue.
    corner_distance = abs(corner_l[1] - top_lip[1]) + abs(corner_r[1] - top_lip[1])
    if corner_distance > 10:
        return [{"action": "smile", "description": "微笑", "corner_distance": corner_distance}]

    return []


def _elbow_angle_deg(shoulder, elbow, wrist) -> Optional[float]:
    """
    Return the interior angle at the elbow, in degrees.

    The angle is measured between the elbow->shoulder and elbow->wrist
    vectors, so a straight arm is ~180 and a fully folded arm is ~0.
    Returns None when either limb segment has zero or non-finite length,
    because the angle is undefined there.
    """
    elbow_pt = np.asarray(elbow, dtype=float)
    to_shoulder = np.asarray(shoulder, dtype=float) - elbow_pt
    to_wrist = np.asarray(wrist, dtype=float) - elbow_pt

    scale = np.linalg.norm(to_shoulder) * np.linalg.norm(to_wrist)
    if not scale > 0:  # zero-length segment, or NaN coordinates
        return None

    # Rounding can push the cosine marginally outside [-1, 1], where arccos
    # would return NaN; clip before taking the inverse cosine.
    cosine = np.clip(np.dot(to_shoulder, to_wrist) / scale, -1.0, 1.0)
    return float(np.degrees(np.arccos(cosine)))


def _single_arm_actions(side: str, side_zh: str, shoulder, elbow, wrist) -> List[Dict]:
    """Return raise / extend / fold actions for one arm ("left" or "right")."""
    found = []
    angle = _elbow_angle_deg(shoulder, elbow, wrist)

    # Image coordinates: y decreases upward, so a raised arm has
    # wrist above elbow above shoulder (wrist_y < elbow_y < shoulder_y).
    if wrist[1] < elbow[1] < shoulder[1]:
        found.append({"action": f"raise_{side}", "description": f"举起{side_zh}手", "angle": angle})

    if angle is not None:
        if angle > 150:
            found.append({"action": f"extend_{side}", "description": f"伸展{side_zh}臂", "angle": angle})
        elif angle < 90:
            found.append({"action": f"fold_{side}", "description": f"弯曲{side_zh}臂", "angle": angle})

    return found


def analyze_arm_actions(pose_keypoints: Dict) -> List[Dict]:
    """
    Analyze arm actions from pose keypoints.

    For each arm whose shoulder, elbow and wrist are all present, reports:
    - ``raise_<side>`` when wrist is above elbow and elbow above shoulder;
    - ``extend_<side>`` when the interior elbow angle exceeds 150 degrees;
    - ``fold_<side>`` when the interior elbow angle is below 90 degrees.
    ``cross_arms`` is reported when the left wrist lies to the right of the
    right wrist (larger x) and the right wrist lies left of the left shoulder.

    The elbow angle is the interior joint angle (straight arm = 180), which
    is what the ">150 = extended / <90 = folded" thresholds describe.

    Args:
        pose_keypoints: Mapping with optional keys ``left_shoulder``,
            ``left_elbow``, ``left_wrist``, ``right_shoulder``,
            ``right_elbow``, ``right_wrist``; each value is a point indexable
            as ``[x, y, ...]`` in image coordinates (y grows downward).

    Returns:
        List of action dicts in the order: left arm, right arm, cross_arms.
        Arm actions carry the interior elbow ``angle`` in degrees. Missing
        keypoints simply suppress the checks that need them.
    """
    actions = []

    # Keypoint indices (MediaPipe Pose):
    # 11: left_shoulder, 12: right_shoulder
    # 13: left_elbow, 14: right_elbow
    # 15: left_wrist, 16: right_wrist

    left_shoulder = pose_keypoints.get("left_shoulder")
    left_elbow = pose_keypoints.get("left_elbow")
    left_wrist = pose_keypoints.get("left_wrist")

    right_shoulder = pose_keypoints.get("right_shoulder")
    right_elbow = pose_keypoints.get("right_elbow")
    right_wrist = pose_keypoints.get("right_wrist")

    if left_shoulder and left_elbow and left_wrist:
        actions.extend(_single_arm_actions("left", "左", left_shoulder, left_elbow, left_wrist))

    if right_shoulder and right_elbow and right_wrist:
        actions.extend(_single_arm_actions("right", "右", right_shoulder, right_elbow, right_wrist))

    # Cross arms detection: needs the left shoulder as the reference point,
    # so skip the check (rather than fail) when it was not detected.
    if left_wrist and right_wrist and left_shoulder:
        if left_wrist[0] > right_wrist[0] and right_wrist[0] < left_shoulder[0]:
            actions.append({"action": "cross_arms", "description": "双手交叉"})

    return actions


def analyze_hand_actions(hand_keypoints: List, hand_type: str = "right") -> List[Dict]:
    """
    Classify hand gestures from a single frame of hand keypoints.

    A per-finger "extension" is the distance between the fingertip and a base
    joint of the same finger. Gestures are derived from fixed thresholds on
    those distances (in the same units as the keypoints):
    - mean extension > 50 -> open hand; < 30 -> fist (mutually exclusive);
    - thumbs up, peace sign and pointing are checked independently and may be
      reported in addition to the above.

    Args:
        hand_keypoints: At least 21 hand keypoints (MediaPipe Hand order).
        hand_type: "left" or "right"; appended to every action name and
            embedded in the description.

    Returns:
        List of action dicts (``action``, ``description``); empty when fewer
        than 21 keypoints are supplied or nothing matches.
    """
    if not hand_keypoints or len(hand_keypoints) < 21:
        return []

    # MediaPipe Hand keypoint indices:
    #   0: wrist
    #   1-4: thumb (CMC, MCP, IP, TIP)
    #   5-8: index, 9-12: middle, 13-16: ring, 17-20: pinky (MCP, PIP, DIP, TIP)
    # (tip, base) pairs in thumb, index, middle, ring, pinky order. The thumb
    # is measured from keypoint 2 (MCP), the other fingers from their MCP.
    tip_base_pairs = ((4, 2), (8, 5), (12, 9), (16, 13), (20, 17))

    extensions = [
        np.linalg.norm(np.array(hand_keypoints[tip]) - np.array(hand_keypoints[base]))
        for tip, base in tip_base_pairs
    ]
    thumb_ext, index_ext, middle_ext, ring_ext, pinky_ext = extensions

    gestures = []

    def report(name: str, description: str) -> None:
        gestures.append({"action": f"{name}_{hand_type}", "description": description})

    # Overall openness: open hand and fist exclude each other.
    overall = np.mean(extensions)
    if overall > 50:
        report("open", f"张开{hand_type}手")
    elif overall < 30:
        report("fist", f"握{hand_type}拳")

    # Thumbs up: thumb out, the other four fingers folded on average.
    if thumb_ext > 40 and np.mean(extensions[1:]) < 30:
        report("thumbs_up", f"{hand_type}手点赞")

    # Peace sign: index and middle out, ring and pinky folded on average.
    if index_ext > 40 and middle_ext > 40 and np.mean(extensions[3:]) < 30:
        report("peace", f"{hand_type}手剪刀手")

    # Pointing: index out, all remaining fingers folded on average.
    if index_ext > 40 and np.mean([thumb_ext, middle_ext, ring_ext, pinky_ext]) < 30:
        report("point", f"{hand_type}手指向")

    return gestures


def _knee_angle_deg(hip, knee, ankle) -> Optional[float]:
    """
    Return the interior angle at the knee, in degrees.

    The angle is measured between the knee->hip and knee->ankle vectors, so a
    straight leg is ~180 and a deeply bent knee approaches 0. Returns None
    when either limb segment has zero or non-finite length, because the angle
    is undefined there.
    """
    knee_pt = np.asarray(knee, dtype=float)
    to_hip = np.asarray(hip, dtype=float) - knee_pt
    to_ankle = np.asarray(ankle, dtype=float) - knee_pt

    scale = np.linalg.norm(to_hip) * np.linalg.norm(to_ankle)
    if not scale > 0:  # zero-length segment, or NaN coordinates
        return None

    # Rounding can push the cosine marginally outside [-1, 1], where arccos
    # would return NaN; clip before taking the inverse cosine.
    cosine = np.clip(np.dot(to_hip, to_ankle) / scale, -1.0, 1.0)
    return float(np.degrees(np.arccos(cosine)))


def _single_leg_actions(side: str, side_zh: str, hip, knee, ankle) -> List[Dict]:
    """Return knee-bend / stand actions for one leg ("left" or "right")."""
    found = []
    angle = _knee_angle_deg(hip, knee, ankle)

    if angle is not None and angle < 120:
        found.append({"action": f"knee_bend_{side}", "description": f"弯{side_zh}膝", "angle": angle})

    # Image coordinates: y increases downward, so an upright leg has
    # hip above knee above ankle (hip_y < knee_y < ankle_y).
    if hip[1] < knee[1] < ankle[1]:
        found.append({"action": f"stand_{side}", "description": f"{side_zh}腿站立"})

    return found


def analyze_leg_actions(pose_keypoints: Dict) -> List[Dict]:
    """
    Analyze leg actions from pose keypoints.

    For each leg whose hip, knee and ankle are all present, reports:
    - ``knee_bend_<side>`` when the interior knee angle is below 120 degrees;
    - ``stand_<side>`` when hip, knee and ankle are stacked top to bottom.
    ``sit`` is reported when both hips and both knees are present and the
    average hip height is within 30 units of the average knee height.

    The knee angle is the interior joint angle (straight leg = 180), which is
    what the "<120 = bent" threshold describes.

    Args:
        pose_keypoints: Mapping with optional keys ``left_hip``,
            ``left_knee``, ``left_ankle``, ``right_hip``, ``right_knee``,
            ``right_ankle``; each value is a point indexable as
            ``[x, y, ...]`` in image coordinates (y grows downward).

    Returns:
        List of action dicts in the order: left leg, right leg, sit.
        Knee-bend actions carry the interior knee ``angle`` in degrees.
    """
    actions = []

    # Keypoint indices (MediaPipe Pose):
    # 23: left_hip, 24: right_hip
    # 25: left_knee, 26: right_knee
    # 27: left_ankle, 28: right_ankle

    left_hip = pose_keypoints.get("left_hip")
    left_knee = pose_keypoints.get("left_knee")
    left_ankle = pose_keypoints.get("left_ankle")

    right_hip = pose_keypoints.get("right_hip")
    right_knee = pose_keypoints.get("right_knee")
    right_ankle = pose_keypoints.get("right_ankle")

    if left_hip and left_knee and left_ankle:
        actions.extend(_single_leg_actions("left", "左", left_hip, left_knee, left_ankle))

    if right_hip and right_knee and right_ankle:
        actions.extend(_single_leg_actions("right", "右", right_hip, right_knee, right_ankle))

    # Sit detection: thighs roughly horizontal, i.e. hips and knees at a
    # similar height. Ankles are not required for this check.
    if left_hip and left_knee and right_hip and right_knee:
        hip_avg_y = (left_hip[1] + right_hip[1]) / 2
        knee_avg_y = (left_knee[1] + right_knee[1]) / 2

        if abs(hip_avg_y - knee_avg_y) < 30:
            actions.append({"action": "sit", "description": "坐姿"})

    return actions


# =============================================================================
# Main Decoder Function
# =============================================================================

def decode_body_actions(
    pose_data: Optional[Dict] = None,
    face_data: Optional[Dict] = None,
    hand_data: Optional[Dict] = None,
) -> Dict:
    """
    Decode all body actions from multiple data sources.

    Each source is optional; a missing (None or empty) source leaves the
    categories it feeds as empty lists.

    Args:
        pose_data: Pose estimation data; its ``"keypoints"`` entry is passed
            to the arm and leg analyzers.
        face_data: Face data. Reads ``"pose_angle"`` / ``"prev_pose_angle"``
            (dicts with ``"angle"`` and ``"pitch"`` labels), and optionally
            ``"eye_landmarks"``, ``"prev_eye_landmarks"`` and
            ``"mouth_landmarks"``. Null values are treated like missing keys.
        hand_data: Hand tracking data with optional ``"left_hand"`` and
            ``"right_hand"`` keypoint lists.

    Returns:
        Dict with the keys ``face``, ``eyes``, ``mouth``, ``arms``, ``hands``,
        ``legs``, ``feet`` and ``combined``, each a list of action dicts.
        ``feet`` is always empty: no feet analyzer is wired in here.
    """
    all_actions = {
        "face": [],
        "eyes": [],
        "mouth": [],
        "arms": [],
        "hands": [],
        "legs": [],
        "feet": [],
        "combined": [],
    }

    if face_data:
        # 1. Face actions. `or {}` also covers keys that are present but null
        # (e.g. JSON `"prev_pose_angle": null`), which `.get(key, {})` does not.
        pose_angle = face_data.get("pose_angle") or {}
        prev_pose_angle = face_data.get("prev_pose_angle") or {}

        # Turn (yaw) transitions need both the current and previous pose.
        if pose_angle and prev_pose_angle:
            angle = pose_angle.get("angle", "unknown")
            prev_angle = prev_pose_angle.get("angle", "unknown")

            turn_key = (prev_angle, angle)
            if turn_key in FACE_TURN_ACTIONS:
                all_actions["face"].append({
                    "action": FACE_TURN_ACTIONS[turn_key],
                    "description": f"Face: {prev_angle} → {angle}",
                })

        # Pitch transitions: a missing pitch defaults to "neutral".
        # NOTE(review): unlike the turn check, this also runs when there is no
        # previous pose, so a first frame that is already tilted is reported
        # as a transition from "neutral" -- confirm this is intended.
        pitch = pose_angle.get("pitch", "neutral")
        prev_pitch = prev_pose_angle.get("pitch", "neutral")

        pitch_key = (prev_pitch, pitch)
        if pitch_key in FACE_PITCH_ACTIONS:
            all_actions["face"].append({
                "action": FACE_PITCH_ACTIONS[pitch_key],
                "description": f"Pitch: {prev_pitch} → {pitch}",
            })

        # 2. Eye actions (if eye landmarks available)
        eye_landmarks = face_data.get("eye_landmarks")
        if eye_landmarks:
            all_actions["eyes"] = analyze_eye_actions(
                eye_landmarks,
                face_data.get("prev_eye_landmarks"),
            )

        # 3. Mouth actions (if mouth landmarks available)
        mouth_landmarks = face_data.get("mouth_landmarks")
        if mouth_landmarks:
            all_actions["mouth"] = analyze_mouth_actions(mouth_landmarks)

    # 4. Arm actions (if pose keypoints available)
    keypoints = pose_data.get("keypoints") if pose_data else None
    if keypoints:
        all_actions["arms"] = analyze_arm_actions(keypoints)

    # 5. Hand actions (if hand keypoints available)
    if hand_data:
        for side in ("left", "right"):
            side_keypoints = hand_data.get(f"{side}_hand")
            if side_keypoints:
                all_actions["hands"].extend(analyze_hand_actions(side_keypoints, side))

    # 6. Leg actions (same pose keypoints as the arms)
    if keypoints:
        all_actions["legs"] = analyze_leg_actions(keypoints)

    # 7. Combined actions: a composite fires when every one of its component
    # names was detected in any category above.
    detected = {entry["action"] for entries in all_actions.values() for entry in entries}

    for combined_name, combined_def in COMBINED_ACTIONS.items():
        components = combined_def["components"]
        if all(component in detected for component in components):
            all_actions["combined"].append({
                "action": combined_name,
                "description": combined_def["description"],
                "components": components,
            })

    return all_actions


def print_body_action_report(action_data: Dict) -> None:
    """
    Print a human-readable report of decoded actions to stdout.

    Categories are printed in a fixed order (face, eyes, mouth, arms, hands,
    legs, feet, combined); categories that are missing or empty are skipped.
    Each action is shown as ``name: description``, falling back to the action
    name when it has no description.

    Args:
        action_data: Mapping of category name to a list of action dicts, each
            with an ``"action"`` key and an optional ``"description"`` key.
    """
    rule = "=" * 70

    print("\n" + rule)
    print("Body Action Decoder Report")
    print(rule)

    for category in ("face", "eyes", "mouth", "arms", "hands", "legs", "feet", "combined"):
        entries = action_data.get(category, [])
        if not entries:
            continue

        print(f"\n{category.upper()} Actions ({len(entries)}):")
        for entry in entries:
            name = entry["action"]
            print(f"  - {name}: {entry.get('description', name)}")

    print("\n" + rule)


# =============================================================================
# Main Entry Point
# =============================================================================

def _load_json_file(path: Optional[str]):
    """Return the parsed JSON content of *path*, or None when no path is given."""
    if not path:
        return None
    # JSON interchange text is UTF-8; do not depend on the platform default.
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def main():
    """
    Command-line entry point.

    Loads the optional pose / face / hand JSON files, decodes the actions,
    prints a report and, when ``--output-json`` is given, writes the decoded
    actions to that file. With no input files, prints the list of supported
    action categories instead.
    """
    parser = argparse.ArgumentParser(description="Decode body actions from pose data")
    parser.add_argument("--pose-json", help="Path to pose.json (MediaPipe output)")
    parser.add_argument("--face-json", help="Path to face.json (InsightFace output)")
    parser.add_argument("--hand-json", help="Path to hand.json (MediaPipe Hand output)")
    parser.add_argument("--output-json", help="Output action data JSON")
    parser.add_argument("--frame", type=int, help="Analyze specific frame")
    args = parser.parse_args()

    print("=" * 70)
    print("Body Action Decoder")
    print("=" * 70)

    if args.frame is not None:
        # The option is accepted but no frame selection is implemented; say so
        # rather than silently ignoring it.
        print("\n⚠️ --frame is not implemented yet and will be ignored")

    # Load data
    pose_data = _load_json_file(args.pose_json)
    face_data = _load_json_file(args.face_json)
    hand_data = _load_json_file(args.hand_json)

    # Analyze
    if pose_data or face_data or hand_data:
        action_data = decode_body_actions(
            pose_data=pose_data,
            face_data=face_data,
            hand_data=hand_data,
        )

        print_body_action_report(action_data)

        if args.output_json:
            # Action descriptions contain non-ASCII text; write it as readable
            # UTF-8 instead of \uXXXX escapes (equivalent JSON either way).
            with open(args.output_json, "w", encoding="utf-8") as f:
                json.dump(action_data, f, indent=2, ensure_ascii=False)
            print(f"\n✅ Output saved to: {args.output_json}")
    else:
        print("\n⚠️ No input data provided")
        print("\nAction Categories:")
        print("  - Face: turn_left, turn_right, look_up, look_down, shake_head, nod_head")
        print("  - Eyes: blink, close, wide_open, look_left, look_right")
        print("  - Mouth: open, close, smile, talk, yawn")
        print("  - Arms: raise_left, raise_right, cross_arms, wave, point")
        print("  - Hands: grab, open, clap, thumbs_up, fist, peace, ok")
        print("  - Legs: stand, sit, walk, run, jump, kick")
        print("  - Feet: tap, stomp, cross, point")
        print("  - Combined: thinking, listening, nodding_agreement, waving_greeting")


if __name__ == "__main__":
    main()