- Remove session-ses_2f27.md (161KB raw session log) - Remove 49 ROOT_* duplicate files across REFERENCE/ - Remove 14 duplicate files between REFERENCE/ root and history/ - Remove asr_legacy.rs (dead code, replaced by asr.rs) - Remove src/core/worker/ (duplicate JobWorker) - Remove src/core/layers/ (empty directory) - Remove 4 .bak files in src/ - Remove 7 dead private methods in worker/processor.rs - Remove backup directory from git tracking
875 lines
30 KiB
Python
875 lines
30 KiB
Python
#!/opt/homebrew/bin/python3.11
|
|
"""
|
|
Body Action Decoder - Extended pose action analysis with body keypoints
|
|
|
|
Purpose:
|
|
1. Decode face pose actions (existing)
|
|
2. Decode body actions (future MediaPipe Holistic)
|
|
3. Integrate face + body actions for comprehensive analysis
|
|
|
|
Body Keypoints (MediaPipe Holistic):
|
|
- Face: 468 points (eyes, mouth, nose, etc.)
|
|
- Pose: 33 points (shoulders, elbows, hands, hips, knees, feet)
|
|
- Hands: 21 points per hand
|
|
|
|
Action Types:
|
|
- Face: turn_left, turn_right, look_up, look_down, shake_head, nod_head
|
|
- Eyes: blink, close, wide_open, look_left, look_right
|
|
- Mouth: open, close, smile, talk, yawn
|
|
- Arms: raise_left, raise_right, cross_arms, wave
|
|
- Hands: point, grab, clap, thumbs_up, fist
|
|
- Legs: stand, sit, walk, run, jump, kick
|
|
- Feet: tap, stomp, cross
|
|
|
|
Architecture:
|
|
┌─────────────────────────────────────────────────────────────────┐
|
|
│ Body Action Decoder │
|
|
├─────────────────────────────────────────────────────────────────┤
|
|
│ │
|
|
│ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │
|
|
│ │ Face Actions │ │ Body Actions │ │ Hand Actions │ │
|
|
│ │ (InsightFace) │ │ (MediaPipe) │ │ (MediaPipe) │ │
|
|
│ └───────────────┘ └───────────────┘ └───────────────┘ │
|
|
│ │ │ │ │
|
|
│ └──────────────────┼──────────────────┘ │
|
|
│ │ │
|
|
│ ┌───────▼───────┐ │
|
|
│ │ Action Merger│ │
|
|
│ └────────────────┘ │
|
|
│ │ │
|
|
│ ┌───────▼───────┐ │
|
|
│ │ Action Timeline│ │
|
|
│ └────────────────┘ │
|
|
│ │
|
|
└─────────────────────────────────────────────────────────────────┘
|
|
"""
|
|
|
|
import json
|
|
import argparse
|
|
import numpy as np
|
|
from typing import Dict, List
|
|
|
|
|
|
# =============================================================================
|
|
# Face Action Definitions (Existing from pose_action_decoder.py)
|
|
# =============================================================================
|
|
|
|
# Lookup table keyed by (previous_orientation, current_orientation); the value
# is the name of the head-turn action that the transition represents.
# Transitions that are not listed here yield no turn action.
FACE_TURN_ACTIONS = {
    # Leaving a frontal view
    ("frontal", "three_quarter"): "turn_partial",
    ("frontal", "profile_left"): "turn_left",
    ("frontal", "profile_right"): "turn_right",
    # Leaving a three-quarter view
    ("three_quarter", "frontal"): "return_frontal",
    ("three_quarter", "profile_left"): "turn_left",
    ("three_quarter", "profile_right"): "turn_right",
    # Leaving a left profile
    ("profile_left", "frontal"): "turn_to_frontal",
    ("profile_left", "three_quarter"): "turn_to_three_quarter",
    ("profile_left", "profile_right"): "turn_full",
    # Leaving a right profile
    ("profile_right", "frontal"): "turn_to_frontal",
    ("profile_right", "three_quarter"): "turn_to_three_quarter",
    ("profile_right", "profile_left"): "turn_full",
}
|
|
|
|
# Lookup table keyed by (previous_pitch, current_pitch); the value is the name
# of the head-pitch action for that transition. Only transitions to or from
# "neutral" are listed.
FACE_PITCH_ACTIONS = {
    # Tilting away from neutral
    ("neutral", "tilted_up"): "look_up",
    ("neutral", "tilted_down"): "look_down",
    # Coming back to neutral
    ("tilted_up", "neutral"): "return_neutral",
    ("tilted_down", "neutral"): "return_neutral",
}
|
|
|
|
|
|
# =============================================================================
|
|
# Eye Action Definitions
|
|
# =============================================================================
|
|
|
|
# Catalog of eye actions. Each entry holds a display description (Chinese), a
# human-readable detection pattern and, for timed actions, frame-count bounds.
# NOTE(review): no function visible in this file reads this table; it appears
# to serve as reference data — confirm before relying on the thresholds.
EYE_ACTIONS = {
    "blink": {  # description means "blink"
        "description": "眨眼",
        "pattern": "eye_aspect_ratio drops < 0.2 for 1-3 frames",
        "min_frames": 1,
        "max_frames": 3,
    },
    "close": {  # description means "eyes closed"
        "description": "闭眼",
        "pattern": "eye_aspect_ratio < 0.15 for > 10 frames",
        "min_frames": 10,
    },
    # description means "eyes wide open"
    "wide_open": {"description": "睁大眼", "pattern": "eye_aspect_ratio > 0.4"},
    # description means "look left"
    "look_left": {"description": "向左看", "pattern": "iris_position_x < 0.3"},
    # description means "look right"
    "look_right": {"description": "向右看", "pattern": "iris_position_x > 0.7"},
    # description means "squint"
    "squint": {"description": "眯眼", "pattern": "eye_aspect_ratio 0.15-0.25"},
}
|
|
|
|
|
|
# =============================================================================
|
|
# Mouth Action Definitions
|
|
# =============================================================================
|
|
|
|
# Catalog of mouth actions. Each entry holds a display description (Chinese), a
# human-readable detection pattern and, for timed actions, a minimum duration
# in frames.
# NOTE(review): no function visible in this file reads this table; it appears
# to serve as reference data — confirm before relying on the thresholds.
MOUTH_ACTIONS = {
    # description means "mouth open"
    "open": {"description": "张嘴", "pattern": "mouth_aspect_ratio > 0.5"},
    # description means "mouth closed"
    "close": {"description": "闭嘴", "pattern": "mouth_aspect_ratio < 0.2"},
    # description means "smile"
    "smile": {"description": "微笑", "pattern": "mouth_corner_distance > threshold"},
    "talk": {  # description means "talking"
        "description": "说话",
        "pattern": "mouth_aspect_ratio oscillating 0.3-0.6",
        "min_frames": 10,
    },
    "yawn": {  # description means "yawn"
        "description": "打哈欠",
        "pattern": "mouth_aspect_ratio > 0.7 for > 20 frames",
        "min_frames": 20,
    },
    # description means "pout"
    "pout": {"description": "嘟嘴", "pattern": "lip_distance > threshold"},
}
|
|
|
|
|
|
# =============================================================================
|
|
# Arm Action Definitions
|
|
# =============================================================================
|
|
|
|
# Catalog of arm actions. Each entry holds a display description (Chinese), a
# human-readable detection pattern and, for timed actions, frame-count bounds.
# NOTE(review): no function visible in this file reads this table; it appears
# to serve as reference data — confirm before relying on the thresholds.
ARM_ACTIONS = {
    "raise_left": {  # description means "raise left hand"
        "description": "举起左手",
        "pattern": "left_shoulder_y > left_elbow_y > left_wrist_y",
    },
    "raise_right": {  # description means "raise right hand"
        "description": "举起右手",
        "pattern": "right_shoulder_y > right_elbow_y > right_wrist_y",
    },
    # description means "both hands raised"
    "raise_both": {"description": "双手举起", "pattern": "both arms raised"},
    "cross_arms": {  # description means "arms crossed"
        "description": "双手交叉",
        # NOTE(review): both clauses of this pattern state the same inequality;
        # analyze_arm_actions compares right_wrist_x against left_shoulder_x.
        "pattern": "left_wrist_x > right_wrist_x AND right_wrist_x < left_wrist_x",
    },
    "wave": {  # description means "wave"
        "description": "挥手",
        "pattern": "wrist_y oscillating ±20px for 5-15 frames",
        "min_frames": 5,
        "max_frames": 15,
    },
    # description means "extend left arm"
    "extend_left": {"description": "伸展左臂", "pattern": "left_elbow_angle > 150°"},
    # description means "extend right arm"
    "extend_right": {"description": "伸展右臂", "pattern": "right_elbow_angle > 150°"},
    # description means "bend left arm"
    "fold_left": {"description": "弯曲左臂", "pattern": "left_elbow_angle < 90°"},
    # description means "bend right arm"
    "fold_right": {"description": "弯曲右臂", "pattern": "right_elbow_angle < 90°"},
    "point": {  # description means "point"
        "description": "指向",
        "pattern": "index_finger extended, other fingers folded",
    },
}
|
|
|
|
|
|
# =============================================================================
|
|
# Hand Action Definitions
|
|
# =============================================================================
|
|
|
|
# Catalog of hand actions. Each entry holds a display description (Chinese), a
# human-readable detection pattern and, for timed actions, frame-count bounds.
# NOTE(review): no function visible in this file reads this table; it appears
# to serve as reference data — confirm before relying on it.
HAND_ACTIONS = {
    # description means "grab"
    "grab": {"description": "抓取", "pattern": "fingers folded, thumb opposing"},
    # description means "open hand"
    "open": {"description": "张开手", "pattern": "all fingers extended"},
    "clap": {  # description means "clap"
        "description": "拍手",
        "pattern": "hands together then apart (velocity pattern)",
        "min_frames": 3,
        "max_frames": 10,
    },
    "thumbs_up": {  # description means "thumbs up"
        "description": "点赞",
        "pattern": "thumb extended upward, other fingers folded",
    },
    # description means "clenched fist"
    "fist": {"description": "握拳", "pattern": "all fingers folded into palm"},
    # description means "peace sign"
    "peace": {"description": "剪刀手", "pattern": "index and middle fingers extended"},
    # description means "OK gesture"
    "ok": {"description": "OK 手势", "pattern": "thumb and index finger touching"},
    # description means "touch face"
    "touch_face": {"description": "摸脸", "pattern": "hand near face region"},
    # description means "touch hair"
    "touch_hair": {"description": "摸头发", "pattern": "hand above head region"},
    # description means "left hand in pocket"
    "pocket_left": {"description": "左手插兜", "pattern": "left_hand in hip region"},
    # description means "right hand in pocket"
    "pocket_right": {"description": "右手插兜", "pattern": "right_hand in hip region"},
}
|
|
|
|
|
|
# =============================================================================
|
|
# Leg Action Definitions
|
|
# =============================================================================
|
|
|
|
# Catalog of leg actions. Each entry holds a display description (Chinese), a
# human-readable detection pattern and, for timed actions, frame-count bounds.
# NOTE(review): no function visible in this file reads this table; it appears
# to serve as reference data — confirm before relying on the thresholds.
LEG_ACTIONS = {
    "stand": {  # description means "standing"
        "description": "站立",
        "pattern": "hip_y < knee_y < ankle_y, vertical alignment",
    },
    # description means "sitting"
    "sit": {"description": "坐姿", "pattern": "hip_y ≈ knee_y, thigh horizontal"},
    "walk": {  # description means "walking"
        "description": "行走",
        "pattern": "hip-knee-ankle oscillating, stride pattern",
        "min_frames": 10,
    },
    "run": {  # description means "running"
        "description": "奔跑",
        "pattern": "fast oscillating, knee_bend > 60°",
        "min_frames": 10,
    },
    "jump": {  # description means "jumping"
        "description": "跳跃",
        "pattern": "all keypoints moving upward then landing",
        "min_frames": 5,
        "max_frames": 20,
    },
    "kick": {  # description means "kick"
        "description": "踢腿",
        "pattern": "one leg extended forward rapidly",
        "min_frames": 3,
        "max_frames": 15,
    },
    # description means "left leg crossed"
    "cross_left": {"description": "左腿交叉", "pattern": "left_ankle_x > right_ankle_x"},
    # description means "right leg crossed"
    "cross_right": {"description": "右腿交叉", "pattern": "right_ankle_x > left_ankle_x"},
    # description means "bent knee"
    "knee_bend": {"description": "弯膝", "pattern": "knee_angle < 120°"},
}
|
|
|
|
|
|
# =============================================================================
|
|
# Feet Action Definitions
|
|
# =============================================================================
|
|
|
|
# Catalog of foot actions. Each entry holds a display description (Chinese), a
# human-readable detection pattern and, for timed actions, frame-count bounds.
# NOTE(review): no function visible in this file reads this table or emits
# foot actions; it appears to serve as reference data — confirm.
FEET_ACTIONS = {
    "tap": {  # description means "light tap"
        "description": "轻踏",
        "pattern": "ankle_y oscillating ±10px",
        "min_frames": 3,
        "max_frames": 15,
    },
    "stomp": {  # description means "stomp"
        "description": "重踏",
        "pattern": "ankle_y large downward movement",
        "min_frames": 3,
    },
    # description means "crossed feet"
    "cross": {"description": "交叉脚", "pattern": "feet_x overlapping"},
    # description means "left foot forward"
    "point_left": {"description": "左脚前伸", "pattern": "left_ankle_y < right_ankle_y"},
    # description means "right foot forward"
    "point_right": {"description": "右脚前伸", "pattern": "right_ankle_y < left_ankle_y"},
}
|
|
|
|
|
|
# =============================================================================
|
|
# Combined Actions (Face + Body)
|
|
# =============================================================================
|
|
|
|
# Composite gestures built from simpler actions. decode_body_actions reports a
# combined action only when every name in its "components" list is among the
# action names detected for the frame.
# NOTE(review): several component names used below ("touch_face",
# "open_mouth", "nod_head", "shake_head", "frown", "wave", "frontal_stable",
# "point", "raise_both") are not emitted by any analyzer visible in this file,
# so those combinations cannot currently match — confirm the intended names.
COMBINED_ACTIONS = {
    "thinking": {  # description means "thinking pose"
        "description": "思考姿势",
        "components": ["touch_face", "look_down"],
        "pattern": "hand near chin + head tilted down",
    },
    "listening": {  # description means "listening pose"
        "description": "倾听姿势",
        "components": ["turn_partial", "open_mouth"],
        "pattern": "slight turn + mouth slightly open",
    },
    "nodding_agreement": {  # description means "nodding in agreement"
        "description": "点头同意",
        "components": ["nod_head", "smile"],
        "pattern": "head nod + smile",
    },
    "shaking_disagreement": {  # description means "shaking head in disagreement"
        "description": "摇头不同意",
        "components": ["shake_head", "frown"],
        "pattern": "shake head + frown",
    },
    "waving_greeting": {  # description means "waving hello"
        "description": "挥手打招呼",
        "components": ["wave", "smile"],
        "pattern": "wave hand + smile",
    },
    "crossing_arms_defensive": {  # description means "defensive crossed arms"
        "description": "双手交叉防御",
        "components": ["cross_arms", "frontal_stable"],
        "pattern": "cross arms + frontal pose",
    },
    "pointing_explaining": {  # description means "pointing while explaining"
        "description": "指向解释",
        "components": ["point", "turn_partial"],
        "pattern": "pointing + slight turn",
    },
    "stretching": {  # description means "stretching"
        "description": "伸展",
        "components": ["raise_both", "look_up"],
        "pattern": "raise arms + look up",
    },
    "sitting_relaxed": {  # description means "relaxed sitting"
        "description": "放松坐姿",
        "components": ["sit", "cross_arms"],
        "pattern": "sit + cross arms",
    },
}
|
|
|
|
|
|
# =============================================================================
|
|
# Analysis Functions
|
|
# =============================================================================
|
|
|
|
def analyze_eye_actions(eye_landmarks: List, prev_eye_landmarks: List = None) -> List[Dict]:
    """
    Detect closed / wide-open eye states from eye landmarks.

    Only the first six landmarks are evaluated; they are treated as the left
    eye in p1..p6 order and scored with the eye aspect ratio

        EAR = (|p2 - p6| + |p3 - p5|) / (2 * |p1 - p4|)

    Args:
        eye_landmarks: Sequence of landmark points; at least six are required.
            NOTE(review): any points after the sixth (presumably the right
            eye) are ignored — confirm the intended layout.
        prev_eye_landmarks: Previous-frame landmarks. Accepted for interface
            compatibility; this function does not read it.

    Returns:
        A list with at most one action dict holding "action", "description"
        and "confidence". The list is empty when fewer than six landmarks are
        supplied, when the eye width |p1 - p4| is not positive, or when the
        EAR lies in [0.15, 0.4].
    """
    if not eye_landmarks or len(eye_landmarks) < 6:
        return []

    p1, p2, p3, p4, p5, p6 = (
        np.asarray(point, dtype=float) for point in eye_landmarks[:6]
    )

    eye_width = np.linalg.norm(p1 - p4)
    if not eye_width > 0:
        # A zero-width (or NaN) eye carries no measurable aspect ratio.
        # Reporting it as "closed with confidence 1.0" would be a false
        # positive, so emit nothing instead.
        return []

    eye_height = np.linalg.norm(p2 - p6) + np.linalg.norm(p3 - p5)
    ear = float(eye_height / (2 * eye_width))

    if ear < 0.15:
        return [{"action": "close_left", "description": "闭左眼", "confidence": 1.0 - ear}]
    if ear > 0.4:
        return [{"action": "wide_open_left", "description": "睁大左眼", "confidence": ear}]
    return []
|
|
|
|
|
|
def analyze_mouth_actions(mouth_landmarks: List) -> List[Dict]:
    """
    Classify the mouth state from four mouth landmarks.

    The first four landmarks are read as upper lip, lower lip, left corner and
    right corner. The mouth aspect ratio (MAR) is lip distance divided by
    corner-to-corner width, or 0 when the width is not positive.

    Args:
        mouth_landmarks: Sequence of landmark points; at least four are
            required and each must expose a y value at index 1.

    Returns:
        A list with at most one action dict:
        "yawn" (MAR > 0.7), "open" (MAR > 0.5), "close" (MAR < 0.2), or
        "smile" when MAR is in [0.2, 0.5] and the summed vertical offset of
        both corners from the upper lip exceeds 10. Empty otherwise, or when
        fewer than four landmarks are supplied.
    """
    if not mouth_landmarks or len(mouth_landmarks) < 4:
        return []

    top, bottom, corner_l, corner_r = (np.array(pt) for pt in mouth_landmarks[:4])

    height = np.linalg.norm(top - bottom)
    width = np.linalg.norm(corner_l - corner_r)
    if width > 0:
        mar = height / width
    else:
        mar = 0

    # Aspect-ratio bands are checked from widest opening to narrowest.
    if mar > 0.7:
        return [{"action": "yawn", "description": "打哈欠", "mar": mar}]
    if mar > 0.5:
        return [{"action": "open", "description": "张嘴", "mar": mar}]
    if mar < 0.2:
        return [{"action": "close", "description": "闭嘴", "mar": mar}]

    # Mid-range opening: use the corners' vertical offset from the upper lip
    # as the smile cue.
    offset_left = abs(corner_l[1] - top[1])
    offset_right = abs(corner_r[1] - top[1])
    corner_distance = offset_left + offset_right
    if corner_distance > 10:
        return [{"action": "smile", "description": "微笑", "corner_distance": corner_distance}]

    return []
|
|
|
|
|
|
def _elbow_angle_deg(shoulder, elbow, wrist):
    """Return the interior elbow angle in degrees, or None if a segment has zero length."""
    upper_arm = np.asarray(shoulder, dtype=float) - np.asarray(elbow, dtype=float)
    forearm = np.asarray(wrist, dtype=float) - np.asarray(elbow, dtype=float)

    scale = np.linalg.norm(upper_arm) * np.linalg.norm(forearm)
    if not scale > 0:
        return None

    # Clip guards against rounding pushing the cosine just outside [-1, 1],
    # which would make arccos return NaN.
    cosine = np.clip(np.dot(upper_arm, forearm) / scale, -1.0, 1.0)
    return float(np.degrees(np.arccos(cosine)))


def analyze_arm_actions(pose_keypoints: Dict) -> List[Dict]:
    """
    Detect arm actions from named pose keypoints.

    The elbow angle is the interior angle at the elbow (between the
    elbow->shoulder and elbow->wrist segments): 180 degrees for a straight
    arm, approaching 0 for a fully folded arm. This matches the thresholds
    used below (> 150 is "extend", < 90 is "fold").

    Args:
        pose_keypoints: Mapping that may contain "left_shoulder",
            "left_elbow", "left_wrist" and the matching "right_*" keys. Each
            value is a point whose index 0 is x and index 1 is y, with y
            growing downward.

    Returns:
        List of action dicts in this order: left-arm actions, right-arm
        actions, then "cross_arms". An arm is skipped when any of its three
        keypoints is missing or when two adjacent joints coincide. The
        cross-arm check is skipped unless both wrists and the left shoulder
        are present.
    """
    actions = []

    per_side = (
        ("left", "举起左手", "伸展左臂", "弯曲左臂"),
        ("right", "举起右手", "伸展右臂", "弯曲右臂"),
    )

    for side, raise_text, extend_text, fold_text in per_side:
        shoulder = pose_keypoints.get(f"{side}_shoulder")
        elbow = pose_keypoints.get(f"{side}_elbow")
        wrist = pose_keypoints.get(f"{side}_wrist")
        if not (shoulder and elbow and wrist):
            continue

        angle = _elbow_angle_deg(shoulder, elbow, wrist)
        if angle is None:
            # Coincident joints: no angle, and the strict ordering required
            # for "raise" cannot hold either.
            continue

        # Raised: wrist above elbow above shoulder (y decreases upward).
        if wrist[1] < elbow[1] < shoulder[1]:
            actions.append({"action": f"raise_{side}", "description": raise_text, "angle": angle})

        if angle > 150:
            actions.append({"action": f"extend_{side}", "description": extend_text, "angle": angle})
        elif angle < 90:
            actions.append({"action": f"fold_{side}", "description": fold_text, "angle": angle})

    # Cross-arm detection needs the left shoulder as its x reference, so it is
    # only evaluated when that keypoint is available.
    left_shoulder = pose_keypoints.get("left_shoulder")
    left_wrist = pose_keypoints.get("left_wrist")
    right_wrist = pose_keypoints.get("right_wrist")
    if left_shoulder and left_wrist and right_wrist:
        if left_wrist[0] > right_wrist[0] and right_wrist[0] < left_shoulder[0]:
            actions.append({"action": "cross_arms", "description": "双手交叉"})

    return actions
|
|
|
|
|
|
def analyze_hand_actions(
    hand_keypoints: List,
    hand_type: str = "right",
    *,
    open_threshold: float = 50,
    fold_threshold: float = 30,
    extend_threshold: float = 40,
) -> List[Dict]:
    """
    Detect static hand gestures from 21 hand keypoints.

    Keypoint index layout used here:
        0: wrist
        1-4: thumb (CMC, MCP, IP, TIP)
        5-8: index finger (MCP, PIP, DIP, TIP)
        9-12: middle finger
        13-16: ring finger
        17-20: pinky

    A finger's "extension" is the distance from its tip to a base joint:
    index 2 (thumb MCP) for the thumb and the MCP joint for the other fingers.

    Args:
        hand_keypoints: Sequence of at least 21 points.
        hand_type: Label embedded in action names and descriptions, e.g.
            "left" or "right".
        open_threshold: Mean extension above which the hand counts as open.
        fold_threshold: Mean extension below which fingers count as folded.
        extend_threshold: Extension above which a single finger counts as
            extended.

    The threshold defaults (50 / 30 / 40) are in the same units as the
    keypoint coordinates. NOTE(review): they look like pixel distances;
    callers passing normalized coordinates should supply scaled thresholds.

    Returns:
        List of action dicts ("action", "description"). The open/fist
        classification comes first, followed by any of thumbs-up, peace and
        point that match. Empty when fewer than 21 keypoints are supplied.
    """
    if not hand_keypoints or len(hand_keypoints) < 21:
        return []

    # (tip index, base index) for thumb, index, middle, ring, pinky.
    tip_base_pairs = ((4, 2), (8, 5), (12, 9), (16, 13), (20, 17))
    extensions = [
        float(
            np.linalg.norm(
                np.asarray(hand_keypoints[tip], dtype=float)
                - np.asarray(hand_keypoints[base], dtype=float)
            )
        )
        for tip, base in tip_base_pairs
    ]
    thumb, index, middle, ring, pinky = extensions

    actions = []

    # Whole-hand state from the mean extension of all five fingers.
    overall = np.mean(extensions)
    if overall > open_threshold:
        actions.append({"action": f"open_{hand_type}", "description": f"张开{hand_type}手"})
    elif overall < fold_threshold:
        actions.append({"action": f"fist_{hand_type}", "description": f"握{hand_type}拳"})

    # Thumbs up: thumb extended, the other four folded on average.
    if thumb > extend_threshold and np.mean([index, middle, ring, pinky]) < fold_threshold:
        actions.append({"action": f"thumbs_up_{hand_type}", "description": f"{hand_type}手点赞"})

    # Peace sign: index and middle extended, ring and pinky folded on average.
    if (
        index > extend_threshold
        and middle > extend_threshold
        and np.mean([ring, pinky]) < fold_threshold
    ):
        actions.append({"action": f"peace_{hand_type}", "description": f"{hand_type}手剪刀手"})

    # Pointing: index extended, all other fingers folded on average.
    if index > extend_threshold and np.mean([thumb, middle, ring, pinky]) < fold_threshold:
        actions.append({"action": f"point_{hand_type}", "description": f"{hand_type}手指向"})

    return actions
|
|
|
|
|
|
def _knee_angle_deg(hip, knee, ankle):
    """Return the interior knee angle in degrees, or None if a segment has zero length."""
    thigh = np.asarray(hip, dtype=float) - np.asarray(knee, dtype=float)
    shin = np.asarray(ankle, dtype=float) - np.asarray(knee, dtype=float)

    scale = np.linalg.norm(thigh) * np.linalg.norm(shin)
    if not scale > 0:
        return None

    # Clip guards against rounding pushing the cosine just outside [-1, 1],
    # which would make arccos return NaN.
    cosine = np.clip(np.dot(thigh, shin) / scale, -1.0, 1.0)
    return float(np.degrees(np.arccos(cosine)))


def analyze_leg_actions(pose_keypoints: Dict) -> List[Dict]:
    """
    Detect leg actions from named pose keypoints.

    The knee angle is the interior angle at the knee (between the knee->hip
    and knee->ankle segments): 180 degrees for a straight leg, smaller as the
    knee bends. A knee counts as bent below 120 degrees.

    Args:
        pose_keypoints: Mapping that may contain "left_hip", "left_knee",
            "left_ankle" and the matching "right_*" keys. Each value is a
            point whose index 1 is y, with y growing downward.

    Returns:
        List of action dicts in this order: left-leg actions (knee bend, then
        stand), right-leg actions, then "sit". A leg is skipped when any of
        its three keypoints is missing; the knee-bend check is skipped when
        two adjacent joints coincide. "sit" is reported when the average hip
        and average knee heights differ by less than 30.
    """
    actions = []

    per_side = (
        ("left", "弯左膝", "左腿站立"),
        ("right", "弯右膝", "右腿站立"),
    )

    for side, bend_text, stand_text in per_side:
        hip = pose_keypoints.get(f"{side}_hip")
        knee = pose_keypoints.get(f"{side}_knee")
        ankle = pose_keypoints.get(f"{side}_ankle")
        if not (hip and knee and ankle):
            continue

        angle = _knee_angle_deg(hip, knee, ankle)
        if angle is not None and angle < 120:
            actions.append({"action": f"knee_bend_{side}", "description": bend_text, "angle": angle})

        # Standing: hip above knee above ankle (y increases downward).
        if hip[1] < knee[1] < ankle[1]:
            actions.append({"action": f"stand_{side}", "description": stand_text})

    # Sit detection: hips and knees at a similar height.
    left_hip = pose_keypoints.get("left_hip")
    left_knee = pose_keypoints.get("left_knee")
    right_hip = pose_keypoints.get("right_hip")
    right_knee = pose_keypoints.get("right_knee")
    if left_hip and left_knee and right_hip and right_knee:
        hip_height = (left_hip[1] + right_hip[1]) / 2
        knee_height = (left_knee[1] + right_knee[1]) / 2
        if abs(hip_height - knee_height) < 30:
            actions.append({"action": "sit", "description": "坐姿"})

    return actions
|
|
|
|
|
|
# =============================================================================
|
|
# Main Decoder Function
|
|
# =============================================================================
|
|
|
|
def decode_body_actions(
    pose_data: Dict,
    face_data: Dict = None,
    hand_data: Dict = None,
) -> Dict:
    """
    Run every available analyzer and collect the detected actions by category.

    Args:
        pose_data: Pose data; its "keypoints" entry feeds the arm and leg
            analyzers. May be falsy, in which case both are skipped.
        face_data: Face data. Reads "pose_angle" / "prev_pose_angle" (each
            with "angle" and "pitch"), "eye_landmarks", "prev_eye_landmarks"
            and "mouth_landmarks" when present.
        hand_data: Hand data with optional "left_hand" / "right_hand" entries.

    Returns:
        Dict with the keys "face", "eyes", "mouth", "arms", "hands", "legs",
        "feet" and "combined", each mapping to a list of action dicts.
        "feet" is never populated here. "combined" holds every entry of
        COMBINED_ACTIONS whose components were all detected.
    """
    result = {
        category: []
        for category in ("face", "eyes", "mouth", "arms", "hands", "legs", "feet", "combined")
    }

    if face_data:
        # 1. Head-pose transitions between the previous and current frame.
        current = face_data.get("pose_angle", {})
        previous = face_data.get("prev_pose_angle", {})

        # NOTE(review): the source's indentation was lost; both the turn and
        # the pitch lookups are assumed to require a current AND a previous
        # pose — confirm against the original layout.
        if current and previous:
            angle_now = current.get("angle", "unknown")
            angle_before = previous.get("angle", "unknown")
            turn_action = FACE_TURN_ACTIONS.get((angle_before, angle_now))
            if turn_action is not None:
                result["face"].append({
                    "action": turn_action,
                    "description": f"Face: {angle_before} → {angle_now}",
                })

            pitch_now = current.get("pitch", "neutral")
            pitch_before = previous.get("pitch", "neutral")
            pitch_action = FACE_PITCH_ACTIONS.get((pitch_before, pitch_now))
            if pitch_action is not None:
                result["face"].append({
                    "action": pitch_action,
                    "description": f"Pitch: {pitch_before} → {pitch_now}",
                })

        # 2. Eye actions, only when eye landmarks were supplied.
        eye_points = face_data.get("eye_landmarks")
        if eye_points:
            result["eyes"] = analyze_eye_actions(
                eye_points,
                face_data.get("prev_eye_landmarks"),
            )

        # 3. Mouth actions, only when mouth landmarks were supplied.
        mouth_points = face_data.get("mouth_landmarks")
        if mouth_points:
            result["mouth"] = analyze_mouth_actions(mouth_points)

    # Pose keypoints drive both the arm and the leg analyzers.
    keypoints = pose_data.get("keypoints") if pose_data else None

    # 4. Arm actions.
    if keypoints:
        result["arms"] = analyze_arm_actions(keypoints)

    # 5. Hand actions, left hand first.
    if hand_data:
        for side in ("left", "right"):
            hand_points = hand_data.get(f"{side}_hand")
            if hand_points:
                result["hands"].extend(analyze_hand_actions(hand_points, side))

    # 6. Leg actions.
    if keypoints:
        result["legs"] = analyze_leg_actions(keypoints)

    # 7. Combined actions: every component name must have been detected.
    seen = {entry["action"] for entries in result.values() for entry in entries}
    for name, spec in COMBINED_ACTIONS.items():
        parts = spec["components"]
        if seen.issuperset(parts):
            result["combined"].append({
                "action": name,
                "description": spec["description"],
                "components": parts,
            })

    return result
|
|
|
|
|
|
def print_body_action_report(action_data: Dict) -> None:
    """
    Print a human-readable summary of decoded actions to stdout.

    Args:
        action_data: Mapping from category name to a list of action dicts.
            Each action dict must contain "action"; "description" is optional
            and falls back to the action name. Categories that are missing or
            empty are omitted from the report.
    """
    rule = "=" * 70

    print("\n" + rule)
    print("Body Action Decoder Report")
    print(rule)

    # Fixed display order; keys outside this list are not printed.
    for category in ("face", "eyes", "mouth", "arms", "hands", "legs", "feet", "combined"):
        entries = action_data.get(category, [])
        if not entries:
            continue

        print(f"\n{category.upper()} Actions ({len(entries)}):")
        for entry in entries:
            name = entry["action"]
            print(f" - {name}: {entry.get('description', name)}")

    print("\n" + rule)
|
|
|
|
|
|
# =============================================================================
|
|
# Main Entry Point
|
|
# =============================================================================
|
|
|
|
def main():
    """
    Command-line entry point.

    Loads whichever of the pose / face / hand JSON files were given, decodes
    the actions, prints a report and optionally writes the result as JSON.
    When no input yields data, prints the list of supported action categories
    instead.
    """
    parser = argparse.ArgumentParser(description="Decode body actions from pose data")
    parser.add_argument("--pose-json", help="Path to pose.json (MediaPipe output)")
    parser.add_argument("--face-json", help="Path to face.json (InsightFace output)")
    parser.add_argument("--hand-json", help="Path to hand.json (MediaPipe Hand output)")
    parser.add_argument("--output-json", help="Output action data JSON")
    # NOTE(review): --frame is parsed but never read below.
    parser.add_argument("--frame", type=int, help="Analyze specific frame")
    args = parser.parse_args()

    banner = "=" * 70
    print(banner)
    print("Body Action Decoder")
    print(banner)

    def load_json(path):
        """Return the parsed JSON at *path*, or None when no path was given."""
        if not path:
            return None
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(path, encoding="utf-8") as handle:
            return json.load(handle)

    pose_data = load_json(args.pose_json)
    face_data = load_json(args.face_json)
    hand_data = load_json(args.hand_json)

    if pose_data or face_data or hand_data:
        action_data = decode_body_actions(
            pose_data=pose_data,
            face_data=face_data,
            hand_data=hand_data,
        )

        print_body_action_report(action_data)

        if args.output_json:
            with open(args.output_json, "w", encoding="utf-8") as handle:
                json.dump(action_data, handle, indent=2)
            print(f"\n✅ Output saved to: {args.output_json}")
    else:
        print("\n⚠️ No input data provided")
        print("\nAction Categories:")
        print(" - Face: turn_left, turn_right, look_up, look_down, shake_head, nod_head")
        print(" - Eyes: blink, close, wide_open, look_left, look_right")
        print(" - Mouth: open, close, smile, talk, yawn")
        print(" - Arms: raise_left, raise_right, cross_arms, wave, point")
        print(" - Hands: grab, open, clap, thumbs_up, fist, peace, ok")
        print(" - Legs: stand, sit, walk, run, jump, kick")
        print(" - Feet: tap, stomp, cross, point")
        print(" - Combined: thinking, listening, nodding_agreement, waving_greeting")
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()