# momentry_core/scripts/pose_processor.py

#!/opt/homebrew/bin/python3.11
"""
Pose Processor - Pose Estimation
Uses YOLOv8 Pose via ultralytics (local model)
"""

import sys
import json
import argparse
import os
import signal

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from redis_publisher import RedisPublisher


def signal_handler(signum, frame):
    """Report the received signal on stdout, then exit with status 1.

    Has the ``(signum, frame)`` shape expected by ``signal.signal``;
    ``frame`` is accepted but not used.
    """
    notice = f"POSE: Received signal {signum}, exiting..."
    print(notice)
    sys.exit(1)


# COCO keypoint names, used to label keypoint rows by their index.
_KEYPOINT_NAMES = (
    "nose",
    "left_eye",
    "right_eye",
    "left_ear",
    "right_ear",
    "left_shoulder",
    "right_shoulder",
    "left_elbow",
    "right_elbow",
    "left_wrist",
    "right_wrist",
    "left_hip",
    "right_hip",
    "left_knee",
    "right_knee",
    "left_ankle",
    "right_ankle",
)

# Only keypoints above this confidence contribute to the derived bounding box.
_BBOX_MIN_CONFIDENCE = 0.3

# Frames without any person are still recorded once every this many frames.
_EMPTY_FRAME_SAMPLE_INTERVAL = 30


def _keypoint_rows(keypoints):
    """Return one frame's keypoints as nested lists: person -> keypoint -> values."""
    if keypoints is None:
        return []
    # Per the ultralytics Results API, `keypoints` is a `Keypoints` wrapper in
    # current releases; iterating it yields further wrappers instead of
    # (x, y, conf) rows, so read the underlying array through `.data`.
    # A plain tensor (older releases) exposes `.data` as well.
    data = getattr(keypoints, "data", keypoints)
    to_list = getattr(data, "tolist", None)
    return to_list() if callable(to_list) else list(data)


def _bbox_from_keypoints(keypoints):
    """Return the integer box enclosing the confident keypoints (all zeros if none)."""
    confident = [kp for kp in keypoints if kp["confidence"] > _BBOX_MIN_CONFIDENCE]
    if not confident:
        return {"x": 0, "y": 0, "width": 0, "height": 0}
    xs = [kp["x"] for kp in confident]
    ys = [kp["y"] for kp in confident]
    return {
        "x": int(min(xs)),
        "y": int(min(ys)),
        "width": int(max(xs) - min(xs)),
        "height": int(max(ys) - min(ys)),
    }


def _person_from_rows(rows):
    """Build one person's entry: named keypoints plus the bbox derived from them."""
    keypoints = []
    for i, kp in enumerate(rows):
        # Rows lacking a confidence value are skipped; the index still advances
        # so the remaining keypoints keep their positional names.
        if len(kp) < 3:
            continue
        keypoints.append(
            {
                "name": _KEYPOINT_NAMES[i] if i < len(_KEYPOINT_NAMES) else f"kp_{i}",
                "x": float(kp[0]),
                "y": float(kp[1]),
                "confidence": float(kp[2]),
            }
        )
    return {"keypoints": keypoints, "bbox": _bbox_from_keypoints(keypoints)}


def _write_result(output_path, payload):
    """Serialize the result dict to ``output_path`` as indented JSON."""
    with open(output_path, "w") as f:
        json.dump(payload, f, indent=2)


def process_pose(video_path: str, output_path: str, uuid: str = "") -> dict:
    """Process video for pose estimation using YOLOv8 Pose.

    Runs the ``yolov8n-pose.pt`` model over ``video_path``, collects the
    keypoints of every detected person per frame, writes the result to
    ``output_path`` as JSON and returns it.

    Side effects: installs ``signal_handler`` for SIGTERM and SIGINT, and,
    when ``uuid`` is non-empty, publishes info/progress/error/complete
    messages for the "pose" stage through ``RedisPublisher``.

    Args:
        video_path: Path of the video to analyze.
        output_path: Path of the JSON file to write.
        uuid: Identifier handed to ``RedisPublisher``; an empty string
            disables publishing.

    Returns:
        A dict with ``frame_count`` and ``fps`` (as reported by OpenCV) and
        ``frames``: a list of ``{"frame", "timestamp", "persons"}`` entries.
        A frame is listed when it has at least one person, plus every 30th
        processed frame regardless. If ultralytics cannot be imported the
        result is ``{"frame_count": 0, "fps": 0.0, "frames": []}``.
    """
    # Set up signal handlers
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    publisher = RedisPublisher(uuid) if uuid else None
    if publisher:
        publisher.info("pose", "POSE_START")

    try:
        from ultralytics import YOLO  # pyright: ignore
    except ImportError:
        if publisher:
            publisher.error("pose", "ultralytics not installed")
        empty = {"frame_count": 0, "fps": 0.0, "frames": []}
        # Write the file before announcing completion so a consumer reacting
        # to the completion message never finds the output missing.
        _write_result(output_path, empty)
        if publisher:
            publisher.complete("pose", "0 frames")
        return empty

    if publisher:
        publisher.info("pose", "POSE_LOADING_MODEL")

    # Load YOLOv8 Pose model
    # yolov8n-pose.pt = nano (fastest)
    # yolov8s-pose.pt = small
    # yolov8m-pose.pt = medium
    model = YOLO("yolov8n-pose.pt")

    # Get video info
    import cv2

    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        # Release the capture even if reading a property raises.
        cap.release()

    if publisher:
        publisher.info("pose", f"fps={fps}, frames={total_frames}")
        publisher.progress("pose", 0, total_frames, "Starting")

    # Process video with YOLO Pose; stream=True yields one result per frame.
    results = model(
        video_path,
        conf=0.5,  # confidence threshold
        save=False,
        stream=True,
        verbose=False,
        # NOTE(review): kept from the original call. Pose output presumably
        # comes from the *-pose weights rather than this flag - confirm
        # against the ultralytics config and drop it if redundant.
        pose=True,
    )

    frames = []
    frame_count = 0

    for frame_result in results:
        frame_count += 1

        # Use the frame index supplied by the result when present, otherwise
        # the zero-based position in the stream.
        frame_idx = getattr(frame_result, "orig_frame_idx", frame_count - 1)
        timestamp = frame_idx / fps if fps > 0 else 0

        persons = [
            _person_from_rows(rows)
            for rows in _keypoint_rows(frame_result.keypoints)
        ]

        # Only add frames with poses or sample periodically
        if persons or frame_count % _EMPTY_FRAME_SAMPLE_INTERVAL == 0:
            frames.append(
                {
                    "frame": frame_idx,
                    "timestamp": round(timestamp, 3),
                    "persons": persons,
                }
            )

        if publisher:
            publisher.progress("pose", frame_count, total_frames, f"Frame {frame_idx}")

    output = {"frame_count": total_frames, "fps": fps, "frames": frames}

    # Persist first, then signal completion (see the fallback path above).
    _write_result(output_path, output)

    if publisher:
        publisher.complete("pose", f"{len(frames)} frames with poses")

    return output


if __name__ == "__main__":
    # Command-line entry point: two positional paths plus an optional UUID
    # that is forwarded to process_pose for Redis progress reporting.
    cli = argparse.ArgumentParser(description="Pose Estimation")
    cli.add_argument("video_path", help="Path to video file")
    cli.add_argument("output_path", help="Output JSON path")
    cli.add_argument(
        "--uuid",
        "-u",
        default="",
        help="UUID for Redis progress",
    )
    options = cli.parse_args()

    process_pose(options.video_path, options.output_path, options.uuid)