# momentry_core/scripts/ocr_processor.py

#!/opt/homebrew/bin/python3.11
"""
OCR Processor - Text Recognition with Resume Support
Uses EasyOCR (local model)

Resume Feature:
- Auto-detect existing results and resume from last frame
- Auto-save at configurable intervals (default: 30 seconds)
- Ctrl+C gracefully saves and exits
"""

import sys
import json
import argparse
import os
import signal
import time
from datetime import datetime

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from redis_publisher import RedisPublisher
from resume_framework import ResumeFramework, format_time, print_progress


def _detections_to_texts(detections) -> list:
    """Convert raw OCR detections into JSON-serializable text records.

    Each detection is read positionally as ``(bbox, text, confidence)``, where
    ``bbox`` is a sequence of ``(x, y)`` points. The bounding polygon is reduced
    to an axis-aligned box (``x``, ``y``, ``width``, ``height``) in integer
    pixels. Detections whose text is empty or whitespace-only are dropped.
    """
    records = []
    for detection in detections:
        det = tuple(detection)
        text = str(det[1])
        if not text.strip():
            continue

        bbox = list(det[0])
        x = int(min(float(p[0]) for p in bbox))
        y = int(min(float(p[1]) for p in bbox))
        records.append(
            {
                "text": text,
                "x": x,
                "y": y,
                "width": int(max(float(p[0]) for p in bbox) - x),
                "height": int(max(float(p[1]) for p in bbox) - y),
                "confidence": float(det[2]),
            }
        )
    return records


def process_ocr(
    video_path: str,
    output_path: str,
    uuid: str = "",
    auto_save_interval: int = 30,
    auto_save_frames: int = 300,
    force_restart: bool = False,
    sample_interval: int = 30,
):
    """Process video for OCR using EasyOCR with resume support.

    Every ``sample_interval``-th frame (1-based frame numbers) is run through
    EasyOCR; frames that yield at least one non-blank text are stored under
    ``frames[str(frame_number)]`` in the result.

    Args:
        video_path: Path of the video to read with OpenCV.
        output_path: JSON path handed to ``ResumeFramework``; also written
            directly when EasyOCR is not installed.
        uuid: Identifier forwarded to ``ResumeFramework``.
        auto_save_interval: Auto-save period in seconds (forwarded to the
            framework).
        auto_save_frames: Auto-save period in frames (forwarded to the
            framework).
        force_restart: When true, existing data is ignored and processing
            starts from the first frame.
        sample_interval: Run OCR on every N-th frame; must be >= 1.

    Returns:
        A dict with ``"metadata"`` and ``"frames"`` keys. On failure to import
        EasyOCR or to open the video, ``metadata["status"]`` is ``"error"`` and
        ``"frames"`` is empty.

    Raises:
        ValueError: If ``sample_interval`` is smaller than 1.
    """
    # Fail fast: a zero interval would otherwise raise ZeroDivisionError in the
    # frame loop, only after the OCR model has already been loaded.
    if sample_interval < 1:
        raise ValueError(f"sample_interval must be >= 1, got {sample_interval}")

    framework = ResumeFramework(
        output_path=output_path,
        processor_name="ocr",
        uuid=uuid,
        auto_save_interval=auto_save_interval,
        auto_save_frames=auto_save_frames,
        force_restart=force_restart,
    )

    framework.publish_info("OCR_START")

    try:
        import easyocr
    except ImportError:
        framework.publish_error("easyocr not installed")
        result = {
            "metadata": {"status": "error", "error": "easyocr not installed"},
            "frames": {},
        }
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2)
        framework.publish_progress(0, 0, "0 frames")
        return result

    framework.publish_info("OCR_LOADING_MODEL")

    reader = easyocr.Reader(["en"], gpu=False, verbose=False)

    framework.publish_info("OCR_MODEL_LOADED")

    import cv2

    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        cap.release()
        message = f"Cannot open video: {video_path}"
        print(f"Error: {message}")
        # Report through the framework as well, like the easyocr failure path.
        framework.publish_error(message)
        return {"metadata": {"status": "error", "error": message}, "frames": {}}

    progress_every = 500  # frames between progress reports
    last_text_count = 0  # texts found on the most recently OCR'd frame

    # The capture is released in ``finally`` so it is not leaked when OCR, the
    # framework, or a Ctrl+C interrupts the loop.
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        total_duration = total_frames / fps if fps > 0 else 0

        framework.publish_info(f"fps={fps}, frames={total_frames}")

        existing_data, last_checkpoint = framework.load_existing_data()
        # Only resume when there is actual data to continue from.
        resume_mode = bool(existing_data) and last_checkpoint > 0 and not force_restart

        if resume_mode:
            print(f"\nFound existing data: {output_path}")
            print(f"Last processed frame: {last_checkpoint}")
            print(f"Will resume from frame {last_checkpoint + 1}")

            ocr_data = existing_data
            frame_count = last_checkpoint
            processed_frames = set(int(k) for k in existing_data.get("frames", {}).keys())
            # ``frame_count`` frames were already consumed, so the next frame
            # to decode has 0-based index ``frame_count``.
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_count)
        else:
            ocr_data = {
                "metadata": framework.init_metadata(
                    video_path=video_path,
                    fps=fps,
                    width=width,
                    height=height,
                    total_frames=total_frames,
                    total_duration=total_duration,
                    extra={"sample_interval": sample_interval},
                ),
                "frames": {},
            }
            frame_count = 0
            processed_frames = set()

        framework.set_data(ocr_data)

        start_time = time.time()
        framework.last_save_time = start_time

        print(f"\nProcessing video: {total_frames} frames @ {fps:.2f} fps")
        print(f"Auto-save every {auto_save_interval}s or {auto_save_frames} frames")
        print(f"Resume from frame {frame_count + 1 if resume_mode else 1}")
        print()

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1
            sampled = (
                frame_count % sample_interval == 0
                and frame_count not in processed_frames
            )

            if sampled:
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                try:
                    detections = reader.readtext(
                        frame_rgb, text_threshold=0.5, low_text=0.3, link_threshold=0.3
                    )
                except Exception as e:
                    # Best effort: report the frame and carry on with the video.
                    framework.publish_error(f"Frame {frame_count}: {e}")
                    detections = []

                texts = _detections_to_texts(detections)
                last_text_count = len(texts)

                if texts:
                    current_time = (frame_count - 1) / fps if fps > 0 else 0
                    ocr_data["frames"][str(frame_count)] = {
                        "frame_number": frame_count,
                        "time_seconds": round(current_time, 3),
                        "time_formatted": format_time(current_time),
                        "texts": texts,
                    }
                    processed_frames.add(frame_count)

            # Progress is checked for every decoded frame. Checking it only on
            # sampled frames would restrict reports to frame numbers divisible
            # by both ``progress_every`` and ``sample_interval``.
            if frame_count % progress_every == 0:
                elapsed = time.time() - start_time
                print_progress(
                    frame_count, total_frames, elapsed, f"{last_text_count} texts"
                )
                framework.publish_progress(
                    frame_count, total_frames, f"frame {frame_count}"
                )

            if sampled and framework.should_auto_save(frame_count):
                framework.save_progress(frame_count, silent=True)
    finally:
        cap.release()

    # NOTE(review): ``processed_frames`` only contains frames that produced
    # text (plus frames loaded from existing data), so this is the number of
    # frames with text, not the number of frames OCR was run on.
    total_processed = len(processed_frames)

    framework.finalize(
        total_processed=total_processed,
        extra_metadata={"sample_interval": sample_interval},
    )

    print(f"\nOCR completed: {total_processed} frames processed")
    print(f"Frames with text: {len(ocr_data['frames'])}")

    return ocr_data


if __name__ == "__main__":
    # Command-line entry point: parse the arguments and run the OCR pass.
    cli = argparse.ArgumentParser(description="OCR Text Recognition with Resume Support")

    # Positional arguments.
    cli.add_argument("video_path", help="Path to video file")
    cli.add_argument("output_path", help="Output JSON path")

    # Optional arguments, registered in the order they appear in --help.
    option_specs = (
        (("--uuid", "-u"), {"help": "UUID for Redis progress", "default": ""}),
        (
            ("--auto-save-interval", "-a"),
            {"help": "Auto-save interval in seconds", "type": int, "default": 30},
        ),
        (
            ("--auto-save-frames", "-f"),
            {"help": "Auto-save interval in frames", "type": int, "default": 300},
        ),
        (
            ("--force-restart", "-r"),
            {"help": "Force restart (ignore existing data)", "action": "store_true"},
        ),
        (
            ("--sample-interval", "-s"),
            {"help": "Frame sample interval", "type": int, "default": 30},
        ),
    )
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)

    options = cli.parse_args()

    process_ocr(
        video_path=options.video_path,
        output_path=options.output_path,
        uuid=options.uuid,
        auto_save_interval=options.auto_save_interval,
        auto_save_frames=options.auto_save_frames,
        force_restart=options.force_restart,
        sample_interval=options.sample_interval,
    )