# momentry_core/scripts/florence2_scan_stamps.py

#!/opt/homebrew/bin/python3.11
"""
Use Florence-2 to scan video frames for "stamp" using open vocabulary detection
"""

import os
import cv2
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

# Identifier of the processed video; it names both the run directory and the
# video file inside it.
UUID = "384b0ff44aaaa1f1"

# Paths are relative to the current working directory and always use "/".
VIDEO_PATH = "/".join(("output", UUID, UUID + ".mp4"))
OUTPUT_DIR = "/".join(("output", UUID, "florence2_stamp_scan"))
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Sample positions in seconds: one every 300 s (5 minutes), starting at 0 and
# stopping before 6879 s.
_SCAN_END_S = 6879
_SCAN_STEP_S = 300
TIMESTAMPS = [*range(0, _SCAN_END_S, _SCAN_STEP_S)]

# Checkpoint id and loader options shared by the processor and the model.
# trust_remote_code=True lets transformers execute the Python code shipped
# with the checkpoint repository.
_FLORENCE2_CHECKPOINT = "microsoft/Florence-2-base"
_HUB_OPTIONS = {"trust_remote_code": True}

print("📽️ Loading Florence-2 model...")
processor = AutoProcessor.from_pretrained(_FLORENCE2_CHECKPOINT, **_HUB_OPTIONS)
model = AutoModelForCausalLM.from_pretrained(_FLORENCE2_CHECKPOINT, **_HUB_OPTIONS)
# Inference only: switch the module out of training mode.
model.eval()

def _clamp_box(bbox, width, height):
    """Return *bbox* as integer ``(x1, y1, x2, y2)`` clipped to the image.

    Coordinates are truncated to int and limited to ``[0, width]`` /
    ``[0, height]``. Without the clipping, a negative coordinate would be
    read by NumPy slicing as an offset from the opposite edge and the crop
    would cover the wrong region.
    """
    x1, y1, x2, y2 = (int(v) for v in bbox)
    x1, x2 = (min(max(x, 0), width) for x in (x1, x2))
    y1, y2 = (min(max(y, 0), height) for y in (y1, y2))
    return x1, y1, x2, y2


# The prompt is loop-invariant, so build it once. Per the Florence-2 model
# card, the query text is appended directly to the task token (no separator).
task = "<OPEN_VOCABULARY_DETECTION>"
text_input = f"{task}stamp"

cap = cv2.VideoCapture(VIDEO_PATH)
if not cap.isOpened():
    # Fail loudly: otherwise every read fails, every frame is skipped and the
    # script still reports "Done" with an empty output directory.
    cap.release()
    raise SystemExit(f"❌ Could not open video: {VIDEO_PATH}")

print(f"🔍 Scanning {len(TIMESTAMPS)} frames for 'stamp'...")

try:
    for ts in TIMESTAMPS:
        # Seek by time (milliseconds) and grab the frame at that position.
        cap.set(cv2.CAP_PROP_POS_MSEC, ts * 1000)
        ret, frame = cap.read()
        if not ret:
            continue

        # OpenCV decodes to BGR; PIL and the processor expect RGB.
        image_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        inputs = processor(text=text_input, images=image_pil, return_tensors="pt")

        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=inputs["input_ids"],
                pixel_values=inputs["pixel_values"],
                max_new_tokens=512,
                num_beams=3,
            )

        # Special tokens must be kept: the post-processor parses the location
        # tokens out of the raw decoded text.
        generated_text = processor.batch_decode(
            generated_ids, skip_special_tokens=False
        )[0]

        # Best effort per frame: a frame whose output cannot be parsed is
        # reported and skipped instead of aborting the whole scan.
        try:
            parsed = processor.post_process_generation(
                generated_text,
                task=task,
                image_size=(image_pil.width, image_pil.height),
            )
        except Exception as e:
            print(f"  ⚠️  Frame {ts}s: Parse error - {e}")
            continue

        # For this task the result is a dict holding parallel lists under
        # "bboxes" / "bboxes_labels" (plus "polygons" / "polygons_labels"),
        # not a list of per-detection dicts. Only boxes are used here.
        result = (parsed or {}).get(task) or {}
        bboxes = result.get("bboxes") or []
        if not bboxes:
            continue

        print(f"  📍 Frame {ts}s: Found {len(bboxes)} stamp(s)")

        # Draw on a copy so crops are always cut from clean pixels, even when
        # a later box overlaps an earlier, already-drawn one.
        annotated = frame.copy()
        frame_h, frame_w = frame.shape[:2]

        for i, bbox in enumerate(bboxes):
            x1, y1, x2, y2 = _clamp_box(bbox, frame_w, frame_h)
            crop = frame[y1:y2, x1:x2]
            if crop.size == 0:
                # Degenerate or fully out-of-frame box: nothing to save.
                continue

            crop_path = os.path.join(OUTPUT_DIR, f"stamp_{ts}s_{i}.jpg")
            cv2.imwrite(crop_path, crop)

            cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 3)
            cv2.putText(
                annotated,
                f"stamp {i}",
                # Keep the label inside the image for boxes touching the top.
                (x1, max(y1 - 10, 30)),
                cv2.FONT_HERSHEY_SIMPLEX,
                1,
                (0, 255, 0),
                2,
            )

        # Save annotated frame
        ann_path = os.path.join(OUTPUT_DIR, f"annotated_{ts}s.jpg")
        cv2.imwrite(ann_path, annotated)
finally:
    # Release the capture even if inference or writing raises.
    cap.release()

print(f"\n🏁 Done. Check {OUTPUT_DIR} for results.")