# momentry_core/scripts/test_florence2_pipeline.py

#!/opt/homebrew/bin/python3.11
"""
Test Florence-2 for "Stamps" Detection using Pipeline
"""

import os
import cv2
from transformers import pipeline

# Identifier of the video under test; it names both the per-video folder
# and the video file inside it.
UUID = "384b0ff44aaaa1f1"

# Every input and artifact for this video lives below one shared root.
_VIDEO_ROOT = f"output/{UUID}"
VIDEO_PATH = f"{_VIDEO_ROOT}/{UUID}.mp4"
OUTPUT_DIR = f"{_VIDEO_ROOT}/florence2_results"

# Create the results folder up front; an already existing folder is fine.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Frame where "stamp" is heavily discussed (position in seconds)
TIMESTAMP = 6846.0

print(f"📽️ Extracting frame at {TIMESTAMP}s...")
cap = cv2.VideoCapture(VIDEO_PATH)
try:
    # Report an unopenable video separately from a failed seek/read so the
    # cause is obvious from the output.
    if not cap.isOpened():
        print(f"❌ Could not open video: {VIDEO_PATH}")
        raise SystemExit(1)
    # CAP_PROP_POS_MSEC takes milliseconds, hence the conversion from seconds.
    cap.set(cv2.CAP_PROP_POS_MSEC, TIMESTAMP * 1000)
    ret, frame = cap.read()
finally:
    # Always release the capture handle, even when seeking or reading raises.
    cap.release()

if not ret:
    print("❌ Failed to read frame.")
    # Exit with a non-zero status; the bare exit() helper is injected by the
    # `site` module and would report success (status 0) to the caller.
    raise SystemExit(1)

# Save raw frame so the model can be handed a file path
raw_path = os.path.join(OUTPUT_DIR, f"raw_{int(TIMESTAMP)}.jpg")
# cv2.imwrite signals a failed write through its boolean return value, so it
# must be checked before claiming the frame was saved.
if not cv2.imwrite(raw_path, frame):
    print(f"❌ Failed to write frame to {raw_path}.")
    raise SystemExit(1)
print("💾 Raw frame saved.")

print("🧠 Loading Florence-2 model via pipeline...")
try:
    # Using pipeline handles model configuration automatically.
    # trust_remote_code=True allows transformers to execute Python code
    # shipped in the model repository; only use it with trusted repositories.
    pipe = pipeline(
        "image-to-text", model="microsoft/Florence-2-base", trust_remote_code=True
    )

    print("🔍 Running detection on 'stamp'...")
    # Florence-2 tasks: '<OPEN_VOCABULARY_DETECTION>', '<CAPTION>', etc.
    # We want to see if there is a stamp, so let's use caption first to see what it sees.

    result = pipe(raw_path, prompt="<CAPTION>")
    print(f"📝 Caption Result: {result}")

    # Let's try open vocabulary detection for 'stamp'
    print("🔍 Running Open Vocabulary Detection for 'stamp'...")
    # The image-to-text pipeline has no `text_input` keyword (passing one
    # raises TypeError). Per the Florence-2 model card, the query text is
    # appended directly to the task token to form the prompt.
    # NOTE(review): the pipeline returns the raw generated text; parsing it
    # into boxes presumably needs the processor's post-processing - confirm.
    result_ood = pipe(raw_path, prompt="<OPEN_VOCABULARY_DETECTION>stamp")
    print(f"📦 OOD Result: {result_ood}")

except Exception as e:
    # Top-level boundary of a best-effort experiment: model download, loading
    # and inference can fail in many ways, so report the error and finish.
    print(f"❌ Error: {e}")

print("🏁 Done.")