# momentry_core/scripts/test_florence2_direct.py

#!/opt/homebrew/bin/python3.11
"""
Test Florence-2 for "Stamps" Detection (Robust Patch for Transformers 4.57.6)
"""

import os
import cv2
import torch
import types
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

# Identifier of the processed video; input and output paths are derived from it.
UUID = "384b0ff44aaaa1f1"
VIDEO_PATH = f"output/{UUID}/{UUID}.mp4"
OUTPUT_DIR = f"output/{UUID}/florence2_results"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Position (in seconds) of the frame to analyse: a point in the video where
# "stamp" is heavily discussed.
TIMESTAMP = 6846.0

print(f"📽️ Extracting frame at {TIMESTAMP}s...")
cap = cv2.VideoCapture(VIDEO_PATH)
try:
    if not cap.isOpened():
        # Distinguish "cannot open the file" from "seek/read failed" so the
        # operator knows whether the path or the timestamp is at fault.
        print(f"❌ Failed to open video: {VIDEO_PATH}")
        raise SystemExit(1)
    # CAP_PROP_POS_MSEC is expressed in milliseconds; TIMESTAMP is in seconds.
    cap.set(cv2.CAP_PROP_POS_MSEC, TIMESTAMP * 1000)
    ret, frame = cap.read()
finally:
    # Always free the capture handle, even if seeking or reading raises.
    cap.release()

if not ret:
    print("❌ Failed to read frame.")
    # `exit()` is the interactive helper injected by `site` and exits with
    # status 0; raise SystemExit with a non-zero code so the failure is visible
    # to the calling shell.
    raise SystemExit(1)

# Save raw frame
raw_path = os.path.join(OUTPUT_DIR, f"raw_{int(TIMESTAMP)}.jpg")
# cv2.imwrite reports failure through its boolean return value; without this
# check the script would claim success and fail later when reopening the file.
if not cv2.imwrite(raw_path, frame):
    print(f"❌ Failed to write frame to {raw_path}")
    raise SystemExit(1)
print(f"💾 Raw frame saved to {raw_path}")

print("🧠 Loading Florence-2 model...")
try:
    checkpoint = "microsoft/Florence-2-base"
    processor = AutoProcessor.from_pretrained(checkpoint, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        checkpoint, trust_remote_code=True, attn_implementation="eager"
    )

    # PATCH: compatibility shim for transformers 4.57.6.
    # `past_key_values` may arrive as [None] on the first step, which the
    # model code does not tolerate.
    print("🔧 Patching model to fix past_key_values handling...")
    language_model = model.language_model
    # Bound method captured before it is replaced; used as the fallback below.
    upstream_prepare = language_model.prepare_inputs_for_generation

    def patched_prepare(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        **kwargs,
    ):
        """Stand-in for the language model's ``prepare_inputs_for_generation``.

        When ``past_key_values`` is a non-empty list/tuple whose first entry
        is not ``None``, the call is forwarded unchanged to the original bound
        method. In every other case the step is treated as the first one and
        a minimal input dict is returned with the cache reset to ``None``.
        """
        cache_is_usable = (
            isinstance(past_key_values, (list, tuple))
            and len(past_key_values) > 0
            and past_key_values[0] is not None
        )

        if cache_is_usable:
            return upstream_prepare(
                input_ids,
                past_key_values=past_key_values,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                **kwargs,
            )

        # First-step path. `inputs_embeds` is deliberately left out so the
        # model never receives both `input_ids` and `inputs_embeds`
        # ("You cannot specify both input_ids and inputs_embeds at the same
        # time"). Any other keyword arguments are not forwarded either.
        # NOTE(review): a cache that is not a list/tuple also lands here;
        # confirm that is intended for the installed transformers version.
        first_step_inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "past_key_values": None,
            "use_cache": kwargs.get("use_cache", True),
        }
        return first_step_inputs

    language_model.prepare_inputs_for_generation = types.MethodType(
        patched_prepare, language_model
    )
    print("✅ Patch applied.")

    image = Image.open(raw_path).convert("RGB")
    prompt = "<OPEN_VOCABULARY_DETECTION>"
    text_input = "stamp"

    print(f"🔍 Running detection for '{text_input}'...")

    # The task token and the query are concatenated into a single text prompt
    # ("<OPEN_VOCABULARY_DETECTION>stamp"). Per the original author's notes,
    # passing `text_input` as a separate processor argument raised an
    # argument error, hence the combined form.
    combined_prompt = prompt + text_input
    model_inputs = processor(text=combined_prompt, images=image, return_tensors="pt")

    # Deterministic beam search (no sampling) over at most 1024 new tokens.
    generation_kwargs = {
        "input_ids": model_inputs["input_ids"],
        "pixel_values": model_inputs["pixel_values"],
        "max_new_tokens": 1024,
        "do_sample": False,
        "num_beams": 3,
    }
    output_ids = model.generate(**generation_kwargs)

    # Special tokens are kept in the decoded text that is handed to the
    # post-processing step below.
    decoded_batch = processor.batch_decode(output_ids, skip_special_tokens=False)
    generated_text = decoded_batch[0]
    print(f"📝 Raw Output: {generated_text}")

    # Structured parsing is best-effort: the raw text has already been printed,
    # so a parsing failure is only reported and does not abort the script.
    try:
        frame_size = (image.width, image.height)
        parsed_answer = processor.post_process_generation(
            generated_text, task=prompt, image_size=frame_size
        )
        print(f"📦 Parsed Result: {parsed_answer}")
    except Exception as parse_error:
        print(f"⚠️ Parsing failed (Raw text is above): {parse_error}")

except Exception as error:
    # Top-level boundary of the script: report the failure with a full
    # traceback instead of letting it propagate.
    print(f"❌ Error: {error}")
    import traceback

    traceback.print_exc()

print("🏁 Done.")