#!/opt/homebrew/bin/python3.11
"""
Deep Analysis of 112:36 Frame

1. Detailed Captioning
2. Search for "Envelope" and "Hand holding object"

The script loads one frame from ``BASE_DIR``, asks Florence-2 for a caption,
then runs one open-vocabulary detection query per search term. Matching
boxes are cropped to individual files and drawn onto an annotated copy of
the frame, which is written next to the input image.
"""

import os
import sys
import types

import cv2
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

UUID = "384b0ff44aaaa1f1"
BASE_DIR = f"output/{UUID}/florence2_results"
IMG_NAME = "scan_6756.jpg"  # 112:36
IMG_PATH = os.path.join(BASE_DIR, IMG_NAME)

MODEL_ID = "microsoft/Florence-2-base"

# Florence-2 selects its task from a special token at the start of the prompt.
# NOTE(review): the caption variant is chosen to match the "Detailed Caption"
# wording used by this script; switch to "<MORE_DETAILED_CAPTION>" if a longer
# description is wanted.
CAPTION_TASK = "<DETAILED_CAPTION>"
OVD_TASK = "<OPEN_VOCABULARY_DETECTION>"

SEARCH_TERMS = ["envelope", "letter", "hand holding paper", "stamp", "small paper"]


def _has_usable_cache(past_key_values):
    """Return True if *past_key_values* is a non-empty list/tuple whose first
    entry is neither None nor an empty list/tuple."""
    if not isinstance(past_key_values, (list, tuple)) or not past_key_values:
        return False
    first_layer = past_key_values[0]
    if first_layer is None:
        return False
    return not isinstance(first_layer, (list, tuple)) or len(first_layer) > 0


# Patch for compatibility
def patch_model(model):
    """Wrap ``model.language_model.prepare_inputs_for_generation``.

    The wrapper only forwards to the original implementation when
    ``past_key_values`` looks like a populated list/tuple cache (see
    ``_has_usable_cache``). Otherwise it returns a minimal input dict with
    ``past_key_values`` set to None and ``use_cache`` set to True.

    NOTE(review): presumably this works around a cache-format mismatch
    between the remote Florence-2 code and the installed transformers
    version -- confirm against the versions in use.
    """
    inner_model = model.language_model
    # Bound method: it is called below without passing ``self`` again.
    original_prepare = inner_model.prepare_inputs_for_generation

    def patched_prepare(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        **kwargs,
    ):
        if _has_usable_cache(past_key_values):
            return original_prepare(
                input_ids,
                past_key_values=past_key_values,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                **kwargs,
            )
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "past_key_values": None,
            "use_cache": True,
        }

    inner_model.prepare_inputs_for_generation = types.MethodType(
        patched_prepare, inner_model
    )


def _generate_text(processor, model, image, prompt, *, skip_special_tokens):
    """Run one Florence-2 generation for *prompt* on *image*; return the decoded text."""
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
    )
    return processor.batch_decode(
        generated_ids, skip_special_tokens=skip_special_tokens
    )[0]


def _clamp_box(box, width, height):
    """Convert *box* to ints and clip it to a ``width`` x ``height`` image."""
    x1, y1, x2, y2 = map(int, box)
    x1, x2 = (max(0, min(v, width)) for v in (x1, x2))
    y1, y2 = (max(0, min(v, height)) for v in (y1, y2))
    return x1, y1, x2, y2


def _detect_term(processor, model, image, term, source_bgr, annotated_bgr):
    """Detect *term* in *image*, save matching crops and draw them on *annotated_bgr*.

    Crops are cut from *source_bgr* (never drawn on), so they do not contain
    rectangles or labels from earlier detections.
    """
    print(f"\nšŸ” Detecting '{term}'...")
    # Open-vocabulary detection takes the phrase to locate right after the
    # task token; without it every search term would run the same query.
    generated_text = _generate_text(
        processor, model, image, OVD_TASK + term, skip_special_tokens=False
    )

    # Best effort per term: a failure here must not stop the remaining terms.
    try:
        parsed_answer = processor.post_process_generation(
            generated_text, task=OVD_TASK, image_size=(image.width, image.height)
        )
        results = parsed_answer.get(OVD_TASK, {})
        bboxes = results.get("bboxes", [])
        labels = results.get("bboxes_labels", [])

        if not bboxes:
            print(" āŒ Not found.")
            return

        print(f" āœ… Found '{term}': {labels}")
        height, width = source_bgr.shape[:2]
        for i, (box, label) in enumerate(zip(bboxes, labels)):
            label_lc = label.lower()
            if term.lower() not in label_lc and not (
                term == "envelope" and "paper" in label_lc
            ):
                continue

            x1, y1, x2, y2 = _clamp_box(box, width, height)
            print(f" šŸ“ Box: ({x1},{y1}) -> ({x2},{y2})")

            # Crop (skipped for degenerate boxes: cv2.imwrite rejects an
            # empty array, which would abort the remaining boxes).
            crop = source_bgr[y1:y2, x1:x2]
            if crop.size:
                crop_path = os.path.join(
                    BASE_DIR, f"crop_deep_{term.replace(' ', '_')}_{i}.jpg"
                )
                cv2.imwrite(crop_path, crop)

            # Draw
            cv2.rectangle(annotated_bgr, (x1, y1), (x2, y2), (0, 255, 0), 3)
            cv2.putText(
                annotated_bgr,
                label,
                # Keep the text baseline inside the image for boxes that
                # touch the top edge.
                (x1, max(y1 - 10, 30)),
                cv2.FONT_HERSHEY_SIMPLEX,
                1,
                (0, 255, 0),
                2,
            )
    except Exception as e:
        print(f" āš ļø Error: {e}")


def main():
    """Run captioning and detection on ``IMG_PATH``.

    Returns:
        0 on success, 1 if the image is missing or the analysis failed.
    """
    print(f"šŸ“· Loading image: {IMG_PATH}")
    if not os.path.exists(IMG_PATH):
        print("āŒ Image not found.")
        return 1

    image = Image.open(IMG_PATH).convert("RGB")

    print("🧠 Loading Florence-2 model...")
    try:
        processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID, trust_remote_code=True, attn_implementation="eager"
        )
        patch_model(model)

        # 1. Detailed Caption
        print("\nšŸ“ Generating Detailed Caption...")
        generated_text = _generate_text(
            processor, model, image, CAPTION_TASK, skip_special_tokens=True
        )
        print(f"šŸ—£ļø Caption: {generated_text}")

        # 2. Object Detection for specific items
        source_bgr = cv2.imread(IMG_PATH)
        if source_bgr is None:
            # cv2.imread signals failure by returning None, not by raising.
            raise RuntimeError(f"OpenCV could not read {IMG_PATH}")
        annotated_bgr = source_bgr.copy()

        for term in SEARCH_TERMS:
            _detect_term(processor, model, image, term, source_bgr, annotated_bgr)

        res_path = os.path.join(BASE_DIR, "deep_analysis_result.jpg")
        cv2.imwrite(res_path, annotated_bgr)
        print(f"\nšŸŽØ Result saved to {res_path}")
    except Exception as e:
        # Top-level boundary: report the failure and exit non-zero.
        print(f"āŒ Error: {e}")
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())