chore: backup before migration to new repo

2026-04-23 16:46:02 +08:00
parent 13dd3b30f3
commit 59809dae1f
40 changed files with 5566 additions and 1783 deletions
@@ -65,12 +65,20 @@ def run_asr(video_path, output_path, uuid: str = ""):
    if publisher:
        publisher.info("asr", "Loading Whisper model...")

-    model = WhisperModel("tiny", device="cpu", compute_type="int8")
+    # Use the small model on CPU (MPS is not supported by faster_whisper).
+    # The small model offers the best balance between accuracy and speed.
+    model = WhisperModel("small", device="cpu", compute_type="int8")

    if publisher:
        publisher.info("asr", f"Transcribing: {video_path}")

-    segments, info = model.transcribe(video_path, beam_size=5)
+    # Transcribe with VAD filter for better accuracy
+    segments, info = model.transcribe(
+        video_path,
+        beam_size=5,
+        vad_filter=True,
+        vad_parameters=dict(min_silence_duration_ms=500, speech_pad_ms=200),
+    )

    if publisher:
        publisher.info("asr", f"ASR_LANGUAGE:{info.language}")
@@ -22,6 +22,7 @@ def process_asrx(video_path: str, output_path: str, uuid: str = ""):

    try:
        import whisperx
+        import torch
    except ImportError:
        if publisher:
            publisher.error("asrx", "whisperx not installed")
@@ -36,6 +37,14 @@ def process_asrx(video_path: str, output_path: str, uuid: str = ""):
        publisher.info("asrx", "ASRX_LOADING_MODEL")

    try:
+        # PyTorch 2.6+ compatibility: torch.load now defaults to weights_only=True,
+        # so allowlist the omegaconf types it would otherwise refuse to unpickle.
+        import omegaconf
+
+        torch.serialization.add_safe_globals(
+            [omegaconf.listconfig.ListConfig, omegaconf.dictconfig.DictConfig]
+        )
+
        # Load model - using faster-whisper for better performance
        # You can also use: "large-v3", "medium", "small", "base", "tiny"
        model = whisperx.load_model("base", device="cpu", compute_type="int8")
@@ -54,9 +63,14 @@ def process_asrx(video_path: str, output_path: str, uuid: str = ""):

        # Diarization (speaker segmentation)
        try:
-            import whisperx
+            from whisperx.diarize import DiarizationPipeline

-            diarize_model = whisperx.DiarizationPipeline(use_auth_token=None)
+            # DiarizationPipeline parameters: model_name, token, device, cache_dir
+            diarize_model = DiarizationPipeline(
+                model_name="pyannote/speaker-diarization",
+                token=None,  # HuggingFace token (None for public models)
+                device="cpu",
+            )
            diarize_segments = diarize_model(video_path)

            # Assign speaker labels
@@ -1,7 +1,8 @@
 #!/opt/homebrew/bin/python3.11
 """
-Caption Processor - Generate image captions
-Uses AI vision models to analyze video frames and generate descriptions
+Caption Processor - Generate image captions (LOCAL ONLY)
+Uses Moondream2 (local VLM) for image captioning
+No cloud API calls - fully offline processing
 """

 import sys
@@ -18,7 +19,6 @@ from redis_publisher import RedisPublisher
 def extract_frames(video_path: str, max_frames: int = 30) -> List[Dict]:
    """Extract frames from video at regular intervals"""

-    # Get video duration
    cmd = [
        "ffprobe",
        "-v",
@@ -34,14 +34,13 @@ def extract_frames(video_path: str, max_frames: int = 30) -> List[Dict]:
            data = json.loads(result.stdout)
            duration = float(data.get("format", {}).get("duration", 0))
        else:
-            duration = 60  # Default fallback
+            duration = 60
    except Exception:
        duration = 60

    if duration <= 0:
        duration = 60

-    # Calculate frame interval
    interval = max(duration / max_frames, 1.0)

    frames = []
@@ -76,94 +75,73 @@ def extract_frames(video_path: str, max_frames: int = 30) -> List[Dict]:
    return frames


-def generate_caption_with_llava(
+def generate_caption_with_moondream(
    image_path: str, prompt: str = "Describe this image in detail."
 ) -> Optional[str]:
-    """Generate caption using LLaVA model"""
+    """Generate caption using Moondream2 (local VLM)"""
    try:
-        # Try to use transformers with LLaVA
-        from transformers import AutoProcessor, AutoModelForVision2Seq  # noqa: F401
-        import torch  # noqa: F401
-        from PIL import Image  # noqa: F401
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+        from PIL import Image
+        import torch

-        # Note: This requires llava-hf/llava-1.5-7b-hf or similar
-        # For now, return a placeholder
-        return f"[LLaVA caption for {os.path.basename(image_path)}]"
+        model_id = "vikhyatk/moondream2"
+        revision = "2025-01-09"
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id, revision=revision, trust_remote_code=True
+        )
+        moondream = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            revision=revision,
+            trust_remote_code=True,
+            torch_dtype=torch.float16,
+        ).to("mps" if torch.backends.mps.is_available() else "cpu")
+
+        moondream.eval()
+
+        image = Image.open(image_path)
+        enc_image = moondream.encode_image(image)
+        caption = moondream.answer_question(enc_image, prompt, tokenizer)
+
+        return caption if caption else None
    except ImportError:
        return None
-
-
-def generate_caption_with_gpt4v(image_path: str, api_key: str = None) -> Optional[str]:
-    """Generate caption using GPT-4V via OpenAI API"""
-    import base64
-
-    if not api_key:
-        api_key = os.environ.get("OPENAI_API_KEY")
-
-    if not api_key:
-        return None
-
-    try:
-        from openai import OpenAI
-
-        client = OpenAI(api_key=api_key)
-
-        # Encode image
-        with open(image_path, "rb") as f:
-            img_data = base64.b64encode(f.read()).decode()
-
-        response = client.chat.completions.create(
-            model="gpt-4o",  # or gpt-4-turbo for vision
-            messages=[
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "image_url",
-                            "image_url": {"url": f"data:image/jpeg;base64,{img_data}"},
-                        },
-                        {
-                            "type": "text",
-                            "text": "Describe what you see in this image in one sentence.",
-                        },
-                    ],
-                }
-            ],
-            max_tokens=100,
-        )
-
-        return response.choices[0].message.content
-    except Exception:
+    except Exception as e:
+        print(f"[CAPTION] Moondream error: {e}")
        return None


-def generate_caption_fallback(image_path: str, existing_data: Dict = None) -> str:
-    """Generate a basic caption using available metadata"""
+def generate_caption_from_metadata(image_path: str, existing_data: Dict = None) -> str:
+    """Generate caption using YOLO/OCR metadata (fallback)"""

    caption_parts = []

-    # Check YOLO data for objects
    if existing_data and existing_data.get("objects"):
        objects = list(set([o["class"] for o in existing_data["objects"]]))[:5]
        if objects:
-            caption_parts.append(f"Contains: {', '.join(objects)}")
+            caption_parts.append(f"Objects: {', '.join(objects)}")

-    # Check OCR data for text
    if existing_data and existing_data.get("texts"):
        texts = [t["text"] for t in existing_data["texts"] if t.get("text")]
        if texts:
-            caption_parts.append(f"On-screen text: {' '.join(texts[:3])}")
+            caption_parts.append(f"Text: {' '.join(texts[:3])}")
+
+    if existing_data and existing_data.get("scene_type"):
+        caption_parts.append(f"Scene: {existing_data['scene_type']}")

    if caption_parts:
        return " | ".join(caption_parts)

-    return "Video frame at timestamp"
+    return "Video frame"


 def process_frame(
-    frame_info: Dict, yolo_data: List = None, ocr_data: List = None
+    frame_info: Dict,
+    yolo_data: List = None,
+    ocr_data: List = None,
+    scene_data: Dict = None,
 ) -> Dict:
-    """Process a single frame and generate caption"""
+    """Process a single frame and generate caption (LOCAL ONLY)"""

    frame_path = frame_info["path"]
    timestamp = frame_info["timestamp"]
@@ -171,28 +149,34 @@ def process_frame(
    caption = None
    source = "unknown"

-    # Try GPT-4V first
-    caption = generate_caption_with_gpt4v(frame_path)
+    # Try Moondream2 (local VLM)
+    caption = generate_caption_with_moondream(frame_path)
    if caption:
-        source = "gpt-4v"
+        source = "moondream2"
    else:
-        # Try LLaVA
-        caption = generate_caption_with_llava(frame_path)
-        if caption:
-            source = "llava"
-        else:
-            # Use fallback with YOLO/OCR data
-            combined_data = {"objects": [], "texts": []}
-            if yolo_data:
-                combined_data["objects"] = [
-                    o for o in yolo_data if o.get("timestamp") == timestamp
-                ]
-            if ocr_data:
-                combined_data["texts"] = [
-                    t for t in ocr_data if t.get("timestamp") == timestamp
-                ]
-            caption = generate_caption_fallback(frame_path, combined_data)
-            source = "metadata"
+        # Fallback: Use metadata from YOLO/OCR/Scene
+        combined_data = {"objects": [], "texts": [], "scene_type": ""}
+
+        if yolo_data:
+            combined_data["objects"] = [
+                o for o in yolo_data if o.get("timestamp") == timestamp
+            ]
+
+        if ocr_data:
+            combined_data["texts"] = [
+                t for t in ocr_data if t.get("timestamp") == timestamp
+            ]
+
+        if scene_data:
+            for scene in scene_data.get("scenes", []):
+                if scene.get("start_time", 0) <= timestamp <= scene.get("end_time", 0):
+                    combined_data["scene_type"] = scene.get(
+                        "scene_type_zh"
+                    ) or scene.get("scene_type", "")
+                    break
+
+        caption = generate_caption_from_metadata(frame_path, combined_data)
+        source = "metadata"

    return {
        "index": frame_info["index"],
@@ -212,24 +196,22 @@ def run_caption(
    if publisher:
        publisher.info("caption", "Extracting frames from video...")

-    # Extract frames
    frames = extract_frames(video_path, max_frames)

    if publisher:
        publisher.info("caption", f"Extracted {len(frames)} frames")

-    # Load YOLO and OCR data for context
    base_path = os.path.dirname(output_path)
    uuid_name = os.path.basename(output_path).split(".")[0]

    yolo_objects = []
    ocr_texts = []
+    scene_info = {}

    yolo_path = os.path.join(base_path, f"{uuid_name}.yolo.json")
    if os.path.exists(yolo_path):
        with open(yolo_path) as f:
            yolo_data = json.load(f)
-            # Flatten objects from all frames
            for frame in yolo_data.get("frames", []):
                for obj in frame.get("objects", []):
                    obj["timestamp"] = frame.get("timestamp", 0)
@@ -244,7 +226,11 @@ def run_caption(
                    text["timestamp"] = frame.get("timestamp", 0)
                    ocr_texts.append(text)

-    # Process each frame
+    scene_path = os.path.join(base_path, f"{uuid_name}.scene.json")
+    if os.path.exists(scene_path):
+        with open(scene_path) as f:
+            scene_info = json.load(f)
+
    captions = []
    for i, frame in enumerate(frames):
        if publisher and i % 5 == 0:
@@ -252,16 +238,14 @@ def run_caption(
                "caption", i, len(frames), f"Frame {i + 1}/{len(frames)}"
            )

-        caption_data = process_frame(frame, yolo_objects, ocr_texts)
+        caption_data = process_frame(frame, yolo_objects, ocr_texts, scene_info)
        captions.append(caption_data)

-        # Cleanup temp frame
        try:
            os.remove(frame["path"])
        except Exception:
            pass

-    # Cleanup temp directory
    temp_dir = os.path.join(os.path.dirname(video_path), ".caption_frames")
    try:
        os.rmdir(temp_dir)
@@ -275,9 +259,11 @@ def run_caption(
        "summary": {
            "avg_caption_length": sum(len(c.get("caption", "")) for c in captions)
            / max(len(captions), 1),
-            "gpt4v_count": sum(1 for c in captions if c.get("source") == "gpt-4v"),
-            "llava_count": sum(1 for c in captions if c.get("source") == "llava"),
+            "moondream_count": sum(
+                1 for c in captions if c.get("source") == "moondream2"
+            ),
            "metadata_count": sum(1 for c in captions if c.get("source") == "metadata"),
+            "cloud_api_count": 0,
        },
    }

@@ -285,13 +271,13 @@ def run_caption(
        json.dump(result, f, indent=2, ensure_ascii=False)

    if publisher:
-        publisher.complete("caption", f"{len(captions)} frames captioned")
+        publisher.complete("caption", f"{len(captions)} frames captioned (LOCAL)")

    return result


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Video Caption Generator")
+    parser = argparse.ArgumentParser(description="Video Caption Generator (LOCAL ONLY)")
    parser.add_argument("video_path", help="Path to video file")
    parser.add_argument("output_path", help="Output JSON path")
    parser.add_argument("--uuid", help="UUID for progress tracking", default="")
@@ -302,4 +288,4 @@ if __name__ == "__main__":
    args = parser.parse_args()

    result = run_caption(args.video_path, args.output_path, args.uuid, args.max_frames)
-    print(f"Caption generated: {result['total_frames']} frames")
+    print(f"Caption generated: {result['total_frames']} frames (LOCAL)")
@@ -1,8 +1,8 @@
 #!/opt/homebrew/bin/python3.11
 """
-Face Processor - Face Detection
-Uses OpenCV Haar Cascade (local, no extra download needed)
-Alternative: MediaPipe (requires model download)
+Face Processor - Face Detection & Demographics
+Uses InsightFace for detection, age, and gender analysis.
+Falls back to OpenCV Haar Cascade if InsightFace fails.
 """

 import sys
@@ -15,7 +15,7 @@ from redis_publisher import RedisPublisher


 def process_face(video_path: str, output_path: str, uuid: str = ""):
-    """Process video for face detection"""
+    """Process video for face detection and demographics analysis"""

    publisher = RedisPublisher(uuid) if uuid else None
    if publisher:
@@ -23,56 +23,82 @@ def process_face(video_path: str, output_path: str, uuid: str = ""):

    try:
        import cv2
-    except ImportError:
+        import numpy as np
+        import insightface
+    except ImportError as e:
+        error_msg = f"Missing dependency: {e.name}"
        if publisher:
-            publisher.error("face", "opencv-python not installed")
+            publisher.error("face", error_msg)
        result = {"frame_count": 0, "fps": 0.0, "frames": []}
-        if publisher:
-            publisher.complete("face", "0 frames")
        with open(output_path, "w") as f:
            json.dump(result, f, indent=2)
        return result

-    if publisher:
-        publisher.info("face", "FACE_LOADING_CASCADE")
-
-    # Try to use OpenCV's built-in Haar Cascade
-    # This is included with OpenCV
-    face_cascade = cv2.CascadeClassifier(
-        cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
-    )
-
-    if face_cascade.empty():
+    # 1. Initialize InsightFace
+    use_insightface = False
+    app = None
+    try:
        if publisher:
-            publisher.error("face", "Could not load Haar Cascade")
-        result = {"frame_count": 0, "fps": 0.0, "frames": []}
+            publisher.info("face", "LOADING_INSIGHTFACE")
+        # 'buffalo_l' is a robust model. det_size can be adjusted.
+        app = insightface.app.FaceAnalysis(
+            name="buffalo_l", providers=["CPUExecutionProvider"]
+        )
+        app.prepare(ctx_id=0, det_size=(320, 320))
+        use_insightface = True
        if publisher:
-            publisher.complete("face", "0 frames")
-        with open(output_path, "w") as f:
-            json.dump(result, f, indent=2)
-        return result
+            publisher.info("face", "INSIGHTFACE_LOADED")
+    except Exception as e:
+        print(f"[WARNING] InsightFace failed to load: {e}")
+        use_insightface = False
+
+    # 2. Fallback to Haar Cascade
+    face_cascade = None
+    if not use_insightface:
+        if publisher:
+            publisher.info("face", "LOADING_HAAR_CASCADE")
+        face_cascade = cv2.CascadeClassifier(
+            cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
+        )
+        if face_cascade.empty():
+            if publisher:
+                publisher.error("face", "Could not load Haar Cascade")
+            result = {"frame_count": 0, "fps": 0.0, "frames": []}
+            with open(output_path, "w") as f:
+                json.dump(result, f, indent=2)
+            return result
+        if publisher:
+            publisher.info("face", "HAAR_CASCADE_LOADED")

    if publisher:
-        publisher.info("face", "FACE_CASCADE_LOADED")
+        publisher.info("face", "PROCESSING_VIDEO")

-    # Get video info
    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        if publisher:
+            publisher.error("face", "Could not open video")
+        result = {"frame_count": 0, "fps": 0.0, "frames": []}
+        with open(output_path, "w") as f:
+            json.dump(result, f, indent=2)
+        return result
+
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-    cap.release()
+
+    # Optimization: analyze only every Nth frame to speed up processing.
+    # Per-person attributes (age, gender) do not require every single frame.
+    sample_interval = 30
+    if total_frames > 0:
+        estimated_samples = total_frames // sample_interval
+    else:
+        estimated_samples = 0
+
+    frame_count = 0
+    processed_count = 0
+    frames_data = []

    if publisher:
-        publisher.info("face", f"fps={fps}, frames={total_frames}")
-        publisher.progress("face", 0, total_frames, "Starting")
-
-    # Process every N frames to speed up
-    sample_interval = 30  # Process every 30 frames
-
-    frames = []
-    frame_count = 0
-    processed = 0
-
-    cap = cv2.VideoCapture(video_path)
+        publisher.progress("face", 0, estimated_samples, "Starting")

    while True:
        ret, frame = cap.read()
@@ -81,62 +107,92 @@ def process_face(video_path: str, output_path: str, uuid: str = ""):

        frame_count += 1

-        # Sample frames
+        # Sampling
        if frame_count % sample_interval != 0:
            continue

-        processed += 1
+        processed_count += 1
        timestamp = (frame_count - 1) / fps if fps > 0 else 0

-        # Convert to grayscale
-        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
-
-        # Detect faces
-        try:
-            faces = face_cascade.detectMultiScale(
-                gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
-            )
-        except Exception as e:
-            if publisher:
-                publisher.error("face", f"Frame {frame_count}: {e}")
-            faces = []
-
        face_list = []
-        for x, y, w, h in faces:
-            face_list.append(
-                {
-                    "face_id": None,
-                    "x": int(x),
-                    "y": int(y),
-                    "width": int(w),
-                    "height": int(h),
-                    "confidence": 0.8,  # Haar cascade doesn't provide confidence
-                }
-            )

-        # Only add frames with faces
+        try:
+            if use_insightface and app:
+                # InsightFace Detection & Analysis
+                faces = app.get(frame)
+                for face in faces:
+                    bbox = face.bbox.astype(int)
+                    bx, by, bw, bh = (
+                        bbox[0],
+                        bbox[1],
+                        bbox[2] - bbox[0],
+                        bbox[3] - bbox[1],
+                    )
+
+                    # Extract Attributes
+                    age = int(face.age) if hasattr(face, "age") else None
+                    gender_val = face.gender if hasattr(face, "gender") else None
+                    gender = (
+                        "female"
+                        if gender_val == 0
+                        else ("male" if gender_val == 1 else None)
+                    )
+
+                    face_list.append(
+                        {
+                            "x": int(bx),
+                            "y": int(by),
+                            "width": int(bw),
+                            "height": int(bh),
+                            "confidence": float(face.det_score)
+                            if hasattr(face, "det_score")
+                            else 0.9,
+                            "attributes": {"age": age, "gender": gender},
+                        }
+                    )
+            else:
+                # Haar Cascade Fallback (No Age/Gender)
+                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+                faces = face_cascade.detectMultiScale(
+                    gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
+                )
+                for x, y, w, h in faces:
+                    face_list.append(
+                        {
+                            "x": int(x),
+                            "y": int(y),
+                            "width": int(w),
+                            "height": int(h),
+                            "confidence": 0.8,
+                            "attributes": {"age": None, "gender": None},
+                        }
+                    )
+        except Exception as e:
+            print(f"[ERROR] Frame processing error: {e}")
+
        if face_list:
-            frames.append(
+            frames_data.append(
                {
                    "frame": frame_count - 1,
                    "timestamp": round(timestamp, 3),
                    "faces": face_list,
                }
            )
+
            if publisher:
                publisher.progress(
                    "face",
-                    processed,
-                    total_frames // sample_interval,
+                    processed_count,
+                    estimated_samples,
                    f"Frame {frame_count}",
                )

    cap.release()

-    result = {"frame_count": total_frames, "fps": fps, "frames": frames}
+    result = {"frame_count": total_frames, "fps": fps, "frames": frames_data}

    if publisher:
-        publisher.complete("face", f"{len(frames)} frames with faces")
+        publisher.complete("face", f"{len(frames_data)} frames processed")

    with open(output_path, "w") as f:
        json.dump(result, f, indent=2)
@@ -145,7 +201,7 @@ def process_face(video_path: str, output_path: str, uuid: str = ""):


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Face Detection")
+    parser = argparse.ArgumentParser(description="Face Detection & Demographics")
    parser.add_argument("video_path", help="Path to video file")
    parser.add_argument("output_path", help="Output JSON path")
    parser.add_argument("--uuid", "-u", help="UUID for Redis progress", default="")
@@ -1,382 +1,367 @@
 {
-  "0": "airplane_cabin",
-  "1": "airport_terminal",
-  "2": "alley",
-  "3": "amphitheater",
-  "4": "amusement_park",
-  "5": "apartment_building_outdoor",
-  "6": "aquarium",
-  "7": "arcade",
-  "8": "arena_hockey",
-  "9": "arena_performance",
-  "10": "army_base",
-  "11": "art_gallery",
-  "12": "art_studio",
-  "13": "assembly_line",
-  "14": "athletic_field_outdoor",
-  "15": "atrium_public",
-  "16": "attic",
-  "17": "auditorium",
-  "18": "auto_factory",
-  "19": "backyard",
-  "20": "badminton_court_indoor",
-  "21": "baggage_claim",
-  "22": "bakery_shop",
-  "23": "balcony_exterior",
-  "24": "balcony_interior",
-  "25": "ball_pit",
-  "26": "ballroom",
-  "27": "bamboo_forest",
-  "28": "banquet_hall",
-  "29": "bar",
-  "30": "barn",
-  "31": "barndoor",
-  "32": "baseball_field",
-  "33": "basement",
-  "34": "basilica",
-  "35": "basketball_court_indoor",
-  "36": "basketball_court_outdoor",
-  "37": "bathroom",
-  "38": "bazaar_indoor",
-  "39": "bazaar_outdoor",
-  "40": "beach",
-  "41": "beauty_salon",
-  "42": "bedroom",
-  "43": "berth",
-  "44": "biology_laboratory",
-  "45": "boardwalk",
-  "46": "boat_deck",
-  "47": "boathouse",
-  "48": "bookstore",
-  "49": "booth_indoor",
-  "50": "botanical_garden",
-  "51": "bow_window_indoor",
-  "52": "bow_window_outdoor",
-  "53": "bowling_alley",
-  "54": "boxing_ring",
-  "55": "brewery_indoor",
-  "56": "bridge",
-  "57": "building_facade",
-  "58": "bullring",
-  "59": "burial_chamber",
-  "60": "bus_interior",
-  "61": "bus_station_indoor",
-  "62": "butchers_shop",
-  "63": "butte",
-  "64": "cabin_outdoor",
-  "65": "cafeteria",
-  "66": "campsite",
-  "67": "campus",
-  "68": "canal_natural",
-  "69": "canal_urban",
-  "70": "candy_store",
-  "71": "canyon",
-  "72": "car_interior",
-  "73": "carrousel",
-  "74": "castle",
-  "75": "catacomb",
-  "76": "cathedral_indoor",
-  "77": "cathedral_outdoor",
-  "78": "cavern_indoor",
-  "79": "cemetery",
-  "80": "chalet",
-  "81": "cheese_factory",
-  "82": "chemistry_lab",
-  "83": "chicken_coop_indoor",
-  "84": "chicken_coop_outdoor",
-  "85": "childs_room",
-  "86": "church_indoor",
-  "87": "church_outdoor",
-  "88": "classroom",
-  "89": "clean_room",
-  "90": "cliff",
-  "91": "cloister_indoor",
-  "92": "closet",
-  "93": "clothing_store",
-  "94": "coast",
-  "95": "cockpit",
-  "96": "coffee_shop",
-  "97": "computer_room",
-  "98": "conference_center",
-  "99": "conference_room",
-  "100": "construction_site",
-  "101": "control_room",
-  "102": "control_tower_outdoor",
-  "103": "corn_field",
-  "104": "corral",
-  "105": "corridor",
-  "106": "cottage_garden",
-  "107": "courthouse",
-  "108": "courtroom",
+  "0": "airfield",
+  "1": "airplane_cabin",
+  "2": "airport_terminal",
+  "3": "alcove",
+  "4": "alley",
+  "5": "amphitheater",
+  "6": "amusement_arcade",
+  "7": "amusement_park",
+  "8": "outdoor",
+  "9": "aquarium",
+  "10": "aqueduct",
+  "11": "arcade",
+  "12": "arch",
+  "13": "archaelogical_excavation",
+  "14": "archive",
+  "15": "hockey",
+  "16": "performance",
+  "17": "rodeo",
+  "18": "army_base",
+  "19": "art_gallery",
+  "20": "art_school",
+  "21": "art_studio",
+  "22": "artists_loft",
+  "23": "assembly_line",
+  "24": "outdoor",
+  "25": "public",
+  "26": "attic",
+  "27": "auditorium",
+  "28": "auto_factory",
+  "29": "auto_showroom",
+  "30": "badlands",
+  "31": "shop",
+  "32": "exterior",
+  "33": "interior",
+  "34": "ball_pit",
+  "35": "ballroom",
+  "36": "bamboo_forest",
+  "37": "bank_vault",
+  "38": "banquet_hall",
+  "39": "bar",
+  "40": "barn",
+  "41": "barndoor",
+  "42": "baseball_field",
+  "43": "basement",
+  "44": "indoor",
+  "45": "bathroom",
+  "46": "indoor",
+  "47": "outdoor",
+  "48": "beach",
+  "49": "beach_house",
+  "50": "beauty_salon",
+  "51": "bedchamber",
+  "52": "bedroom",
+  "53": "beer_garden",
+  "54": "beer_hall",
+  "55": "berth",
+  "56": "biology_laboratory",
+  "57": "boardwalk",
+  "58": "boat_deck",
+  "59": "boathouse",
+  "60": "bookstore",
+  "61": "indoor",
+  "62": "botanical_garden",
+  "63": "indoor",
+  "64": "bowling_alley",
+  "65": "boxing_ring",
+  "66": "bridge",
+  "67": "building_facade",
+  "68": "bullring",
+  "69": "burial_chamber",
+  "70": "bus_interior",
+  "71": "indoor",
+  "72": "butchers_shop",
+  "73": "butte",
+  "74": "outdoor",
+  "75": "cafeteria",
+  "76": "campsite",
+  "77": "campus",
+  "78": "natural",
+  "79": "urban",
+  "80": "candy_store",
+  "81": "canyon",
+  "82": "car_interior",
+  "83": "carrousel",
+  "84": "castle",
+  "85": "catacomb",
+  "86": "cemetery",
+  "87": "chalet",
+  "88": "chemistry_lab",
+  "89": "childs_room",
+  "90": "indoor",
+  "91": "outdoor",
+  "92": "classroom",
+  "93": "clean_room",
+  "94": "cliff",
+  "95": "closet",
+  "96": "clothing_store",
+  "97": "coast",
+  "98": "cockpit",
+  "99": "coffee_shop",
+  "100": "computer_room",
+  "101": "conference_center",
+  "102": "conference_room",
+  "103": "construction_site",
+  "104": "corn_field",
+  "105": "corral",
+  "106": "corridor",
+  "107": "cottage",
+  "108": "courthouse",
  "109": "courtyard",
-  "110": "covered_bridge_exterior",
-  "111": "creek",
-  "112": "crevasse",
-  "113": "crosswalk",
-  "114": "cubicle_office",
-  "115": "dam",
-  "116": "daycare_center",
-  "117": "delicatessen",
-  "118": "dentists_office",
-  "119": "desert_sand",
-  "120": "desert_vegetation",
-  "121": "diner_indoor",
-  "122": "diner_outdoor",
-  "123": "dinette_home",
-  "124": "dinette_vehicle",
-  "125": "dining_car",
-  "126": "dining_room",
-  "127": "discotheque",
-  "128": "dock",
-  "129": "doorway_indoor",
-  "130": "doorway_outdoor",
-  "131": "dorm_room",
-  "132": "driveway",
-  "133": "driving_range_outdoor",
-  "134": "drugstore",
-  "135": "electrical_substation",
-  "136": "elevator_door",
-  "137": "elevator_escalator",
-  "138": "elevator_interior",
-  "139": "engine_room",
-  "140": "escalator_indoor",
-  "141": "excavation",
-  "142": "factory_indoor",
-  "143": "fairway",
-  "144": "fastfood_restaurant",
-  "145": "field_cultivated",
-  "146": "field_wild",
-  "147": "fire_escape",
-  "148": "fire_station",
-  "149": "firing_range_indoor",
-  "150": "fishpond",
-  "151": "florist_shop_indoor",
-  "152": "food_court",
-  "153": "forest_broadleaf",
-  "154": "forest_needleleaf",
-  "155": "forest_path",
-  "156": "forest_road",
-  "157": "formal_garden",
-  "158": "fountain",
-  "159": "galley",
-  "160": "game_room",
-  "161": "garage_indoor",
-  "162": "garage_outdoor",
-  "163": "garbage_dump",
-  "164": "gas_station",
-  "165": "gazebo_exterior",
-  "166": "general_store_indoor",
-  "167": "general_store_outdoor",
-  "168": "gift_shop",
-  "169": "golf_course",
-  "170": "greenhouse_indoor",
-  "171": "greenhouse_outdoor",
-  "172": "gymnasium_indoor",
-  "173": "hangar_indoor",
-  "174": "hangar_outdoor",
-  "175": "harbor",
-  "176": "hardware_store",
-  "177": "hayfield",
-  "178": "heliport",
-  "179": "herb_garden",
-  "180": "highway",
-  "181": "hill",
-  "182": "home_office",
-  "183": "hospital",
-  "184": "hospital_room",
-  "185": "hot_spring",
-  "186": "hot_tub_outdoor",
-  "187": "hotel",
-  "188": "hotel_outdoor",
-  "189": "hotel_room",
-  "190": "house",
-  "191": "hunting_lodge_outdoor",
-  "192": "ice_cream_parlor",
-  "193": "ice_floe",
-  "194": "ice_shelf",
-  "195": "ice_skating_rink_indoor",
-  "196": "ice_skating_rink_outdoor",
-  "197": "iceberg",
-  "198": "igloo",
-  "199": "industrial_area",
-  "200": "inn_outdoor",
-  "201": "islet",
-  "202": "jacuzzi_indoor",
-  "203": "jail_cell",
-  "204": "jail_indoor",
-  "205": "jewelry_shop",
-  "206": "kasbah",
-  "207": "kennel_indoor",
-  "208": "kennel_outdoor",
-  "209": "kindergarden_classroom",
-  "210": "kitchen",
-  "211": "kitchenette",
-  "212": "labyrinth_outdoor",
-  "213": "lake_natural",
-  "214": "landfill",
-  "215": "landing_deck",
-  "216": "laundromat",
-  "217": "lecture_room",
-  "218": "library_indoor",
-  "219": "library_outdoor",
-  "220": "lido_deck_outdoor",
-  "221": "lift_bridge",
-  "222": "lighthouse",
-  "223": "limousine_interior",
-  "224": "living_room",
-  "225": "loading_dock",
-  "226": "lobby",
-  "227": "lock_chamber",
-  "228": "locker_room",
-  "229": "mansion",
-  "230": "manufactured_home",
-  "231": "market_indoor",
-  "232": "market_outdoor",
-  "233": "marsh",
-  "234": "martial_arts_gym",
-  "235": "mausoleum",
-  "236": "medina",
-  "237": "moat_water",
-  "238": "monastery_outdoor",
-  "239": "mosque_indoor",
-  "240": "mosque_outdoor",
-  "241": "motel",
-  "242": "mountain",
-  "243": "mountain_path",
-  "244": "mountain_snowy",
-  "245": "movie_theater_indoor",
-  "246": "museum_indoor",
-  "247": "museum_outdoor",
-  "248": "music_store",
-  "249": "music_studio",
-  "250": "nuclear_power_plant_outdoor",
-  "251": "nursery",
-  "252": "oast_house",
-  "253": "observatory_indoor",
-  "254": "observatory_outdoor",
-  "255": "ocean",
-  "256": "office",
-  "257": "office_building",
-  "258": "office_cubicles",
-  "259": "oil_refinery_outdoor",
-  "260": "oilrig",
-  "261": "operating_room",
-  "262": "orchard",
-  "263": "outhouse_outdoor",
-  "264": "pagoda",
-  "265": "palace",
-  "266": "pantry",
-  "267": "park",
-  "268": "parking_garage_indoor",
-  "269": "parking_garage_outdoor",
-  "270": "parking_lot",
-  "271": "parlor",
-  "272": "pasture",
-  "273": "patio",
-  "274": "pavilion",
-  "275": "pharmacy",
-  "276": "phone_booth",
-  "277": "physics_laboratory",
-  "278": "picnic_area",
-  "279": "pilothouse_indoor",
-  "280": "planetarium_indoor",
-  "281": "playground",
-  "282": "playroom",
-  "283": "plaza",
-  "284": "podium_indoor",
-  "285": "podium_outdoor",
-  "286": "pond",
-  "287": "poolroom_home",
-  "288": "poolroom_establishment",
-  "289": "power_plant_outdoor",
-  "290": "promenade_deck",
-  "291": "pub_indoor",
-  "292": "pulpit",
-  "293": "putting_green",
-  "294": "racecourse",
-  "295": "raceway",
-  "296": "raft",
-  "297": "railroad_track",
-  "298": "rainforest",
-  "299": "reception",
-  "300": "recreation_room",
-  "301": "residential_neighborhood",
-  "302": "restaurant",
-  "303": "restaurant_kitchen",
-  "304": "restaurant_patio",
-  "305": "rice_paddy",
-  "306": "riding_arena",
-  "307": "river",
-  "308": "rock_arch",
-  "309": "rope_bridge",
-  "310": "ruin",
-  "311": "runway",
-  "312": "sandbar",
-  "313": "sandbox",
-  "314": "sauna",
-  "315": "schoolhouse",
-  "316": "sea_cliff",
-  "317": "server_room",
-  "318": "shed",
-  "319": "shoe_shop",
-  "320": "shop_front",
-  "321": "shopping_mall_indoor",
-  "322": "shower",
-  "323": "skatepark",
-  "324": "ski_resort",
-  "325": "ski_slope",
-  "326": "sky",
-  "327": "skyscraper",
-  "328": "slum",
-  "329": "snowfield",
-  "330": "squash_court",
-  "331": "stable",
-  "332": "stadium_baseball",
-  "333": "stadium_football",
-  "334": "staircase",
-  "335": "street",
-  "336": "subway_interior",
-  "337": "subway_station_platform",
-  "338": "supermarket",
-  "339": "sushi_bar",
-  "340": "swamp",
-  "341": "swimming_hole",
-  "342": "swimming_pool_indoor",
-  "343": "swimming_pool_outdoor",
-  "344": "synagogue_indoor",
-  "345": "synagogue_outdoor",
-  "346": "television_room",
-  "347": "television_studio",
-  "348": "temple_asia",
-  "349": "temple_europe",
-  "350": "trench",
-  "351": "underwater_coral_reef",
-  "352": "utility_room",
-  "353": "valley",
-  "354": "van_interior",
-  "355": "vegetable_garden",
-  "356": "veranda",
-  "357": "veterinarians_office",
-  "358": "viaduct",
-  "359": "videostore",
-  "360": "village",
-  "361": "vineyard",
-  "362": "volcano",
-  "363": "volleyball_court_indoor",
-  "364": "volleyball_court_outdoor",
-  "365": "waiting_room",
-  "366": "warehouse_indoor",
-  "367": "water_tower",
-  "368": "waterfall_block",
-  "369": "waterfall_fan",
-  "370": "waterfall_plunge",
-  "371": "wetland",
-  "372": "wheat_field",
-  "373": "wind_farm",
-  "374": "windmill",
-  "375": "wine_cellar_barrel_storage",
-  "376": "wine_cellar_bottle_storage",
-  "377": "wrestling_ring_indoor",
-  "378": "yard",
-  "379": "youth_hostel"
+  "110": "creek",
+  "111": "crevasse",
+  "112": "crosswalk",
+  "113": "dam",
+  "114": "delicatessen",
+  "115": "department_store",
+  "116": "sand",
+  "117": "vegetation",
+  "118": "desert_road",
+  "119": "outdoor",
+  "120": "dining_hall",
+  "121": "dining_room",
+  "122": "discotheque",
+  "123": "outdoor",
+  "124": "dorm_room",
+  "125": "downtown",
+  "126": "dressing_room",
+  "127": "driveway",
+  "128": "drugstore",
+  "129": "door",
+  "130": "elevator_lobby",
+  "131": "elevator_shaft",
+  "132": "embassy",
+  "133": "engine_room",
+  "134": "entrance_hall",
+  "135": "indoor",
+  "136": "excavation",
+  "137": "fabric_store",
+  "138": "farm",
+  "139": "fastfood_restaurant",
+  "140": "cultivated",
+  "141": "wild",
+  "142": "field_road",
+  "143": "fire_escape",
+  "144": "fire_station",
+  "145": "fishpond",
+  "146": "indoor",
+  "147": "indoor",
+  "148": "food_court",
+  "149": "football_field",
+  "150": "broadleaf",
+  "151": "forest_path",
+  "152": "forest_road",
+  "153": "formal_garden",
+  "154": "fountain",
+  "155": "galley",
+  "156": "indoor",
+  "157": "outdoor",
+  "158": "gas_station",
+  "159": "exterior",
+  "160": "indoor",
+  "161": "outdoor",
+  "162": "gift_shop",
+  "163": "glacier",
+  "164": "golf_course",
+  "165": "indoor",
+  "166": "outdoor",
+  "167": "grotto",
+  "168": "indoor",
+  "169": "indoor",
+  "170": "outdoor",
+  "171": "harbor",
+  "172": "hardware_store",
+  "173": "hayfield",
+  "174": "heliport",
+  "175": "highway",
+  "176": "home_office",
+  "177": "home_theater",
+  "178": "hospital",
+  "179": "hospital_room",
+  "180": "hot_spring",
+  "181": "outdoor",
+  "182": "hotel_room",
+  "183": "house",
+  "184": "outdoor",
+  "185": "ice_cream_parlor",
+  "186": "ice_floe",
+  "187": "ice_shelf",
+  "188": "indoor",
+  "189": "outdoor",
+  "190": "iceberg",
+  "191": "igloo",
+  "192": "industrial_area",
+  "193": "outdoor",
+  "194": "islet",
+  "195": "indoor",
+  "196": "jail_cell",
+  "197": "japanese_garden",
+  "198": "jewelry_shop",
+  "199": "junkyard",
+  "200": "kasbah",
+  "201": "outdoor",
+  "202": "kindergarden_classroom",
+  "203": "kitchen",
+  "204": "lagoon",
+  "205": "natural",
+  "206": "landfill",
+  "207": "landing_deck",
+  "208": "laundromat",
+  "209": "lawn",
+  "210": "lecture_room",
+  "211": "legislative_chamber",
+  "212": "indoor",
+  "213": "outdoor",
+  "214": "lighthouse",
+  "215": "living_room",
+  "216": "loading_dock",
+  "217": "lobby",
+  "218": "lock_chamber",
+  "219": "locker_room",
+  "220": "mansion",
+  "221": "manufactured_home",
+  "222": "indoor",
+  "223": "outdoor",
+  "224": "marsh",
+  "225": "martial_arts_gym",
+  "226": "mausoleum",
+  "227": "medina",
+  "228": "mezzanine",
+  "229": "water",
+  "230": "outdoor",
+  "231": "motel",
+  "232": "mountain",
+  "233": "mountain_path",
+  "234": "mountain_snowy",
+  "235": "indoor",
+  "236": "indoor",
+  "237": "outdoor",
+  "238": "music_studio",
+  "239": "natural_history_museum",
+  "240": "nursery",
+  "241": "nursing_home",
+  "242": "oast_house",
+  "243": "ocean",
+  "244": "office",
+  "245": "office_building",
+  "246": "office_cubicles",
+  "247": "oilrig",
+  "248": "operating_room",
+  "249": "orchard",
+  "250": "orchestra_pit",
+  "251": "pagoda",
+  "252": "palace",
+  "253": "pantry",
+  "254": "park",
+  "255": "indoor",
+  "256": "outdoor",
+  "257": "parking_lot",
+  "258": "pasture",
+  "259": "patio",
+  "260": "pavilion",
+  "261": "pet_shop",
+  "262": "pharmacy",
+  "263": "phone_booth",
+  "264": "physics_laboratory",
+  "265": "picnic_area",
+  "266": "pier",
+  "267": "pizzeria",
+  "268": "playground",
+  "269": "playroom",
+  "270": "plaza",
+  "271": "pond",
+  "272": "porch",
+  "273": "promenade",
+  "274": "indoor",
+  "275": "racecourse",
+  "276": "raceway",
+  "277": "raft",
+  "278": "railroad_track",
+  "279": "rainforest",
+  "280": "reception",
+  "281": "recreation_room",
+  "282": "repair_shop",
+  "283": "residential_neighborhood",
+  "284": "restaurant",
+  "285": "restaurant_kitchen",
+  "286": "restaurant_patio",
+  "287": "rice_paddy",
+  "288": "river",
+  "289": "rock_arch",
+  "290": "roof_garden",
+  "291": "rope_bridge",
+  "292": "ruin",
+  "293": "runway",
+  "294": "sandbox",
+  "295": "sauna",
+  "296": "schoolhouse",
+  "297": "science_museum",
+  "298": "server_room",
+  "299": "shed",
+  "300": "shoe_shop",
+  "301": "shopfront",
+  "302": "indoor",
+  "303": "shower",
+  "304": "ski_resort",
+  "305": "ski_slope",
+  "306": "sky",
+  "307": "skyscraper",
+  "308": "slum",
+  "309": "snowfield",
+  "310": "soccer_field",
+  "311": "stable",
+  "312": "baseball",
+  "313": "football",
+  "314": "soccer",
+  "315": "indoor",
+  "316": "outdoor",
+  "317": "staircase",
+  "318": "storage_room",
+  "319": "street",
+  "320": "platform",
+  "321": "supermarket",
+  "322": "sushi_bar",
+  "323": "swamp",
+  "324": "swimming_hole",
+  "325": "indoor",
+  "326": "outdoor",
+  "327": "outdoor",
+  "328": "television_room",
+  "329": "television_studio",
+  "330": "asia",
+  "331": "throne_room",
+  "332": "ticket_booth",
+  "333": "topiary_garden",
+  "334": "tower",
+  "335": "toyshop",
+  "336": "train_interior",
+  "337": "platform",
+  "338": "tree_farm",
+  "339": "tree_house",
+  "340": "trench",
+  "341": "tundra",
+  "342": "ocean_deep",
+  "343": "utility_room",
+  "344": "valley",
+  "345": "vegetable_garden",
+  "346": "veterinarians_office",
+  "347": "viaduct",
+  "348": "village",
+  "349": "vineyard",
+  "350": "volcano",
+  "351": "outdoor",
+  "352": "waiting_room",
+  "353": "water_park",
+  "354": "water_tower",
+  "355": "waterfall",
+  "356": "watering_hole",
+  "357": "wave",
+  "358": "wet_bar",
+  "359": "wheat_field",
+  "360": "wind_farm",
+  "361": "windmill",
+  "362": "yard",
+  "363": "youth_hostel",
+  "364": "zen_garden"
 }
@@ -162,9 +162,13 @@ class SceneClassifier:
            model_path: Core ML 模型路徑 (可選)
        """
        self.model_path = model_path
+        self.places365_model_path = (
+            "/Users/accusys/momentry/models/resnet18_places365.pth.tar"
+        )
        self.model = None
        self.coreml_model = None
        self.transform = None
+        self.model_type = "unknown"

        # 圖像預處理
        self.transform = transforms.Compose(
@@ -189,23 +193,57 @@ class SceneClassifier:
            try:
                print(f"[SCENE] Loading Core ML model: {self.model_path}")
                self.coreml_model = ct.models.MLModel(self.model_path)
+                self.model_type = "coreml"
                print("[SCENE] Core ML model loaded successfully")
                return True
            except Exception as e:
                print(f"[SCENE] Warning: Failed to load Core ML model: {e}")

-        # 備案：使用 PyTorch + ResNet
+        # Fallback: use PyTorch + Places365
        if HAS_TORCH:
            try:
                print(f"[SCENE] Loading PyTorch model on {DEVICE}")
-                # 使用預訓練的 ResNet18
-                self.model = models.resnet18(pretrained=True)
+
+                # Check whether the Places365 checkpoint file exists
+                if Path(self.places365_model_path).exists():
+                    print(
+                        f"[SCENE] Loading Places365 model: {self.places365_model_path}"
+                    )
+                    checkpoint = torch.load(
+                        self.places365_model_path, map_location=DEVICE
+                    )
+
+                    # Build a ResNet18 model (Places365 has 365 classes)
+                    self.model = models.resnet18(num_classes=365)
+
+                    # 移除 'module.' prefix (DataParallel training)
+                    state_dict = checkpoint["state_dict"]
+                    new_state_dict = {}
+                    for k, v in state_dict.items():
+                        if k.startswith("module."):
+                            new_state_dict[k[7:]] = v
+                        else:
+                            new_state_dict[k] = v
+
+                    self.model.load_state_dict(new_state_dict)
+                    self.model_type = "places365"
+                    print("[SCENE] Places365 model loaded successfully (365 classes)")
+                else:
+                    print(
+                        f"[SCENE] Places365 model not found, using ImageNet pretrained"
+                    )
+                    self.model = models.resnet18(pretrained=True)
+                    self.model_type = "imagenet"
+
                self.model.to(DEVICE)
                self.model.eval()
                print("[SCENE] PyTorch model loaded successfully")
                return True
            except Exception as e:
                print(f"[SCENE] Warning: Failed to load PyTorch model: {e}")
+                import traceback
+
+                traceback.print_exc()

        print("[SCENE] Error: No model available")
        return False
@@ -1,12 +1,8 @@
 #!/opt/homebrew/bin/python3.11
 """
 Story Processor - Generate parent-child chunk hierarchy for RAG
-Uses video analysis (ASR, YOLO, OCR) to create parent chunks that summarize child chunks.
-
-Parent-Child Chunk Strategy:
- Parent chunks: Summarize multiple scenes/segments with narrative description
- Child chunks: Individual ASR segments, OCR texts, detected objects
- When embedding: Parent description + Child content for better retrieval
+Uses LOCAL video analysis (ASR, YOLO, OCR, Scene) to create parent chunks.
+NO cloud API calls - fully offline processing
 """

 import sys
@@ -47,57 +43,59 @@ def generate_parent_child_chunks(
    cut_data: Dict,
    yolo_data: Dict,
    ocr_data: Dict,
+    scene_data: Dict,
    parent_chunk_size: int = 5,
-) -> Dict[str, Any]:
+) -> Dict:
    """
-    Generate parent-child chunk hierarchy.
-
-    Parent chunks summarize multiple child chunks for better RAG retrieval.
-    Child chunks are individual segments from ASR, scenes from CUT, etc.
+    Generate parent-child chunk hierarchy using LOCAL data only.
+    No LLM/API calls - uses template-based narrative generation.
    """
-
    child_chunks = []
    parent_chunks = []

-    # Get source data
-    asr_segments = asr_data.get("segments", [])
-    cut_scenes = cut_data.get("scenes", [])
-    yolo_frames = yolo_data.get("frames", [])
-    _ocr_frames = ocr_data.get("frames", [])
-
-    # Create child chunks from ASR segments
-    asr_child_ids = []
-    for i, seg in enumerate(asr_segments):
-        child_chunk = {
-            "chunk_id": f"asr_{i:04d}",
-            "chunk_type": "sentence",
-            "source": "asr",
-            "start_time": seg.get("start", 0),
-            "end_time": seg.get("end", 0),
-            "text_content": seg.get("text", ""),
-            "content": seg,
-            "child_chunk_ids": [],
-            "parent_chunk_id": None,
-        }
-        child_chunks.append(child_chunk)
-        asr_child_ids.append(child_chunk["chunk_id"])
+    # Create child chunks from ASR
+    for seg in asr_data.get("segments", []):
+        child_chunks.append(
+            {
+                "chunk_id": f"asr_{seg.get('start', 0):.1f}_{seg.get('end', 0):.1f}",
+                "chunk_type": "asr",
+                "source": "asr",
+                "start_time": seg.get("start", 0),
+                "end_time": seg.get("end", 0),
+                "text_content": seg.get("text", ""),
+                "content": {
+                    "text": seg.get("text", ""),
+                    "confidence": seg.get("confidence", 0),
+                },
+                "child_chunk_ids": [],
+                "parent_chunk_id": None,
+            }
+        )

    # Create child chunks from CUT scenes
-    cut_child_ids = []
-    for i, scene in enumerate(cut_scenes):
-        child_chunk = {
-            "chunk_id": f"cut_{i:04d}",
-            "chunk_type": "cut",
-            "source": "cut",
-            "start_time": scene.get("start_time", scene.get("start", 0)),
-            "end_time": scene.get("end_time", scene.get("end", 0)),
-            "text_content": None,
-            "content": scene,
-            "child_chunk_ids": [],
-            "parent_chunk_id": None,
-        }
-        child_chunks.append(child_chunk)
-        cut_child_ids.append(child_chunk["chunk_id"])
+    for scene in cut_data.get("scenes", []):
+        child_chunks.append(
+            {
+                "chunk_id": f"cut_{scene.get('scene_number', 0)}",
+                "chunk_type": "cut",
+                "source": "cut",
+                "start_time": scene.get("start_time", 0),
+                "end_time": scene.get("end_time", 0),
+                "text_content": f"Scene {scene.get('scene_number', 0)}",
+                "content": {
+                    "scene_number": scene.get("scene_number", 0),
+                    "duration": scene.get("duration", 0),
+                },
+                "child_chunk_ids": [],
+                "parent_chunk_id": None,
+            }
+        )
+
+    asr_child_ids = [c["chunk_id"] for c in child_chunks if c["source"] == "asr"]
+    cut_child_ids = [c["chunk_id"] for c in child_chunks if c["source"] == "cut"]
+
+    yolo_frames = yolo_data.get("frames", [])
+    ocr_frames = ocr_data.get("frames", [])

    # Group ASR segments into parent chunks
    for i in range(0, len(asr_child_ids), parent_chunk_size):
@@ -105,7 +103,6 @@ def generate_parent_child_chunks(
        if not batch:
            continue

-        # Collect text from child chunks
        batch_texts = []
        batch_objects = []
        batch_times = []
@@ -118,11 +115,16 @@ def generate_parent_child_chunks(
                    batch_times.append((child["start_time"], child["end_time"]))
                    break

-        # Create parent chunk with narrative description
        start_time = batch_times[0][0] if batch_times else 0
        end_time = batch_times[-1][1] if batch_times else 0

-        # Generate narrative description
+        # Find objects in this time range
+        for frame in yolo_frames[:50]:
+            ts = frame.get("timestamp", 0)
+            if start_time <= ts <= end_time:
+                for obj in frame.get("objects", []):
+                    batch_objects.append(obj.get("class_name", "unknown"))
+
        narrative = generate_narrative(batch_texts, batch_objects, start_time, end_time)

        parent_chunk = {
@@ -136,13 +138,13 @@ def generate_parent_child_chunks(
                "description": narrative,
                "child_count": len(batch),
                "speech_preview": " ".join(batch_texts[:3]) if batch_texts else None,
+                "detected_objects": list(set(batch_objects))[:5],
            },
            "child_chunk_ids": batch,
            "parent_chunk_id": None,
        }
        parent_chunks.append(parent_chunk)

-        # Update child chunks with parent reference
        for child_id in batch:
            for child in child_chunks:
                if child["chunk_id"] == child_id:
@@ -167,14 +169,12 @@ def generate_parent_child_chunks(
        start_time = batch_times[0][0] if batch_times else 0
        end_time = batch_times[-1][1] if batch_times else 0

-        # Find objects in this time range from YOLO
-        for frame in yolo_frames[:100]:  # Sample frames
+        for frame in yolo_frames[:50]:
            ts = frame.get("timestamp", 0)
            if start_time <= ts <= end_time:
                for obj in frame.get("objects", []):
                    batch_objects.append(obj.get("class_name", "unknown"))

-        # Generate scene narrative
        narrative = generate_scene_narrative(
            batch_objects, start_time, end_time, len(batch)
        )
@@ -190,14 +190,13 @@ def generate_parent_child_chunks(
                "description": narrative,
                "child_count": len(batch),
                "scenes": batch,
-                "detected_objects": list(set(batch_objects))[:10],
+                "detected_objects": list(set(batch_objects))[:5],
            },
            "child_chunk_ids": batch,
            "parent_chunk_id": None,
        }
        parent_chunks.append(parent_chunk)

-        # Update child chunks with parent reference
        for child_id in batch:
            for child in child_chunks:
                if child["chunk_id"] == child_id:
@@ -219,27 +218,33 @@ def generate_parent_child_chunks(
 def generate_narrative(
     texts: List[str], objects: List[str], start: float, end: float
 ) -> str:
-    """Generate narrative description from text snippets"""
-    if not texts:
+    """Summarize a segment as [start-end] plus Speech/Visuals parts built from local ASR text and object labels."""
+    if not texts and not objects:
         return f"Video segment from {start:.1f}s to {end:.1f}s"

-    # Combine and summarize
-    combined = " ".join(texts)
-    if len(combined) > 200:
-        combined = combined[:200] + "..."
+    parts = []
+    if texts:
+        combined = " ".join(texts[:5])
+        if len(combined) > 150:
+            combined = combined[:150] + "..."
+        parts.append(f"Speech: {combined}")

-    return f"[{start:.0f}s-{end:.0f}s] {combined}"
+    if objects:
+        unique_objs = list(set(objects))[:5]
+        parts.append(f"Visuals: {', '.join(unique_objs)}")
+
+    return f"[{start:.0f}s-{end:.0f}s] {' | '.join(parts)}"


 def generate_scene_narrative(
     objects: List[str], start: float, end: float, scene_count: int
 ) -> str:
-    """Generate scene narrative from detected objects"""
+    """Summarize a scene batch as [start-end], its scene count and up to five distinct object labels."""
     unique_objects = list(set(objects))[:5]

     if unique_objects:
         obj_str = ", ".join(unique_objects)
-        return f"[{start:.0f}s-{end:.0f}s] Scenes {scene_count} segments. Visual: {obj_str}."
+        return f"[{start:.0f}s-{end:.0f}s] {scene_count} scenes. Visuals: {obj_str}."
     else:
         return f"[{start:.0f}s-{end:.0f}s] {scene_count} video scenes."

@@ -251,70 +256,45 @@ def run_story(
    if publisher:
        publisher.info("story", "STORY_START")

-    # Load existing JSON files
    base_path = os.path.dirname(output_path)
    uuid_name = os.path.basename(output_path).split(".")[0]

-    # Load analysis data
    asr_data = {"segments": []}
    cut_data = {"scenes": []}
    yolo_data = {"frames": []}
    ocr_data = {"frames": []}
+    scene_data = {"scenes": []}

-    # Load ASR
-    asr_path = os.path.join(base_path, f"{uuid_name}.asr.json")
-    if os.path.exists(asr_path):
-        with open(asr_path) as f:
-            asr_data = json.load(f)
-        if publisher:
-            publisher.info(
-                "story", f"Loaded ASR: {len(asr_data.get('segments', []))} segments"
-            )
+    for name, data_var in [
+        ("asr", asr_data),
+        ("cut", cut_data),
+        ("yolo", yolo_data),
+        ("ocr", ocr_data),
+        ("scene", scene_data),
+    ]:
+        path = os.path.join(base_path, f"{uuid_name}.{name}.json")
+        if os.path.exists(path):
+            with open(path) as f:
+                data_var.update(json.load(f))

-    # Load CUT
-    cut_path = os.path.join(base_path, f"{uuid_name}.cut.json")
-    if os.path.exists(cut_path):
-        with open(cut_path) as f:
-            cut_data = json.load(f)
-        if publisher:
-            publisher.info(
-                "story", f"Loaded CUT: {len(cut_data.get('scenes', []))} scenes"
-            )
-
-    # Load YOLO
-    yolo_path = os.path.join(base_path, f"{uuid_name}.yolo.json")
-    if os.path.exists(yolo_path):
-        with open(yolo_path) as f:
-            yolo_data = json.load(f)
-
-    # Load OCR
-    ocr_path = os.path.join(base_path, f"{uuid_name}.ocr.json")
-    if os.path.exists(ocr_path):
-        with open(ocr_path) as f:
-            ocr_data = json.load(f)
-
-    # Load metadata
-    metadata = extract_video_metadata(video_path)
-
-    if publisher:
-        publisher.info("story", "Generating parent-child chunks...")
-
-    # Generate parent-child hierarchy
    result = generate_parent_child_chunks(
-        asr_data, cut_data, yolo_data, ocr_data, parent_chunk_size
+        asr_data, cut_data, yolo_data, ocr_data, scene_data, parent_chunk_size
    )

-    result["metadata"] = metadata
-    result["parent_chunk_size"] = parent_chunk_size
+    result["video_metadata"] = extract_video_metadata(video_path)
+    result["processing"] = {
+        "method": "local_aggregation",
+        "cloud_api_used": False,
+        "parent_chunk_size": parent_chunk_size,
+    }

    with open(output_path, "w") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    if publisher:
-        stats = result["stats"]
        publisher.complete(
            "story",
-            f"{stats['total_parent_chunks']} parents, {stats['total_child_chunks']} children",
+            f"{result['stats']['total_parent_chunks']} parent, {result['stats']['total_child_chunks']} child chunks (LOCAL)",
        )

    return result
@@ -322,7 +302,7 @@ def run_story(

 if __name__ == "__main__":
    parser = argparse.ArgumentParser(
-        description="Video Story Generator - Parent-Child Chunks"
+        description="Story Processor - Parent-Child Chunk Hierarchy (LOCAL ONLY)"
    )
    parser.add_argument("video_path", help="Path to video file")
    parser.add_argument("output_path", help="Output JSON path")
@@ -331,7 +311,7 @@ if __name__ == "__main__":
        "--parent-chunk-size",
        type=int,
        default=5,
-        help="Number of child chunks per parent chunk",
+        help="Number of child chunks per parent",
    )

    args = parser.parse_args()
@@ -340,6 +320,6 @@ if __name__ == "__main__":
        args.video_path, args.output_path, args.uuid, args.parent_chunk_size
    )
    print(
-        f"Story generated: {result['stats']['total_parent_chunks']} parent chunks, "
-        f"{result['stats']['total_child_chunks']} child chunks"
+        f"Story generated: {result['stats']['total_parent_chunks']} parent, "
+        f"{result['stats']['total_child_chunks']} child chunks (LOCAL)"
    )