# momentry_core/scripts/generate_parent_chunks_gemma4.py

#!/opt/homebrew/bin/python3.11
"""
Regenerate ALL parent chunks for 384b0ff44aaaa1f1 using gemma4
Groups ASR chunks into ~17 logical scenes and generates summaries.
"""

import json
import subprocess
import psycopg2
import psycopg2.extras

# Keyword arguments passed to psycopg2.connect() for every database connection.
DB_CONFIG = {"host": "localhost", "user": "accusys", "dbname": "momentry"}
# Value of the `uuid` column selecting the rows this script reads and writes.
UUID = "384b0ff44aaaa1f1"
# Endpoint the generation request is sent to (via curl).
OLLAMA_URL = "http://localhost:11434/api/generate"
# Model name sent in the "model" field of the request payload.
MODEL = "gemma4:latest"

# Target ~17 scenes across 6865 s of footage, i.e. ~400 s per scene on average.
# Scenes are not cut at fixed intervals: the largest gaps between consecutive
# dialogue chunks are used as the split points.
SCENE_TARGET_COUNT = 17


def get_chunks():
    """Fetch every sentence-level chunk for ``UUID``, ordered by start time.

    Returns:
        A list of dict-like rows (produced by ``RealDictCursor``) with the
        keys ``id``, ``chunk_id``, ``start_time``, ``end_time``,
        ``start_frame``, ``end_frame``, ``text_content`` and ``fps``.
        The list is empty when no matching rows exist.

    Raises:
        psycopg2.Error: If connecting or querying fails. The connection is
            always closed before the exception propagates.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # The cursor context manager closes the cursor on exit, including
        # when execute()/fetchall() raises.
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(
                """
                SELECT id, chunk_id, start_time, end_time, start_frame, end_frame,
                       text_content, fps
                FROM chunks
                WHERE uuid = %s AND chunk_type = 'sentence'
                ORDER BY start_time
            """,
                (UUID,),
            )
            return cur.fetchall()
    finally:
        # `with conn:` would only end the transaction, not close the
        # connection, so close it explicitly.
        conn.close()


def call_gemma4(prompt, max_tokens=300):
    """Send ``prompt`` to the Ollama generate endpoint and return the reply.

    The request is issued through the ``curl`` executable as a single
    non-streaming call. This function is deliberately best-effort: every
    failure is reported on stdout and turned into an empty string so the
    caller can fall back to a locally built summary.

    Args:
        prompt: Prompt text sent to the model.
        max_tokens: Upper bound on generated tokens (Ollama ``num_predict``).

    Returns:
        The stripped ``response`` text, or ``""`` on any failure (curl missing,
        timeout, non-zero curl exit, invalid JSON, or an API-level error).
    """
    payload = {
        "model": MODEL,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.3, "num_predict": max_tokens},
    }
    try:
        resp = subprocess.run(
            # -s hides the progress meter; -S still lets curl print its own
            # error message to stderr so a failure can be reported below.
            ["curl", "-sS", OLLAMA_URL, "-d", json.dumps(payload)],
            capture_output=True,
            text=True,
            timeout=180,
        )
    except (OSError, ValueError, TypeError, subprocess.SubprocessError) as e:
        # OSError: curl not installed; SubprocessError: includes the timeout;
        # ValueError/TypeError: undecodable output or unserialisable payload.
        print(f"    ⚠️  Ollama error: {e}")
        return ""

    if resp.returncode != 0:
        detail = resp.stderr.strip() or "no error output"
        print(f"    ⚠️  Ollama error: curl exited with {resp.returncode}: {detail}")
        return ""

    try:
        result = json.loads(resp.stdout)
    except ValueError as e:  # json.JSONDecodeError is a ValueError
        print(f"    ⚠️  Ollama error: {e}")
        return ""

    if not isinstance(result, dict):
        print("    ⚠️  Ollama error: unexpected response format")
        return ""

    # Without curl's --fail flag an HTTP error still exits 0, so an API
    # failure shows up here as a JSON body carrying an "error" field.
    if result.get("error"):
        print(f"    ⚠️  Ollama error: {result['error']}")
        return ""

    # Guard against an explicit null "response" value.
    return str(result.get("response") or "").strip()


def find_scene_boundaries(chunks, target_count=SCENE_TARGET_COUNT):
    """Split time-ordered chunks into scenes at the largest dialogue gaps.

    The gap in front of chunk ``i`` is
    ``chunks[i]["start_time"] - chunks[i - 1]["end_time"]``. The
    ``target_count - 1`` largest gaps become split points, so at most
    ``target_count`` scenes are produced (fewer when there are not enough
    chunks to split that often).

    Args:
        chunks: Sequence of mappings carrying ``start_time`` and ``end_time``.
            The gap computation only makes sense if they are ordered by
            start time.
        target_count: Desired number of scenes. Values below 1 are treated
            as 1, i.e. all chunks end up in a single scene.

    Returns:
        A list of non-empty, contiguous slices of ``chunks`` in their
        original order, or ``[]`` when ``chunks`` is empty.
    """
    if not chunks:
        return []

    # Clamp the number of splits: a negative slice bound would mean "all but
    # the last N gaps" and split almost everywhere for target_count <= 0.
    split_budget = max(target_count - 1, 0)

    # (index of the chunk that follows the gap, gap length)
    gaps = [
        (i, nxt["start_time"] - prev["end_time"])
        for i, (prev, nxt) in enumerate(zip(chunks, chunks[1:]), start=1)
    ]

    # Largest gaps first; the sort is stable, so ties keep chronological order.
    gaps.sort(key=lambda g: g[1], reverse=True)
    split_indices = sorted(i for i, _ in gaps[:split_budget])

    # Consecutive boundaries delimit one scene each.
    bounds = [0, *split_indices, len(chunks)]
    return [chunks[lo:hi] for lo, hi in zip(bounds, bounds[1:])]


def generate_summary(scene_chunks, scene_num):
    """Produce a one-sentence summary of a scene's dialogue via gemma4.

    Chunks without text are ignored. If no chunk carries text, a
    "No dialogue" placeholder is returned without calling the model. If the
    model returns nothing, a short excerpt of the dialogue is used instead.
    """
    texts = []
    for chunk in scene_chunks:
        spoken = chunk["text_content"]
        if spoken:
            texts.append(spoken)

    if not texts:
        return f"Scene {scene_num}: No dialogue"

    # Cap the transcript at 3000 characters before it goes into the prompt.
    joined = " ".join(texts)
    combined = joined[:3000]

    first_chunk = scene_chunks[0]
    last_chunk = scene_chunks[-1]
    duration = last_chunk["end_time"] - first_chunk["start_time"]

    prompt = f"""You are a professional film scene analyst. Given the following dialogue transcript from a movie scene, write a concise one-sentence English summary.

Duration: {duration:.0f} seconds
Dialogue:
{combined}

Provide ONLY the summary sentence, nothing else. Focus on plot events and character actions."""

    model_reply = call_gemma4(prompt, max_tokens=250)
    if model_reply:
        return model_reply

    # Fallback: first three dialogue lines joined, truncated to 80 characters.
    excerpt = " ".join(texts[:3])[:80]
    return f"Scene {scene_num}: {excerpt}..."


def insert_parent_chunks(scenes):
    """Insert one parent chunk per scene and link its child chunks to it.

    For every scene this generates a summary, inserts a row into
    ``parent_chunks`` and sets ``parent_chunk_id`` on the scene's rows in
    ``chunks``. Work is committed after every fifth scene and after the last
    one, so a failure loses at most the current uncommitted batch.

    Args:
        scenes: List of scenes, each a list of chunk rows providing
            ``chunk_id``, ``start_time``, ``end_time``, ``start_frame``,
            ``end_frame`` and ``fps``. Empty scenes are skipped.

    Returns:
        The number of parent chunks inserted.

    Raises:
        psycopg2.Error: If a statement fails. The connection is closed before
            the exception propagates, which discards uncommitted changes.
    """
    # NOTE(review): existing parent_chunks rows for UUID are not removed
    # first, so re-running this may create duplicates - confirm intended.
    conn = psycopg2.connect(**DB_CONFIG)
    inserted = 0
    try:
        with conn.cursor() as cur:
            last_index = len(scenes) - 1
            for i, scene_chunks in enumerate(scenes):
                if not scene_chunks:
                    # Nothing to summarise or link; indexing [0] would fail.
                    print(f"  Scene {i}: empty, skipped")
                    continue

                first, last = scene_chunks[0], scene_chunks[-1]
                start_time = first["start_time"]
                end_time = last["end_time"]
                start_frame = int(first["start_frame"])
                end_frame = int(last["end_frame"])
                # Fall back to 59.94 when the row has no (or a zero) fps value.
                fps = float(first["fps"]) if first["fps"] else 59.94
                chunk_count = len(scene_chunks)

                print(
                    f"  Scene {i}: {start_time:.0f}s-{end_time:.0f}s ({chunk_count} chunks, {end_time - start_time:.0f}s)"
                )

                # Generate summary
                summary = generate_summary(scene_chunks, i)
                print(f"    📝 {summary[:100]}...")

                # Insert parent chunk
                cur.execute(
                    """
                    INSERT INTO parent_chunks (
                        uuid, scene_order, start_time, end_time,
                        start_frame, end_frame, fps, summary_text,
                        metadata, rule_3_markers, created_at
                    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, NOW())
                    RETURNING id
                """,
                    (
                        UUID,
                        i,
                        start_time,
                        end_time,
                        start_frame,
                        end_frame,
                        fps,
                        summary,
                        json.dumps(
                            {"auto_generated_by": "gemma4", "chunk_count": chunk_count}
                        ),
                        json.dumps({}),
                    ),
                )
                parent_id = cur.fetchone()[0]

                # Point every child chunk of this scene at the new parent.
                # The id is passed as text because of the ::varchar cast.
                chunk_ids = [c["chunk_id"] for c in scene_chunks]
                cur.execute(
                    """
                    UPDATE chunks
                    SET parent_chunk_id = %s::varchar
                    WHERE uuid = %s AND chunk_id = ANY(%s)
                """,
                    (str(parent_id), UUID, chunk_ids),
                )

                inserted += 1
                # Commit in batches of five scenes and after the final scene.
                if i % 5 == 4 or i == last_index:
                    conn.commit()
                    print(f"    ✅ Committed scenes 0-{i}")

        # Covers the case where the trailing scene(s) were skipped as empty.
        conn.commit()
    finally:
        # Closing without a commit discards any pending changes.
        conn.close()
    return inserted


def main():
    """Run the full pipeline: fetch chunks, split scenes, summarise, verify.

    Progress is printed to stdout. The final verification step reports how
    many ``parent_chunks`` rows exist for ``UUID`` and how many sentence
    chunks still have no ``parent_chunk_id``.
    """
    print(f"🎬 Regenerating parent chunks for {UUID}")
    print(f"   Using model: {MODEL}")
    print("=" * 70)

    # Step 1: Get all chunks
    print("\n📥 Fetching ASR chunks...")
    chunks = get_chunks()
    print(f"   Found {len(chunks)} sentence chunks")
    if chunks:
        # Report the real start of the first chunk rather than assuming 0.
        print(
            f"   Time range: {chunks[0]['start_time']:.0f}-{chunks[-1]['end_time']:.0f}s"
        )

    # Step 2: Find scene boundaries
    print(f"\n🔍 Finding {SCENE_TARGET_COUNT} scene boundaries...")
    scenes = find_scene_boundaries(chunks, SCENE_TARGET_COUNT)
    print(f"   Created {len(scenes)} scenes")
    for i, s in enumerate(scenes):
        print(
            f"     Scene {i}: {s[0]['start_time']:.0f}s-{s[-1]['end_time']:.0f}s ({len(s)} chunks)"
        )

    # Step 3: Generate summaries and insert
    print("\n🤖 Generating summaries with gemma4...")
    inserted = insert_parent_chunks(scenes)

    print(f"\n{'=' * 70}")
    print(f"✅ Created {inserted} parent chunks")

    # Step 4: Verify
    print("\n📊 Verification:")
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT COUNT(*) FROM parent_chunks WHERE uuid = %s", (UUID,))
            print(f"   parent_chunks: {cur.fetchone()[0]}")
            cur.execute(
                "SELECT COUNT(*) FROM chunks WHERE uuid = %s AND parent_chunk_id IS NULL AND chunk_type = 'sentence'",
                (UUID,),
            )
            print(f"   orphan chunks: {cur.fetchone()[0]}")
    finally:
        # Always release the connection, even if a verification query fails.
        conn.close()


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()