Files
momentry_core/scripts/generate_parent_chunks_gemma4.py
Warren 8f05a7c188 feat: update Python processors and add utility scripts
- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00

229 lines
6.8 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
Regenerate ALL parent chunks for 384b0ff44aaaa1f1 using gemma4
Groups ASR chunks into ~17 logical scenes and generates summaries.
"""
import json
import subprocess
import psycopg2
import psycopg2.extras
# Keyword arguments passed straight to psycopg2.connect().
DB_CONFIG = {"host": "localhost", "user": "accusys", "dbname": "momentry"}
# Value matched against the ``uuid`` column in every query this script issues.
UUID = "384b0ff44aaaa1f1"
# Endpoint that call_gemma4() sends its generation requests to.
OLLAMA_URL = "http://localhost:11434/api/generate"
# Sent as the "model" field of each request payload.
MODEL = "gemma4:latest"
# Target ~17 scenes across 6865s = ~400s per scene.
# Scenes are not cut at fixed intervals, though: natural breaks (the largest
# gaps in dialogue) decide where the splits fall.
SCENE_TARGET_COUNT = 17
def get_chunks():
    """Fetch every sentence chunk for ``UUID``, ordered by start time.

    Returns:
        list: One dict-like row per chunk (``RealDictCursor``) with the keys
        ``id``, ``chunk_id``, ``start_time``, ``end_time``, ``start_frame``,
        ``end_frame``, ``text_content`` and ``fps``.

    Raises:
        psycopg2.Error: If connecting or querying fails. The connection is
            closed before the exception propagates.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # The cursor context manager closes the cursor even if the query fails.
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(
                """
                SELECT id, chunk_id, start_time, end_time, start_frame, end_frame,
                       text_content, fps
                FROM chunks
                WHERE uuid = %s AND chunk_type = 'sentence'
                ORDER BY start_time
                """,
                (UUID,),
            )
            return cur.fetchall()
    finally:
        # Explicit close: ``with conn`` would only end the transaction,
        # not release the connection.
        conn.close()
def call_gemma4(prompt, max_tokens=300):
    """Send *prompt* to the Ollama endpoint via ``curl`` and return the reply.

    This is a best-effort call: every failure is reported on stdout and
    turned into an empty string so the caller can fall back.

    Args:
        prompt: Prompt text placed in the request's ``prompt`` field.
        max_tokens: Value sent as the ``num_predict`` option.

    Returns:
        str: The stripped ``response`` field of the JSON reply, or ``""`` when
        curl fails, times out, or the reply carries no usable response.
    """
    payload = {
        "model": MODEL,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.3, "num_predict": max_tokens},
    }
    try:
        resp = subprocess.run(
            ["curl", "-s", OLLAMA_URL, "-d", json.dumps(payload)],
            capture_output=True,
            text=True,
            timeout=180,
        )
    except (subprocess.SubprocessError, OSError) as e:
        # Covers the 180s timeout and a missing/unrunnable curl binary.
        print(f" ⚠️ Ollama error: {e}")
        return ""

    if resp.returncode != 0:
        # Previously this path returned "" without any diagnostic.
        print(f" ⚠️ Ollama error: curl exited with code {resp.returncode}")
        return ""

    try:
        result = json.loads(resp.stdout)
    except ValueError as e:  # json.JSONDecodeError is a ValueError
        print(f" ⚠️ Ollama error: {e}")
        return ""

    if not isinstance(result, dict):
        print(" ⚠️ Ollama error: unexpected reply format")
        return ""
    if "error" in result:
        # ``curl -s`` exits 0 on HTTP errors, so surface the server's message.
        print(f" ⚠️ Ollama error: {result['error']}")

    response = result.get("response") or ""
    return str(response).strip()
def find_scene_boundaries(chunks, target_count=SCENE_TARGET_COUNT):
    """Split time-ordered chunks into scenes at the largest dialogue gaps.

    The gap before chunk ``i`` is ``chunks[i]["start_time"]`` minus
    ``chunks[i - 1]["end_time"]``. The ``target_count - 1`` largest gaps
    become scene boundaries.

    Args:
        chunks: Sequence of mappings with ``start_time`` and ``end_time``,
            in playback order.
        target_count: Desired number of scenes. Values below 1 are treated
            as 1 (a single scene). If there are fewer gaps than requested
            splits, every gap is used.

    Returns:
        list: Consecutive, non-empty slices of ``chunks``; ``[]`` when
        ``chunks`` is empty.
    """
    if not chunks:
        return []

    # (index of the chunk that follows the gap, gap length)
    gaps = [
        (i, chunks[i]["start_time"] - chunks[i - 1]["end_time"])
        for i in range(1, len(chunks))
    ]

    # Clamp at zero: a negative slice bound would select nearly every gap
    # instead of none.
    split_count = max(target_count - 1, 0)
    gaps.sort(key=lambda g: g[1], reverse=True)
    split_indices = sorted(index for index, _ in gaps[:split_count])

    # Pair up consecutive boundaries to cut the scenes.
    bounds = [0, *split_indices, len(chunks)]
    return [chunks[lo:hi] for lo, hi in zip(bounds, bounds[1:])]
def generate_summary(scene_chunks, scene_num):
    """Produce a one-sentence summary of a scene's dialogue using gemma4.

    Chunks with empty ``text_content`` are ignored. When no dialogue remains
    a fixed placeholder is returned; when the model returns nothing, the
    summary falls back to a short excerpt of the opening lines.
    """
    dialogue = []
    for chunk in scene_chunks:
        line = chunk["text_content"]
        if line:
            dialogue.append(line)

    if not dialogue:
        return f"Scene {scene_num}: No dialogue"

    # Cap the transcript at 3000 characters before it goes into the prompt.
    combined = " ".join(dialogue)[:3000]
    duration = scene_chunks[-1]["end_time"] - scene_chunks[0]["start_time"]

    prompt = "\n".join(
        (
            "You are a professional film scene analyst. Given the following dialogue transcript from a movie scene, write a concise one-sentence English summary.",
            f"Duration: {duration:.0f} seconds",
            "Dialogue:",
            f"{combined}",
            "Provide ONLY the summary sentence, nothing else. Focus on plot events and character actions.",
        )
    )

    summary = call_gemma4(prompt, max_tokens=250)
    if summary:
        return summary

    # Fallback: first three lines of dialogue, trimmed to 80 characters.
    excerpt = " ".join(dialogue[:3])[:80]
    return f"Scene {scene_num}: {excerpt}..."
def _insert_scene(cur, scene_order, scene_chunks):
    """Summarise one scene, insert its parent row and link its child chunks."""
    first, last = scene_chunks[0], scene_chunks[-1]
    start_time = first["start_time"]
    end_time = last["end_time"]
    start_frame = int(first["start_frame"])
    end_frame = int(last["end_frame"])
    # Fall back to 59.94 when the first chunk carries no fps value.
    fps = float(first["fps"]) if first["fps"] else 59.94
    chunk_count = len(scene_chunks)
    print(
        f" Scene {scene_order}: {start_time:.0f}s-{end_time:.0f}s ({chunk_count} chunks, {end_time - start_time:.0f}s)"
    )

    # Generate summary
    summary = generate_summary(scene_chunks, scene_order)
    print(f" 📝 {summary[:100]}...")

    # Insert parent chunk
    cur.execute(
        """
        INSERT INTO parent_chunks (
            uuid, scene_order, start_time, end_time,
            start_frame, end_frame, fps, summary_text,
            metadata, rule_3_markers, created_at
        ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, NOW())
        RETURNING id
        """,
        (
            UUID,
            scene_order,
            start_time,
            end_time,
            start_frame,
            end_frame,
            fps,
            summary,
            json.dumps({"auto_generated_by": "gemma4", "chunk_count": chunk_count}),
            json.dumps({}),
        ),
    )
    parent_id = cur.fetchone()[0]

    # Point every child chunk of this scene at the new parent row.
    chunk_ids = [c["chunk_id"] for c in scene_chunks]
    cur.execute(
        """
        UPDATE chunks
        SET parent_chunk_id = %s::varchar
        WHERE uuid = %s AND chunk_id = ANY(%s)
        """,
        (str(parent_id), UUID, chunk_ids),
    )


def insert_parent_chunks(scenes):
    """Insert one parent chunk per scene and update child relationships.

    For each scene a summary is generated, a ``parent_chunks`` row is
    inserted, and the scene's rows in ``chunks`` get ``parent_chunk_id`` set
    to the new row's id. Work is committed after every fifth scene and after
    the last one.

    Args:
        scenes: List of non-empty chunk lists, as produced by
            ``find_scene_boundaries``.

    Returns:
        int: Number of parent chunks inserted.

    Raises:
        psycopg2.Error: On database failure. The connection is closed before
            the exception propagates, which discards the uncommitted batch;
            batches committed earlier remain in place.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    inserted = 0
    try:
        with conn.cursor() as cur:
            last_index = len(scenes) - 1
            for i, scene_chunks in enumerate(scenes):
                _insert_scene(cur, i, scene_chunks)
                inserted += 1
                # Commit in batches so a late failure does not lose every
                # summary generated so far.
                if i % 5 == 4 or i == last_index:
                    conn.commit()
                    print(f" ✅ Committed scenes 0-{i}")
            conn.commit()
    finally:
        conn.close()
    return inserted
def main():
    """Run the whole pipeline: fetch chunks, split into scenes, insert, verify."""
    rule = "=" * 70

    print(f"🎬 Regenerating parent chunks for {UUID}")
    print(f" Using model: {MODEL}")
    print(rule)

    # Step 1: load the sentence-level chunks
    print("\n📥 Fetching ASR chunks...")
    sentence_chunks = get_chunks()
    print(f" Found {len(sentence_chunks)} sentence chunks")
    if sentence_chunks:
        final_end = sentence_chunks[-1]["end_time"]
        print(f" Time range: 0-{final_end:.0f}s")

    # Step 2: split into scenes and list them
    print(f"\n🔍 Finding {SCENE_TARGET_COUNT} scene boundaries...")
    scenes = find_scene_boundaries(sentence_chunks, SCENE_TARGET_COUNT)
    print(f" Created {len(scenes)} scenes")
    for index, scene in enumerate(scenes):
        opening, closing = scene[0], scene[-1]
        print(
            f" Scene {index}: {opening['start_time']:.0f}s-{closing['end_time']:.0f}s ({len(scene)} chunks)"
        )

    # Step 3: summarise each scene and write it to the database
    print("\n🤖 Generating summaries with gemma4...")
    created = insert_parent_chunks(scenes)
    print(f"\n{rule}")
    print(f"✅ Created {created} parent chunks")

    # Step 4: report row counts as a sanity check
    print("\n📊 Verification:")
    conn = psycopg2.connect(**DB_CONFIG)
    cur = conn.cursor()

    cur.execute("SELECT COUNT(*) FROM parent_chunks WHERE uuid = %s", (UUID,))
    parent_total = cur.fetchone()[0]
    print(f" parent_chunks: {parent_total}")

    cur.execute(
        "SELECT COUNT(*) FROM chunks WHERE uuid = %s AND parent_chunk_id IS NULL AND chunk_type = 'sentence'",
        (UUID,),
    )
    orphan_total = cur.fetchone()[0]
    print(f" orphan chunks: {orphan_total}")

    cur.close()
    conn.close()


# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()