- Update ASR, face, OCR, pose processors - Add release pre-flight check script - Add synonym generation, chunk processing scripts - Add face recognition, stamp search utilities
229 lines
6.8 KiB
Python
229 lines
6.8 KiB
Python
#!/opt/homebrew/bin/python3.11
|
|
"""
|
|
Regenerate ALL parent chunks for 384b0ff44aaaa1f1 using gemma4
|
|
Groups ASR chunks into ~17 logical scenes and generates summaries.
|
|
"""
|
|
|
|
import json
|
|
import subprocess
|
|
import psycopg2
|
|
import psycopg2.extras
|
|
|
|
# Keyword arguments unpacked into psycopg2.connect() by every DB helper below.
DB_CONFIG = dict(host="localhost", user="accusys", dbname="momentry")

# Value matched against the `uuid` column of `chunks` / `parent_chunks`.
UUID = "384b0ff44aaaa1f1"

# Ollama generate endpoint that call_gemma4() posts to, and the model it names.
OLLAMA_URL = "http://localhost:11434/api/generate"
MODEL = "gemma4:latest"

# Target ~17 scenes across 6865s = ~400s per scene.
# Scenes are not cut at fixed intervals, though: the split points are the
# largest natural breaks (gaps in dialogue) between consecutive chunks.
SCENE_TARGET_COUNT = 17
|
|
|
|
|
|
def get_chunks():
    """Fetch all sentence chunks for ``UUID``, ordered by start time.

    Returns:
        A list of dict-like rows (``RealDictCursor``) with the keys ``id``,
        ``chunk_id``, ``start_time``, ``end_time``, ``start_frame``,
        ``end_frame``, ``text_content`` and ``fps``.

    Raises:
        psycopg2.Error: If connecting or querying fails. The connection is
            closed before the error propagates.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # The cursor context manager closes the cursor even if the query raises.
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(
                """
                SELECT id, chunk_id, start_time, end_time, start_frame, end_frame,
                       text_content, fps
                FROM chunks
                WHERE uuid = %s AND chunk_type = 'sentence'
                ORDER BY start_time
                """,
                (UUID,),
            )
            return cur.fetchall()
    finally:
        # Always release the connection; previously it leaked on any error.
        conn.close()
|
|
|
|
|
|
def call_gemma4(prompt, max_tokens=300):
    """Send ``prompt`` to the Ollama generate endpoint and return its reply.

    The request is issued through ``curl`` with streaming disabled. This is a
    best-effort call: every failure is reported on stdout and turned into an
    empty string so the caller can fall back to its own summary.

    Args:
        prompt: Full prompt text to send to the model.
        max_tokens: Value passed as the ``num_predict`` option.

    Returns:
        The stripped ``response`` field of the reply, or ``""`` when curl
        fails, times out, the reply is not valid JSON, or the server reports
        an error.
    """
    payload = {
        "model": MODEL,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.3, "num_predict": max_tokens},
    }

    try:
        resp = subprocess.run(
            ["curl", "-s", OLLAMA_URL, "-d", json.dumps(payload)],
            capture_output=True,
            text=True,
            timeout=180,
        )
    except (subprocess.TimeoutExpired, OSError) as e:
        # Timeout, or curl itself could not be started.
        print(f" ⚠️ Ollama error: {e}")
        return ""

    if resp.returncode != 0:
        # Previously this path returned "" without any diagnostic.
        print(f" ⚠️ Ollama error: curl exited with code {resp.returncode}")
        return ""

    try:
        result = json.loads(resp.stdout)
    except json.JSONDecodeError as e:
        print(f" ⚠️ Ollama error: {e}")
        return ""

    if not isinstance(result, dict):
        print(" ⚠️ Ollama error: unexpected response payload")
        return ""

    if result.get("error"):
        # Server-side failures arrive as {"error": "..."} with curl exit 0.
        print(f" ⚠️ Ollama error: {result['error']}")
        return ""

    # `or ""` covers an explicit null response field.
    return (result.get("response") or "").strip()
|
|
|
|
|
|
def find_scene_boundaries(chunks, target_count=SCENE_TARGET_COUNT):
    """Split time-ordered chunks into scenes at the widest dialogue gaps.

    The gap before chunk ``i`` is ``chunks[i]["start_time"]`` minus
    ``chunks[i - 1]["end_time"]``. The ``target_count - 1`` largest gaps
    become scene boundaries; ties keep the earlier position.

    Args:
        chunks: Sequence of mappings with ``start_time`` and ``end_time``,
            already ordered by start time.
        target_count: Desired number of scenes. Values below 1 are treated
            as 1 (no splits).

    Returns:
        A list of non-empty chunk slices in original order, or ``[]`` when
        ``chunks`` is empty. Fewer than ``target_count`` scenes are returned
        when there are not enough chunks to split.
    """
    if not chunks:
        return []

    # Clamp: a non-positive target used to produce a negative slice bound,
    # which selected nearly every gap instead of none.
    split_budget = max(target_count - 1, 0)

    def gap_before(idx):
        return chunks[idx]["start_time"] - chunks[idx - 1]["end_time"]

    # Stable descending sort, so equal gaps keep ascending index order.
    widest = sorted(range(1, len(chunks)), key=gap_before, reverse=True)
    split_indices = sorted(widest[:split_budget])

    # Consecutive boundary pairs delimit the scenes.
    bounds = [0, *split_indices, len(chunks)]
    return [chunks[lo:hi] for lo, hi in zip(bounds, bounds[1:])]
|
|
|
|
|
|
def generate_summary(scene_chunks, scene_num):
    """Produce a one-sentence summary of a scene via ``call_gemma4``.

    Args:
        scene_chunks: Chunks of the scene; each provides ``text_content``,
            and the first/last provide ``start_time`` / ``end_time``.
        scene_num: Scene index, used only in the placeholder texts.

    Returns:
        The model's summary. If the scene has no non-empty dialogue, a
        "No dialogue" placeholder; if the model returns nothing, a
        placeholder built from the first three dialogue lines.
    """
    # Keep only chunks that actually carry text.
    dialogue_lines = list(filter(None, (chunk["text_content"] for chunk in scene_chunks)))
    if not dialogue_lines:
        return f"Scene {scene_num}: No dialogue"

    # Cap the transcript at 3000 characters before it goes into the prompt.
    transcript = " ".join(dialogue_lines)
    transcript = transcript[:3000]

    first_chunk, last_chunk = scene_chunks[0], scene_chunks[-1]
    duration = last_chunk["end_time"] - first_chunk["start_time"]

    prompt = (
        "You are a professional film scene analyst. Given the following dialogue "
        "transcript from a movie scene, write a concise one-sentence English summary.\n"
        "\n"
        f"Duration: {duration:.0f} seconds\n"
        "Dialogue:\n"
        f"{transcript}\n"
        "\n"
        "Provide ONLY the summary sentence, nothing else. "
        "Focus on plot events and character actions."
    )

    summary = call_gemma4(prompt, max_tokens=250)
    if summary:
        return summary

    # Fallback: use first few words of dialogue
    preview = " ".join(dialogue_lines[:3])[:80]
    return f"Scene {scene_num}: {preview}..."
|
|
|
|
|
|
def insert_parent_chunks(scenes):
    """Insert one ``parent_chunks`` row per scene and link its child chunks.

    For each scene a summary is generated, a parent row is inserted, and the
    scene's rows in ``chunks`` get ``parent_chunk_id`` set to the new id.
    Work is committed every five scenes and after the last one.

    Args:
        scenes: List of non-empty chunk lists. Each chunk provides
            ``chunk_id``, ``text_content``, ``start_time``, ``end_time``,
            ``start_frame``, ``end_frame`` and ``fps``.

    Returns:
        The number of parent rows inserted.

    Raises:
        psycopg2.Error: On any database failure. Scenes committed before the
            failure stay in the database; the connection is always closed.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    inserted = 0
    try:
        with conn.cursor() as cur:
            for i, scene_chunks in enumerate(scenes):
                first, last = scene_chunks[0], scene_chunks[-1]
                start_time = first["start_time"]
                end_time = last["end_time"]
                start_frame = int(first["start_frame"])
                end_frame = int(last["end_frame"])
                # Fall back to 59.94 when the first chunk has no fps value.
                fps = float(first["fps"]) if first["fps"] else 59.94
                chunk_count = len(scene_chunks)

                print(
                    f" Scene {i}: {start_time:.0f}s-{end_time:.0f}s ({chunk_count} chunks, {end_time - start_time:.0f}s)"
                )

                # Generate summary
                summary = generate_summary(scene_chunks, i)
                print(f" 📝 {summary[:100]}...")

                # Insert parent chunk
                cur.execute(
                    """
                    INSERT INTO parent_chunks (
                        uuid, scene_order, start_time, end_time,
                        start_frame, end_frame, fps, summary_text,
                        metadata, rule_3_markers, created_at
                    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, NOW())
                    RETURNING id
                    """,
                    (
                        UUID,
                        i,
                        start_time,
                        end_time,
                        start_frame,
                        end_frame,
                        fps,
                        summary,
                        json.dumps({"auto_generated_by": "gemma4", "chunk_count": chunk_count}),
                        json.dumps({}),
                    ),
                )
                parent_id = cur.fetchone()[0]

                # Update chunks with parent_chunk_id
                chunk_ids = [c["chunk_id"] for c in scene_chunks]
                cur.execute(
                    """
                    UPDATE chunks
                    SET parent_chunk_id = %s::varchar
                    WHERE uuid = %s AND chunk_id = ANY(%s)
                    """,
                    (str(parent_id), UUID, chunk_ids),
                )

                inserted += 1
                # Commit in batches of five scenes, and after the final scene.
                if i % 5 == 4 or i == len(scenes) - 1:
                    conn.commit()
                    print(f" ✅ Committed scenes 0-{i}")

        conn.commit()
    finally:
        # Closing discards any uncommitted work, so a failure mid-batch leaves
        # no half-written scene behind and no leaked connection.
        conn.close()
    return inserted
|
|
|
|
|
|
def main():
    """Run the whole regeneration: fetch, split into scenes, store, verify.

    Steps:
        1. Load the sentence chunks with ``get_chunks``.
        2. Group them into scenes with ``find_scene_boundaries``.
        3. Summarise and store each scene with ``insert_parent_chunks``.
        4. Print the parent-chunk count and the number of sentence chunks
           still lacking a ``parent_chunk_id``.
    """
    print(f"🎬 Regenerating parent chunks for {UUID}")
    print(f" Using model: {MODEL}")
    print("=" * 70)

    # Step 1: Get all chunks
    print("\n📥 Fetching ASR chunks...")
    chunks = get_chunks()
    print(f" Found {len(chunks)} sentence chunks")
    if chunks:
        print(f" Time range: 0-{chunks[-1]['end_time']:.0f}s")

    # Step 2: Find scene boundaries
    print(f"\n🔍 Finding {SCENE_TARGET_COUNT} scene boundaries...")
    scenes = find_scene_boundaries(chunks, SCENE_TARGET_COUNT)
    print(f" Created {len(scenes)} scenes")
    for i, s in enumerate(scenes):
        print(
            f" Scene {i}: {s[0]['start_time']:.0f}s-{s[-1]['end_time']:.0f}s ({len(s)} chunks)"
        )

    # Step 3: Generate summaries and insert
    print("\n🤖 Generating summaries with gemma4...")
    inserted = insert_parent_chunks(scenes)

    print(f"\n{'=' * 70}")
    print(f"✅ Created {inserted} parent chunks")

    # Step 4: Verify
    print("\n📊 Verification:")
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT COUNT(*) FROM parent_chunks WHERE uuid = %s", (UUID,))
            print(f" parent_chunks: {cur.fetchone()[0]}")
            cur.execute(
                "SELECT COUNT(*) FROM chunks WHERE uuid = %s AND parent_chunk_id IS NULL AND chunk_type = 'sentence'",
                (UUID,),
            )
            print(f" orphan chunks: {cur.fetchone()[0]}")
    finally:
        # Release the connection even if a verification query fails.
        conn.close()
|
|
|
|
|
|
# Script entry point: run the pipeline only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|