refactor: remove face embedding architecture - single Qdrant _faces collection
- Delete FaceEmbeddingDb module (face_embedding_db.rs) - Stub match_faces_iterative, generate_seed_embeddings, tmdb_match_handler - Remove sync_trace_embeddings, populate_face_embeddings_to_qdrant - Remove embedding from face.json output (face_processor.py) - Remove embedding from PG UPDATE (store_traced_faces.py) - Remove workspace traces staging (checkin.rs, qdrant_workspace.rs) - Fix tests: add pose_angle to Face, hand_nodes to TkgResult Disabled functions (need reimplement with _faces): - match_faces_iterative (identity agent) - generate_seed_embeddings (TMDb seeds) - tmdb_match_handler (TMDb matching) - cluster_face_embeddings, search_similar_faces - merge_traces_within_cuts
This commit is contained in:
@@ -1,200 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
POC: MediaPipe Face Detection vs Apple Vision Framework vs InsightFace
|
||||
|
||||
Tests face detection on video frames and reports:
|
||||
- Detection count
|
||||
- Bounding box quality
|
||||
- Landmarks (468 face mesh)
|
||||
- Processing speed
|
||||
"""
|
||||
import sys
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import subprocess
|
||||
import argparse
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
|
||||
def extract_frames(video_path, sample_interval=30, max_frames=50):
    """Sample frames from a video into a fresh temporary directory via ffmpeg.

    ffmpeg's ``select`` filter keeps every ``sample_interval``-th frame and
    writes each one as ``frame_NNNNN.jpg``.

    Args:
        video_path: Path of the input video handed to ffmpeg.
        sample_interval: Keep one frame out of this many.
        max_frames: Upper bound on the number of frame paths returned.

    Returns:
        ``(tmpdir, paths)`` where ``tmpdir`` is the directory that was created
        (the caller is responsible for deleting it) and ``paths`` is the sorted
        list of at most ``max_frames`` JPEG paths inside it.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
        OSError: ffmpeg could not be started.
    """
    import shutil
    import tempfile

    tmpdir = tempfile.mkdtemp(prefix="face_test_")
    pattern = os.path.join(tmpdir, "frame_%05d.jpg")
    cmd = [
        "ffmpeg", "-y", "-v", "quiet", "-i", video_path,
        "-vf", f"select=not(mod(n\\,{sample_interval}))",
        "-vsync", "vfr", "-q:v", "5", pattern,
    ]
    try:
        subprocess.run(cmd, check=True)
    except BaseException:
        # The caller never receives tmpdir on failure, so nobody else could
        # clean it up; remove it here before propagating the error.
        shutil.rmtree(tmpdir, ignore_errors=True)
        raise

    files = sorted(f for f in os.listdir(tmpdir) if f.endswith(".jpg"))[:max_frames]
    return tmpdir, [os.path.join(tmpdir, f) for f in files]
|
||||
|
||||
|
||||
def test_mediapipe(frame_paths, fps):
    """Benchmark MediaPipe face detection and face landmarks on image frames.

    Runs the MediaPipe Tasks face detector over every frame and the face
    landmarker over the first 10 frames, printing a one-line summary for each
    pass. Model files live in a ``models`` directory next to this script and
    are downloaded on first use.

    Args:
        frame_paths: Paths of the image files to analyse.
        fps: Unused; kept so the signature matches ``test_vision_framework``.

    Returns:
        A dict with ``frames_processed``, ``frames_with_faces``,
        ``total_faces``, ``time_sec`` (detection pass) and
        ``landmarks_per_face``, or ``None`` when MediaPipe or OpenCV cannot be
        imported.
    """
    try:
        import cv2
        # ``mp.Image`` is needed below, so the top-level package is imported
        # here as well (the original referenced ``mp`` without importing it).
        import mediapipe as mp
        from mediapipe.tasks.python import vision
        from mediapipe.tasks.python.core.base_options import BaseOptions
    except ImportError:
        print("[MediaPipe] Not available, skipping")
        return None

    import urllib.request

    model_dir = os.path.join(os.path.dirname(__file__), "models")
    os.makedirs(model_dir, exist_ok=True)

    def ensure_model(filename, url, announce):
        """Return the local path of a model file, downloading it if absent."""
        path = os.path.join(model_dir, filename)
        if not os.path.exists(path):
            print(announce)
            urllib.request.urlretrieve(url, path)
        return path

    def load_image(path):
        """Read an image as a MediaPipe SRGB image, or None if unreadable."""
        bgr = cv2.imread(path)
        if bgr is None:
            return None
        # OpenCV decodes to BGR; the SRGB image format expects RGB order.
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        return mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb)

    # --- Face detection over all frames ---
    model_url = "https://storage.googleapis.com/mediapipe-models/face_detector/blaze_face_short_range/float16/latest/face_detector.task"
    model_path = ensure_model(
        "face_detector.task", model_url, f"[MediaPipe] Downloading model: {model_url}"
    )

    total_faces = 0
    frames_with_faces = 0

    t0 = time.time()
    detector_options = vision.FaceDetectorOptions(
        base_options=BaseOptions(model_asset_path=model_path),
        running_mode=vision.RunningMode.IMAGE,
    )
    with vision.FaceDetector.create_from_options(detector_options) as detector:
        for path in frame_paths:
            mp_img = load_image(path)
            if mp_img is None:
                continue
            detections = detector.detect(mp_img).detections
            if detections:
                frames_with_faces += 1
                total_faces += len(detections)
    elapsed = time.time() - t0
    print(f"[MediaPipe] Detection: {len(frame_paths)} frames, {frames_with_faces} with faces, {total_faces} faces, {elapsed:.2f}s")

    # --- Face landmarks over the first 10 frames only ---
    landmark_url = "https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/latest/face_landmarker.task"
    landmark_path = ensure_model(
        "face_landmarker.task", landmark_url, "[MediaPipe] Downloading landmark model..."
    )

    landmarks_total = 0
    faces_with_landmarks = 0
    landmarker_options = vision.FaceLandmarkerOptions(
        base_options=BaseOptions(model_asset_path=landmark_path),
        running_mode=vision.RunningMode.IMAGE,
        output_face_blendshapes=False,
        output_facial_transformation_matrixes=False,
    )
    with vision.FaceLandmarker.create_from_options(landmarker_options) as landmarker:
        for path in frame_paths[:10]:
            mp_img = load_image(path)
            if mp_img is None:
                continue
            for face in landmarker.detect(mp_img).face_landmarks:
                faces_with_landmarks += 1
                landmarks_total += len(face)

    # Average over every face seen in this pass; the original divided by the
    # face count of the last frame only and crashed when no frame was readable.
    per_face = landmarks_total // max(faces_with_landmarks, 1)
    print(f"[MediaPipe] Face Mesh (10 frames): {landmarks_total} total landmarks (~{per_face} per face)")

    return {
        "frames_processed": len(frame_paths),
        "frames_with_faces": frames_with_faces,
        "total_faces": total_faces,
        "time_sec": elapsed,
        # Fixed nominal value kept from the original report format; it is not
        # the measured per-face count printed above.
        "landmarks_per_face": 468,
    }
|
||||
|
||||
|
||||
def test_vision_framework(frame_paths, fps):
    """Time the Apple Vision face test by running the ``face_compare_test`` binary.

    The binary is looked up under ``swift_processors/.build/debug`` next to
    this script and is invoked once; the tail of its stdout is printed.

    Args:
        frame_paths: Paths of extracted frames; only the first one is used, to
            derive the directory argument passed to the binary.
        fps: Unused; kept so the signature matches ``test_mediapipe``.

    Returns:
        ``{"time_sec": elapsed}`` on completion, or ``None`` when the binary is
        missing, there are no frames, or the run times out.
    """
    swift_face = os.path.join(os.path.dirname(__file__),
                              "swift_processors/.build/debug/face_compare_test")

    if not os.path.exists(swift_face):
        print("[Vision] Binary not found, skipping")
        return None

    if not frame_paths:
        # Nothing to derive the input directory from.
        print("[Vision] No frames to process, skipping")
        return None

    # NOTE(review): this strips the last two path components of the first
    # frame and any "/frames" segment; the original author flagged that this
    # does not work for single files - confirm what the binary expects.
    input_dir = frame_paths[0].rsplit("/", 2)[0].replace("/frames", "")

    print(f"[Vision] Running face compare test...")
    t0 = time.time()
    try:
        result = subprocess.run(
            [swift_face, input_dir,
             "--sample-interval", "1", "--max-frames", str(len(frame_paths))],
            capture_output=True, text=True, timeout=120
        )
    except subprocess.TimeoutExpired:
        print("[Vision] Timed out after 120s, skipping")
        return None
    elapsed = time.time() - t0
    print(result.stdout[-500:])
    return {"time_sec": elapsed}
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: extract frames, run both detectors, compare.

    Parses ``video_path``, ``--sample-interval`` and ``--max-frames``, extracts
    frames into a temporary directory, runs the MediaPipe and Vision runners
    and prints a short comparison. The temporary directory is removed even if
    one of the runners raises.
    """
    import shutil

    parser = argparse.ArgumentParser()
    parser.add_argument("video_path")
    parser.add_argument("--sample-interval", type=int, default=30)
    parser.add_argument("--max-frames", type=int, default=50)
    args = parser.parse_args()

    print(f"Testing: {args.video_path}")

    # Extract frames
    tmpdir, frames = extract_frames(args.video_path, args.sample_interval, args.max_frames)
    try:
        print(f"Extracted {len(frames)} frames")

        # MediaPipe
        print("\n=== MediaPipe ===")
        mp_result = test_mediapipe(frames, 24)

        # Vision Framework (prints its own output; the return value is not
        # used in the summary below)
        print("\n=== Apple Vision Framework ===")
        test_vision_framework(frames, 24)

        # Summary
        print("\n=== Comparison ===")
        if mp_result:
            print(f"MediaPipe: {mp_result['total_faces']} faces in {mp_result['frames_with_faces']} frames, {mp_result['time_sec']:.2f}s")
            print(f" Landmarks: {mp_result['landmarks_per_face']} per face")
        print(f"Vision Framework: (see above)")
    finally:
        # Cleanup must also run when a runner fails, otherwise the extracted
        # frames are left behind in the temp directory.
        shutil.rmtree(tmpdir, ignore_errors=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1 +0,0 @@
|
||||
../v1.1/scripts/face_mediapipe_test_v1.11.py
|
||||
@@ -225,8 +225,9 @@ class FaceProcessorVision:
|
||||
if face_img.size == 0:
|
||||
continue
|
||||
|
||||
# CoreML embedding
|
||||
emb = self.extract_face_embedding(face_img)
|
||||
# CoreML embedding - TODO: push to Qdrant _faces collection instead
|
||||
# emb = self.extract_face_embedding(face_img)
|
||||
emb = None
|
||||
if emb is not None:
|
||||
embed_count += 1
|
||||
|
||||
@@ -240,7 +241,6 @@ class FaceProcessorVision:
|
||||
faces.append({
|
||||
"x": x, "y": y, "width": w, "height": h,
|
||||
"confidence": face.get("confidence", 0.5),
|
||||
"embedding": emb,
|
||||
"pose_angle": {
|
||||
"angle": pose_angle,
|
||||
"roll": pose_info.get("roll", 0),
|
||||
@@ -262,20 +262,17 @@ class FaceProcessorVision:
|
||||
|
||||
if len(face_data["frames"]) % 100 == 0:
|
||||
elapsed = time.time() - t0
|
||||
print(f"[FACE_V2] {len(face_data['frames'])} frames, {embed_count} embeddings, {elapsed:.0f}s")
|
||||
print(f"[FACE_V2] {len(face_data['frames'])} frames, {elapsed:.0f}s")
|
||||
if self.publisher:
|
||||
pct = int(len(face_data["frames"]) * 100 / max(len(frames), 1))
|
||||
if pct > last_pct:
|
||||
last_pct = pct
|
||||
self.publisher.progress("face", len(face_data["frames"]), len(frames),
|
||||
f"{embed_count} faces", embed_count, "faces")
|
||||
"", 0, "faces")
|
||||
|
||||
self.video.release()
|
||||
|
||||
# Finalize
|
||||
face_data["metadata"]["status"] = "completed"
|
||||
face_data["metadata"]["total_embeddings"] = embed_count
|
||||
face_data["metadata"]["embedder"] = "coreml_facenet"
|
||||
|
||||
# Convert dict frames to list for Rust FaceResult format
|
||||
frames_list = []
|
||||
|
||||
@@ -1,228 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Regenerate ALL parent chunks for 384b0ff44aaaa1f1 using gemma4
|
||||
Groups ASR chunks into ~17 logical scenes and generates summaries.
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
|
||||
DB_CONFIG = {"host": "localhost", "user": "accusys", "dbname": "momentry"}
|
||||
UUID = "384b0ff44aaaa1f1"
|
||||
OLLAMA_URL = "http://localhost:11434/api/generate"
|
||||
MODEL = "gemma4:latest"
|
||||
|
||||
# Target ~17 scenes across 6865s = ~400s per scene
|
||||
# But use natural breaks (gaps in dialogue) to split
|
||||
SCENE_TARGET_COUNT = 17
|
||||
|
||||
|
||||
def get_chunks():
    """Fetch the sentence-level chunks for ``UUID``, ordered by start time.

    Returns:
        A list of dict-like rows (``RealDictCursor``) with the keys ``id``,
        ``chunk_id``, ``start_time``, ``end_time``, ``start_frame``,
        ``end_frame``, ``text_content`` and ``fps``.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # The cursor context manager closes the cursor on exit, including when
        # the query raises.
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(
                """
                SELECT id, chunk_id, start_time, end_time, start_frame, end_frame,
                       text_content, fps
                FROM chunks
                WHERE uuid = %s AND chunk_type = 'sentence'
                ORDER BY start_time
                """,
                (UUID,),
            )
            return cur.fetchall()
    finally:
        # Always release the connection, even if the query failed.
        conn.close()
|
||||
|
||||
|
||||
def call_gemma4(prompt, max_tokens=300):
    """Send a non-streaming generate request to Ollama through ``curl``.

    Args:
        prompt: Prompt text placed in the request payload.
        max_tokens: Value sent as the ``num_predict`` option.

    Returns:
        The stripped ``response`` field of the JSON reply, or ``""`` when curl
        fails, times out, or the reply cannot be parsed. Failures are reported
        on stdout and never raised (best-effort by design; callers fall back
        to their own text).
    """
    payload = {
        "model": MODEL,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.3, "num_predict": max_tokens},
    }
    try:
        resp = subprocess.run(
            ["curl", "-s", OLLAMA_URL, "-d", json.dumps(payload)],
            capture_output=True,
            text=True,
            timeout=180,
        )
        if resp.returncode != 0:
            # Previously this case returned "" without any trace, which made
            # an unreachable server indistinguishable from an empty answer.
            print(f" ⚠️ Ollama error: curl exited with status {resp.returncode}")
            return ""
        result = json.loads(resp.stdout)
        return result.get("response", "").strip()
    except Exception as e:
        # Deliberately broad: any failure degrades to an empty summary.
        print(f" ⚠️ Ollama error: {e}")
    return ""
|
||||
|
||||
|
||||
def find_scene_boundaries(chunks, target_count=SCENE_TARGET_COUNT):
    """Split time-ordered chunks into scenes at the largest dialogue gaps.

    The gap before chunk ``i`` is ``chunks[i]["start_time"]`` minus
    ``chunks[i - 1]["end_time"]``. The ``target_count - 1`` largest gaps become
    scene boundaries, so at most ``target_count`` scenes are produced (fewer
    when there are not enough chunks).

    Args:
        chunks: Sequence of mappings with ``start_time`` and ``end_time``,
            in playback order.
        target_count: Desired number of scenes. Values below 1 are treated as
            1, i.e. everything ends up in a single scene.

    Returns:
        A list of non-empty chunk lists, or ``[]`` when ``chunks`` is empty.
    """
    import heapq

    if not chunks:
        return []

    # Clamp so that target_count <= 0 cannot turn into a negative slice bound
    # (which used to select almost every gap as a boundary).
    split_budget = max(target_count - 1, 0)

    gaps = (
        (index, current["start_time"] - previous["end_time"])
        for index, (previous, current) in enumerate(zip(chunks, chunks[1:]), start=1)
    )
    # nlargest is documented as equivalent to sorted(..., reverse=True)[:n],
    # so ties resolve exactly as with a full descending sort.
    widest = heapq.nlargest(split_budget, gaps, key=lambda item: item[1])
    split_indices = sorted(index for index, _ in widest)

    # Cut the chunk list at each boundary index.
    scenes = []
    start = 0
    for split in split_indices:
        scenes.append(chunks[start:split])
        start = split
    scenes.append(chunks[start:])

    return scenes
|
||||
|
||||
|
||||
def generate_summary(scene_chunks, scene_num):
    """Produce a one-sentence summary for a scene via ``call_gemma4``.

    Args:
        scene_chunks: Chunks of the scene; ``text_content``, ``start_time``
            and ``end_time`` are read from them.
        scene_num: Scene index, used only in the placeholder texts.

    Returns:
        The model's summary; a ``"Scene N: No dialogue"`` placeholder when no
        chunk has text; or a preview built from the first three texts when the
        model returns nothing.
    """
    texts = []
    for chunk in scene_chunks:
        content = chunk["text_content"]
        if content:
            texts.append(content)

    if len(texts) == 0:
        return f"Scene {scene_num}: No dialogue"

    # Only the first 3000 characters of the joined dialogue go into the prompt.
    transcript = " ".join(texts)
    combined = transcript[:3000]

    first_chunk = scene_chunks[0]
    last_chunk = scene_chunks[-1]
    duration = last_chunk["end_time"] - first_chunk["start_time"]

    prompt = f"""You are a professional film scene analyst. Given the following dialogue transcript from a movie scene, write a concise one-sentence English summary.

Duration: {duration:.0f} seconds
Dialogue:
{combined}

Provide ONLY the summary sentence, nothing else. Focus on plot events and character actions."""

    answer = call_gemma4(prompt, max_tokens=250)
    if answer:
        return answer

    # Fallback: use first few words of dialogue
    preview = " ".join(texts[:3])[:80]
    return f"Scene {scene_num}: {preview}..."
|
||||
|
||||
|
||||
def insert_parent_chunks(scenes):
    """Insert one ``parent_chunks`` row per scene and link its child chunks.

    For every scene a summary is generated, a parent row is inserted, and the
    scene's rows in ``chunks`` get ``parent_chunk_id`` set to the new id.
    Work is committed after every fifth scene and after the last one.

    Args:
        scenes: List of chunk lists as returned by ``find_scene_boundaries``.
            Empty scenes are skipped.

    Returns:
        Number of parent rows inserted.

    Raises:
        Any database error is re-raised after rolling back the uncommitted
        batch; scenes committed earlier stay in the database.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    inserted = 0
    try:
        with conn.cursor() as cur:
            for i, scene_chunks in enumerate(scenes):
                if not scene_chunks:
                    # Nothing to summarise or link; indexing [0] would fail.
                    continue

                first, last = scene_chunks[0], scene_chunks[-1]
                start_time = first["start_time"]
                end_time = last["end_time"]
                start_frame = int(first["start_frame"])
                end_frame = int(last["end_frame"])
                # 59.94 is the fallback frame rate when the row has no fps.
                fps = float(first["fps"]) if first["fps"] else 59.94
                chunk_count = len(scene_chunks)

                print(
                    f" Scene {i}: {start_time:.0f}s-{end_time:.0f}s ({chunk_count} chunks, {end_time - start_time:.0f}s)"
                )

                # Generate summary
                summary = generate_summary(scene_chunks, i)
                print(f" 📝 {summary[:100]}...")

                # Insert parent chunk
                cur.execute(
                    """
                    INSERT INTO parent_chunks (
                        uuid, scene_order, start_time, end_time,
                        start_frame, end_frame, fps, summary_text,
                        metadata, rule_3_markers, created_at
                    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, NOW())
                    RETURNING id
                    """,
                    (
                        UUID,
                        i,
                        start_time,
                        end_time,
                        start_frame,
                        end_frame,
                        fps,
                        summary,
                        json.dumps({"auto_generated_by": "gemma4", "chunk_count": chunk_count}),
                        json.dumps({}),
                    ),
                )
                parent_id = cur.fetchone()[0]

                # Update chunks with parent_chunk_id
                chunk_ids = [c["chunk_id"] for c in scene_chunks]
                cur.execute(
                    """
                    UPDATE chunks
                    SET parent_chunk_id = %s::varchar
                    WHERE uuid = %s AND chunk_id = ANY(%s)
                    """,
                    (str(parent_id), UUID, chunk_ids),
                )

                inserted += 1
                if i % 5 == 4 or i == len(scenes) - 1:
                    conn.commit()
                    print(f" ✅ Committed scenes 0-{i}")

        conn.commit()
    except BaseException:
        # Drop the half-written batch so a parent row is never left without
        # its child links, then let the caller see the original error.
        conn.rollback()
        raise
    finally:
        conn.close()
    return inserted
|
||||
|
||||
|
||||
def main():
    """Regenerate the parent chunks for ``UUID`` and print a verification.

    Steps: fetch the sentence chunks, split them into ``SCENE_TARGET_COUNT``
    scenes, summarise and insert each scene, then report the number of parent
    rows and of sentence chunks that still have no parent.
    """
    print(f"🎬 Regenerating parent chunks for {UUID}")
    print(f" Using model: {MODEL}")
    print("=" * 70)

    # Step 1: Get all chunks
    print("\n📥 Fetching ASR chunks...")
    chunks = get_chunks()
    print(f" Found {len(chunks)} sentence chunks")
    if chunks:
        print(f" Time range: 0-{chunks[-1]['end_time']:.0f}s")

    # Step 2: Find scene boundaries
    print(f"\n🔍 Finding {SCENE_TARGET_COUNT} scene boundaries...")
    scenes = find_scene_boundaries(chunks, SCENE_TARGET_COUNT)
    print(f" Created {len(scenes)} scenes")
    for i, s in enumerate(scenes):
        print(
            f" Scene {i}: {s[0]['start_time']:.0f}s-{s[-1]['end_time']:.0f}s ({len(s)} chunks)"
        )

    # Step 3: Generate summaries and insert
    print("\n🤖 Generating summaries with gemma4...")
    inserted = insert_parent_chunks(scenes)

    print(f"\n{'=' * 70}")
    print(f"✅ Created {inserted} parent chunks")

    # Step 4: Verify
    print("\n📊 Verification:")
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # The cursor context manager closes the cursor; the finally below
        # releases the connection even if a verification query fails.
        with conn.cursor() as cur:
            cur.execute("SELECT COUNT(*) FROM parent_chunks WHERE uuid = %s", (UUID,))
            print(f" parent_chunks: {cur.fetchone()[0]}")
            cur.execute(
                "SELECT COUNT(*) FROM chunks WHERE uuid = %s AND parent_chunk_id IS NULL AND chunk_type = 'sentence'",
                (UUID,),
            )
            print(f" orphan chunks: {cur.fetchone()[0]}")
    finally:
        conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1 +0,0 @@
|
||||
../v1.1/scripts/generate_parent_chunks_gemma4_v1.11.py
|
||||
@@ -1,711 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
MediaPipe Holistic Processor - Full body keypoint extraction
|
||||
|
||||
Purpose:
|
||||
1. Extract Face Mesh (468 keypoints) → eye/mouth actions
|
||||
2. Extract Pose (33 keypoints) → arm/leg/feet actions
|
||||
3. Extract Hands (21 keypoints × 2) → hand gestures
|
||||
|
||||
Output structure:
|
||||
{
|
||||
"metadata": {...},
|
||||
"frames": {
|
||||
"frame_num": {
|
||||
"persons": [
|
||||
{
|
||||
"person_id": 0,
|
||||
"bbox": {...},
|
||||
"face_mesh": {
|
||||
"landmarks": [[x,y,z], ...], # 468 points
|
||||
"eye_features": {...},
|
||||
"mouth_features": {...},
|
||||
},
|
||||
"pose": {
|
||||
"landmarks": [[x,y,z,visibility], ...], # 33 points
|
||||
"arm_features": {...},
|
||||
"leg_features": {...},
|
||||
},
|
||||
"hands": {
|
||||
"left": {
|
||||
"landmarks": [[x,y,z], ...], # 21 points
|
||||
"gesture": "...",
|
||||
},
|
||||
"right": {
|
||||
"landmarks": [[x,y,z], ...], # 21 points
|
||||
"gesture": "...",
|
||||
},
|
||||
},
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
import json
|
||||
import argparse
|
||||
import cv2
|
||||
import numpy as np
|
||||
import mediapipe as mp
|
||||
from typing import Dict
|
||||
|
||||
|
||||
class MediaPipeHolisticProcessor:
|
||||
"""
|
||||
Process video with MediaPipe Holistic (Face + Pose + Hands)
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    model_complexity: int = 1,  # 0, 1, 2
    refine_face_landmarks: bool = True,
    enable_segmentation: bool = False,
    min_detection_confidence: float = 0.5,
    min_tracking_confidence: float = 0.5,
):
    """
    Create the MediaPipe Holistic solution and the landmark index tables.

    Args:
        model_complexity: 0 (lite), 1 (full), 2 (heavy)
        refine_face_landmarks: Enable iris detection
        enable_segmentation: Enable segmentation mask
        min_detection_confidence: Detection confidence threshold
        min_tracking_confidence: Tracking confidence threshold
    """
    solutions = mp.solutions
    self.mp_holistic = solutions.holistic
    self.mp_drawing = solutions.drawing_utils
    self.mp_drawing_styles = solutions.drawing_styles

    # Video mode with temporal smoothing of landmarks and segmentation.
    holistic_kwargs = dict(
        static_image_mode=False,
        model_complexity=model_complexity,
        smooth_landmarks=True,
        enable_segmentation=enable_segmentation,
        smooth_segmentation=True,
        refine_face_landmarks=refine_face_landmarks,
        min_detection_confidence=min_detection_confidence,
        min_tracking_confidence=min_tracking_confidence,
    )
    self.holistic = self.mp_holistic.Holistic(**holistic_kwargs)

    # Face Mesh: six points per eye used for the eye aspect ratio.
    self.LEFT_EYE_INDICES = [33, 133, 159, 145, 158, 144]
    self.RIGHT_EYE_INDICES = [362, 263, 386, 374, 385, 373]

    # Face Mesh: iris centres and mouth extremes.
    self.LEFT_IRIS_CENTER, self.RIGHT_IRIS_CENTER = 468, 473
    self.MOUTH_TOP, self.MOUTH_BOTTOM = 13, 14
    self.MOUTH_LEFT, self.MOUTH_RIGHT = 61, 291

    # Pose: each right-side joint index is the left-side index plus one.
    pose_keypoints = {"nose": 0}
    left_side_joints = (
        ("shoulder", 11),
        ("elbow", 13),
        ("wrist", 15),
        ("hip", 23),
        ("knee", 25),
        ("ankle", 27),
    )
    for joint, left_index in left_side_joints:
        pose_keypoints[f"left_{joint}"] = left_index
        pose_keypoints[f"right_{joint}"] = left_index + 1
    self.POSE_KEYPOINTS = pose_keypoints

    # Hand: wrist, four thumb joints, then mcp/pip/dip/tip for each finger,
    # numbered consecutively from 0 to 20.
    hand_names = ["wrist", "thumb_cmc", "thumb_mcp", "thumb_ip", "thumb_tip"]
    for finger in ("index", "middle", "ring", "pinky"):
        for joint in ("mcp", "pip", "dip", "tip"):
            hand_names.append(f"{finger}_{joint}")
    self.HAND_KEYPOINTS = {name: index for index, name in enumerate(hand_names)}
|
||||
|
||||
def process_frame(self, frame: np.ndarray) -> Dict:
    """
    Run Holistic on one frame and collect the per-person record.

    Args:
        frame: BGR image

    Returns:
        Dict with person_id, bbox, face_mesh, pose and hands; parts that were
        not detected stay None
    """
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = self.holistic.process(rgb_frame)
    height, width = frame.shape[:2]

    hands = {"left": None, "right": None}
    person_data = {
        "person_id": 0,
        "bbox": None,
        "face_mesh": None,
        "pose": None,
        "hands": hands,
    }

    # Face mesh
    face = results.face_landmarks
    if face:
        person_data["face_mesh"] = self._extract_face_mesh(face, width, height)

    # Pose
    pose = results.pose_landmarks
    if pose:
        person_data["pose"] = self._extract_pose(pose, width, height)

    # Hands, left first and then right
    for side in ("left", "right"):
        hand = getattr(results, f"{side}_hand_landmarks")
        if hand:
            hands[side] = self._extract_hand(hand, side, width, height)

    # Bounding box around the pose landmarks whose visibility exceeds 0.5
    if not pose:
        return person_data
    visible = [(lm.x, lm.y) for lm in pose.landmark if lm.visibility > 0.5]
    if not visible:
        return person_data

    xs, ys = zip(*visible)
    left, right = min(xs), max(xs)
    top, bottom = min(ys), max(ys)
    person_data["bbox"] = {
        "x": int(left * width),
        "y": int(top * height),
        "width": int((right - left) * width),
        "height": int((bottom - top) * height),
    }
    return person_data
|
||||
|
||||
def _extract_face_mesh(self, face_landmarks, width: int, height: int) -> Dict:
|
||||
"""
|
||||
Extract face mesh landmarks and calculate features
|
||||
|
||||
Args:
|
||||
face_landmarks: MediaPipe face landmarks
|
||||
width: Frame width in pixels
|
||||
height: Frame height in pixels
|
||||
|
||||
Returns:
|
||||
Dict with landmarks (in pixels), eye_features, mouth_features
|
||||
"""
|
||||
landmarks = []
|
||||
for lm in face_landmarks.landmark:
|
||||
landmarks.append([int(lm.x * width), int(lm.y * height), lm.z])
|
||||
|
||||
# Eye Aspect Ratio (EAR)
|
||||
def calculate_ear(eye_indices):
|
||||
# Get eye points
|
||||
p1 = face_landmarks.landmark[eye_indices[0]]
|
||||
p2 = face_landmarks.landmark[eye_indices[1]]
|
||||
p3 = face_landmarks.landmark[eye_indices[2]]
|
||||
p4 = face_landmarks.landmark[eye_indices[3]]
|
||||
p5 = face_landmarks.landmark[eye_indices[4]]
|
||||
p6 = face_landmarks.landmark[eye_indices[5]]
|
||||
|
||||
# Vertical distances
|
||||
vertical_1 = np.linalg.norm([p3.x - p5.x, p3.y - p5.y])
|
||||
vertical_2 = np.linalg.norm([p4.x - p6.x, p4.y - p6.y])
|
||||
|
||||
# Horizontal distance
|
||||
horizontal = np.linalg.norm([p1.x - p2.x, p1.y - p2.y])
|
||||
|
||||
ear = (vertical_1 + vertical_2) / (2 * horizontal) if horizontal > 0 else 0
|
||||
return ear
|
||||
|
||||
left_ear = calculate_ear(self.LEFT_EYE_INDICES)
|
||||
right_ear = calculate_ear(self.RIGHT_EYE_INDICES)
|
||||
avg_ear = (left_ear + right_ear) / 2
|
||||
|
||||
# Iris position (if refined landmarks enabled)
|
||||
left_iris_x = None
|
||||
right_iris_x = None
|
||||
|
||||
if len(face_landmarks.landmark) > 477:
|
||||
left_iris = face_landmarks.landmark[self.LEFT_IRIS_CENTER]
|
||||
right_iris = face_landmarks.landmark[self.RIGHT_IRIS_CENTER]
|
||||
|
||||
# Normalize iris position relative to eye
|
||||
left_eye_center_x = (face_landmarks.landmark[33].x + face_landmarks.landmark[133].x) / 2
|
||||
right_eye_center_x = (face_landmarks.landmark[362].x + face_landmarks.landmark[263].x) / 2
|
||||
|
||||
left_eye_width = abs(face_landmarks.landmark[33].x - face_landmarks.landmark[133].x)
|
||||
right_eye_width = abs(face_landmarks.landmark[362].x - face_landmarks.landmark[263].x)
|
||||
|
||||
left_iris_x = (left_iris.x - left_eye_center_x) / left_eye_width if left_eye_width > 0 else 0
|
||||
right_iris_x = (right_iris.x - right_eye_center_x) / right_eye_width if right_eye_width > 0 else 0
|
||||
|
||||
# Eye action detection
|
||||
eye_action = "unknown"
|
||||
if avg_ear < 0.15:
|
||||
eye_action = "closed"
|
||||
elif avg_ear > 0.4:
|
||||
eye_action = "wide_open"
|
||||
elif 0.15 <= avg_ear < 0.25:
|
||||
eye_action = "squint"
|
||||
else:
|
||||
eye_action = "normal"
|
||||
|
||||
# Gaze direction
|
||||
gaze_direction = "center"
|
||||
if left_iris_x and right_iris_x:
|
||||
avg_iris_x = (left_iris_x + right_iris_x) / 2
|
||||
if avg_iris_x < -0.2:
|
||||
gaze_direction = "left"
|
||||
elif avg_iris_x > 0.2:
|
||||
gaze_direction = "right"
|
||||
|
||||
# Mouth Aspect Ratio (MAR)
|
||||
mouth_top = face_landmarks.landmark[self.MOUTH_TOP]
|
||||
mouth_bottom = face_landmarks.landmark[self.MOUTH_BOTTOM]
|
||||
mouth_left = face_landmarks.landmark[self.MOUTH_LEFT]
|
||||
mouth_right = face_landmarks.landmark[self.MOUTH_RIGHT]
|
||||
|
||||
mouth_height = np.linalg.norm([mouth_top.x - mouth_bottom.x, mouth_top.y - mouth_bottom.y])
|
||||
mouth_width = np.linalg.norm([mouth_left.x - mouth_right.x, mouth_left.y - mouth_right.y])
|
||||
|
||||
mar = mouth_height / mouth_width if mouth_width > 0 else 0
|
||||
|
||||
# Mouth corner distance (for smile detection)
|
||||
mouth_center_y = (mouth_top.y + mouth_bottom.y) / 2
|
||||
corner_lift = (mouth_center_y - mouth_left.y) + (mouth_center_y - mouth_right.y)
|
||||
|
||||
# Mouth action detection
|
||||
mouth_action = "unknown"
|
||||
if mar > 0.7:
|
||||
mouth_action = "yawn"
|
||||
elif mar > 0.5:
|
||||
mouth_action = "open"
|
||||
elif mar < 0.2:
|
||||
if corner_lift > 0.02:
|
||||
mouth_action = "smile"
|
||||
else:
|
||||
mouth_action = "closed"
|
||||
else:
|
||||
mouth_action = "slightly_open"
|
||||
|
||||
return {
|
||||
"landmarks": landmarks,
|
||||
"num_landmarks": len(landmarks),
|
||||
"eye_features": {
|
||||
"left_ear": round(left_ear, 4),
|
||||
"right_ear": round(right_ear, 4),
|
||||
"avg_ear": round(avg_ear, 4),
|
||||
"left_iris_x": round(left_iris_x, 4) if left_iris_x else None,
|
||||
"right_iris_x": round(right_iris_x, 4) if right_iris_x else None,
|
||||
"eye_action": eye_action,
|
||||
"gaze_direction": gaze_direction,
|
||||
},
|
||||
"mouth_features": {
|
||||
"mar": round(mar, 4),
|
||||
"mouth_height": round(mouth_height, 4),
|
||||
"mouth_width": round(mouth_width, 4),
|
||||
"corner_lift": round(corner_lift, 4),
|
||||
"mouth_action": mouth_action,
|
||||
},
|
||||
}
|
||||
|
||||
def _extract_pose(self, pose_landmarks, width: int, height: int) -> Dict:
|
||||
"""
|
||||
Extract pose landmarks and calculate features
|
||||
|
||||
Args:
|
||||
pose_landmarks: MediaPipe pose landmarks
|
||||
width: Frame width in pixels
|
||||
height: Frame height in pixels
|
||||
|
||||
Returns:
|
||||
Dict with landmarks (in pixels), arm_features, leg_features
|
||||
"""
|
||||
landmarks = []
|
||||
for lm in pose_landmarks.landmark:
|
||||
landmarks.append([int(lm.x * width), int(lm.y * height), lm.z, lm.visibility])
|
||||
|
||||
# Helper function to calculate angle
|
||||
def calculate_angle(p1_idx, p2_idx, p3_idx):
|
||||
p1 = pose_landmarks.landmark[p1_idx]
|
||||
p2 = pose_landmarks.landmark[p2_idx]
|
||||
p3 = pose_landmarks.landmark[p3_idx]
|
||||
|
||||
v1 = np.array([p1.x, p1.y]) - np.array([p2.x, p2.y])
|
||||
v2 = np.array([p3.x, p3.y]) - np.array([p2.x, p2.y])
|
||||
|
||||
angle = np.arccos(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
|
||||
return np.degrees(angle)
|
||||
|
||||
# Arm features
|
||||
left_elbow_angle = calculate_angle(11, 13, 15) # shoulder-elbow-wrist
|
||||
right_elbow_angle = calculate_angle(12, 14, 16)
|
||||
|
||||
# Check if arms raised
|
||||
left_wrist = pose_landmarks.landmark[15]
|
||||
left_elbow = pose_landmarks.landmark[13]
|
||||
left_shoulder = pose_landmarks.landmark[11]
|
||||
|
||||
right_wrist = pose_landmarks.landmark[16]
|
||||
right_elbow = pose_landmarks.landmark[14]
|
||||
right_shoulder = pose_landmarks.landmark[12]
|
||||
|
||||
left_arm_raised = left_wrist.y < left_elbow.y < left_shoulder.y
|
||||
right_arm_raised = right_wrist.y < right_elbow.y < right_shoulder.y
|
||||
|
||||
# Arm action detection
|
||||
left_arm_action = "unknown"
|
||||
if left_arm_raised:
|
||||
left_arm_action = "raise_left"
|
||||
elif left_elbow_angle > 150:
|
||||
left_arm_action = "extend_left"
|
||||
elif left_elbow_angle < 90:
|
||||
left_arm_action = "fold_left"
|
||||
else:
|
||||
left_arm_action = "neutral_left"
|
||||
|
||||
right_arm_action = "unknown"
|
||||
if right_arm_raised:
|
||||
right_arm_action = "raise_right"
|
||||
elif right_elbow_angle > 150:
|
||||
right_arm_action = "extend_right"
|
||||
elif right_elbow_angle < 90:
|
||||
right_arm_action = "fold_right"
|
||||
else:
|
||||
right_arm_action = "neutral_right"
|
||||
|
||||
# Cross arms detection
|
||||
cross_arms = False
|
||||
if left_wrist.x > right_wrist.x and right_wrist.x < left_shoulder.x:
|
||||
cross_arms = True
|
||||
|
||||
# Leg features
|
||||
left_knee_angle = calculate_angle(23, 25, 27) # hip-knee-ankle
|
||||
right_knee_angle = calculate_angle(24, 26, 28)
|
||||
|
||||
# Check standing/sitting
|
||||
left_hip = pose_landmarks.landmark[23]
|
||||
left_knee = pose_landmarks.landmark[25]
|
||||
left_ankle = pose_landmarks.landmark[27]
|
||||
|
||||
right_hip = pose_landmarks.landmark[24]
|
||||
right_knee = pose_landmarks.landmark[26]
|
||||
right_ankle = pose_landmarks.landmark[28]
|
||||
|
||||
hip_avg_y = (left_hip.y + right_hip.y) / 2
|
||||
knee_avg_y = (left_knee.y + right_knee.y) / 2
|
||||
|
||||
# Standing: hip < knee < ankle (y increases downward)
|
||||
standing = left_hip.y < left_knee.y < left_ankle.y and right_hip.y < right_knee.y < right_ankle.y
|
||||
|
||||
# Sitting: hip ≈ knee height
|
||||
sitting = abs(hip_avg_y - knee_avg_y) < 0.1
|
||||
|
||||
# Leg action detection
|
||||
leg_action = "unknown"
|
||||
if sitting:
|
||||
leg_action = "sit"
|
||||
elif standing:
|
||||
if left_knee_angle < 120 or right_knee_angle < 120:
|
||||
leg_action = "knee_bend"
|
||||
else:
|
||||
leg_action = "stand"
|
||||
|
||||
return {
|
||||
"landmarks": landmarks,
|
||||
"num_landmarks": len(landmarks),
|
||||
"arm_features": {
|
||||
"left_elbow_angle": round(left_elbow_angle, 2),
|
||||
"right_elbow_angle": round(right_elbow_angle, 2),
|
||||
"left_arm_raised": left_arm_raised,
|
||||
"right_arm_raised": right_arm_raised,
|
||||
"left_arm_action": left_arm_action,
|
||||
"right_arm_action": right_arm_action,
|
||||
"cross_arms": cross_arms,
|
||||
},
|
||||
"leg_features": {
|
||||
"left_knee_angle": round(left_knee_angle, 2),
|
||||
"right_knee_angle": round(right_knee_angle, 2),
|
||||
"standing": standing,
|
||||
"sitting": sitting,
|
||||
"leg_action": leg_action,
|
||||
},
|
||||
}
|
||||
|
||||
def _extract_hand(self, hand_landmarks, hand_type: str, width: int, height: int) -> Dict:
|
||||
"""
|
||||
Extract hand landmarks and detect gesture
|
||||
|
||||
Args:
|
||||
hand_landmarks: MediaPipe hand landmarks
|
||||
hand_type: "left" or "right"
|
||||
width: Frame width in pixels
|
||||
height: Frame height in pixels
|
||||
|
||||
Returns:
|
||||
Dict with landmarks (in pixels), gesture
|
||||
"""
|
||||
landmarks = []
|
||||
for lm in hand_landmarks.landmark:
|
||||
landmarks.append([int(lm.x * width), int(lm.y * height), lm.z])
|
||||
|
||||
# Check finger extensions
|
||||
def is_finger_extended(tip_idx, pip_idx):
|
||||
tip = hand_landmarks.landmark[tip_idx]
|
||||
pip = hand_landmarks.landmark[pip_idx]
|
||||
|
||||
# Finger is extended if tip is higher (lower y) than pip
|
||||
return tip.y < pip.y
|
||||
|
||||
thumb_extended = is_finger_extended(4, 3)
|
||||
index_extended = is_finger_extended(8, 6)
|
||||
middle_extended = is_finger_extended(12, 10)
|
||||
ring_extended = is_finger_extended(16, 14)
|
||||
pinky_extended = is_finger_extended(20, 18)
|
||||
|
||||
extensions = {
|
||||
"thumb": thumb_extended,
|
||||
"index": index_extended,
|
||||
"middle": middle_extended,
|
||||
"ring": ring_extended,
|
||||
"pinky": pinky_extended,
|
||||
}
|
||||
|
||||
# Gesture detection
|
||||
gesture = "unknown"
|
||||
|
||||
num_extended = sum(extensions.values())
|
||||
|
||||
if num_extended == 5:
|
||||
gesture = "open_hand"
|
||||
elif num_extended == 0:
|
||||
gesture = "fist"
|
||||
elif thumb_extended and num_extended == 1:
|
||||
gesture = "thumbs_up"
|
||||
elif index_extended and middle_extended and num_extended == 2:
|
||||
gesture = "peace_sign"
|
||||
elif index_extended and num_extended == 1:
|
||||
gesture = "pointing"
|
||||
elif thumb_extended and index_extended and not any([middle_extended, ring_extended, pinky_extended]):
|
||||
# Check thumb-index distance for OK gesture
|
||||
thumb_tip = hand_landmarks.landmark[4]
|
||||
index_tip = hand_landmarks.landmark[8]
|
||||
|
||||
distance = np.linalg.norm([thumb_tip.x - index_tip.x, thumb_tip.y - index_tip.y])
|
||||
|
||||
if distance < 0.05:
|
||||
gesture = "ok_sign"
|
||||
else:
|
||||
gesture = "grab"
|
||||
|
||||
return {
|
||||
"landmarks": landmarks,
|
||||
"num_landmarks": len(landmarks),
|
||||
"finger_extensions": extensions,
|
||||
"num_fingers_extended": num_extended,
|
||||
"gesture": gesture,
|
||||
"hand_type": hand_type,
|
||||
}
|
||||
|
||||
def process_video(
    self,
    video_path: str,
    output_path: str,
    sample_interval: int = 1,
    uuid: str = "",
) -> Dict:
    """
    Process an entire video and write the per-frame results as JSON.

    Progress and errors are reported on stderr as ``MEDIAPIPE_*`` lines.

    Args:
        video_path: Path to video file
        output_path: Path to output JSON
        sample_interval: Process every N frames (values below 1 are treated as 1)
        uuid: UUID for progress reporting (accepted but not used in this method)

    Returns:
        Dict with metadata and the frames that had any detection, or an
        empty dict when the video cannot be opened
    """
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        cap.release()
        print(f"MEDIAPIPE_ERROR:Cannot open video: {video_path}", file=sys.stderr)
        return {}

    # An interval of 0 would raise ZeroDivisionError in the modulo below.
    if sample_interval < 1:
        sample_interval = 1

    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        print("MEDIAPIPE_START", file=sys.stderr)
        print(f"MEDIAPIPE_INFO:FPS={fps},total={total_frames},interval={sample_interval}", file=sys.stderr)

        output_data = {
            "metadata": {
                "video_path": video_path,
                "fps": fps,
                "width": width,
                "height": height,
                "total_frames": total_frames,
                "sample_interval": sample_interval,
                "processor": "mediapipe_holistic",
                # NOTE(review): these two values are written as constants here,
                # not read from the processor's configuration.
                "model_complexity": 1,
                "refine_face_landmarks": True,
            },
            "frames": {},
        }

        frame_count = 0
        processed_count = 0

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Frame numbers are 1-based: the first decoded frame is frame 1.
            frame_count += 1
            if frame_count % sample_interval != 0:
                continue

            person_data = self.process_frame(frame)

            # Only save frames where at least one detector produced landmarks.
            hands = person_data["hands"]
            if person_data["face_mesh"] or person_data["pose"] or hands["left"] or hands["right"]:
                timestamp = frame_count / fps if fps > 0 else 0
                output_data["frames"][str(frame_count)] = {
                    "frame_number": frame_count,
                    "timestamp": round(timestamp, 3),
                    "persons": [person_data],
                }

            # NOTE(review): the listing this was rebuilt from lost its
            # indentation; the counter is assumed to count every sampled
            # frame, not only the saved ones - confirm.
            processed_count += 1
            if processed_count % 100 == 0:
                print(f"MEDIAPIPE_FRAME:{processed_count}", file=sys.stderr)
    finally:
        # Release the capture even when frame processing raises.
        cap.release()

    output_data["metadata"]["processed_frames"] = processed_count

    with open(output_path, "w") as f:
        json.dump(output_data, f, indent=2)

    print(f"MEDIAPIPE_COMPLETE:{processed_count}", file=sys.stderr)

    return output_data
|
||||
|
||||
def close(self):
    """Release the MediaPipe model held in ``self.holistic``."""
    holistic = self.holistic
    holistic.close()
|
||||
|
||||
|
||||
def _print_frame_report(person_data):
    """Print the face, pose and hand results of one processed frame."""
    print("\n=== Results ===")

    if person_data["face_mesh"]:
        face = person_data["face_mesh"]
        print(f"\nFace Mesh: {face['num_landmarks']} landmarks")
        print(f" Eye: {face['eye_features']['eye_action']} (EAR: {face['eye_features']['avg_ear']})")
        print(f" Gaze: {face['eye_features']['gaze_direction']}")
        print(f" Mouth: {face['mouth_features']['mouth_action']} (MAR: {face['mouth_features']['mar']})")

    if person_data["pose"]:
        pose = person_data["pose"]
        print(f"\nPose: {pose['num_landmarks']} keypoints")
        print(f" Left arm: {pose['arm_features']['left_arm_action']} (angle: {pose['arm_features']['left_elbow_angle']}°)")
        print(f" Right arm: {pose['arm_features']['right_arm_action']} (angle: {pose['arm_features']['right_elbow_angle']}°)")
        print(f" Cross arms: {pose['arm_features']['cross_arms']}")
        print(f" Leg: {pose['leg_features']['leg_action']}")

    for side in ("left", "right"):
        hand = person_data["hands"][side]
        if hand:
            print(f"\n{side.capitalize()} hand: {hand['num_landmarks']} keypoints")
            print(f" Gesture: {hand['gesture']}")
            print(f" Fingers extended: {hand['num_fingers_extended']}")


def main():
    """Command-line entry point: process a whole video or inspect one frame.

    The video and output paths may be given positionally or through
    ``--video`` / ``--output``; the flags win when both are present.
    With ``--test-frame N`` only that frame is processed and its results
    are printed instead of written to the output file.
    """
    parser = argparse.ArgumentParser(description="MediaPipe Holistic Processor")
    parser.add_argument("video_path", nargs="?", help="Path to video file (positional)")
    parser.add_argument("output_path", nargs="?", help="Path to output JSON (positional)")
    parser.add_argument("--video", help="Path to video file")
    parser.add_argument("--output", help="Path to output JSON")
    parser.add_argument("--sample-interval", type=int, default=1, help="Process every N frames")
    parser.add_argument("--model-complexity", type=int, default=1, choices=[0, 1, 2], help="Model complexity")
    parser.add_argument("--test-frame", type=int, help="Test single frame only")
    parser.add_argument("--uuid", default="", help="UUID for progress reporting")
    args = parser.parse_args()

    # Resolve positional vs flagged args
    video_path = args.video or args.video_path
    output_path = args.output or args.output_path
    if not video_path or not output_path:
        parser.error("video_path and output_path are required")

    print("=" * 70)
    print("MediaPipe Holistic Processor")
    print("=" * 70)

    processor = MediaPipeHolisticProcessor(
        model_complexity=args.model_complexity,
        refine_face_landmarks=True,
    )

    try:
        if args.test_frame:
            print(f"\nTesting frame {args.test_frame}...")

            cap = cv2.VideoCapture(video_path)
            try:
                # --test-frame is 1-based; the capture position is 0-based.
                cap.set(cv2.CAP_PROP_POS_FRAMES, args.test_frame - 1)
                ret, frame = cap.read()
            finally:
                cap.release()

            if ret:
                _print_frame_report(processor.process_frame(frame))
            else:
                print("❌ Cannot read frame")
        else:
            processor.process_video(
                video_path,
                output_path,
                args.sample_interval,
                uuid=args.uuid,
            )
    finally:
        # Close the model even when processing fails part-way.
        processor.close()


if __name__ == "__main__":
    main()
|
||||
@@ -1 +0,0 @@
|
||||
../v1.1/scripts/mediapipe_holistic_processor_v1.11.py
|
||||
@@ -1 +0,0 @@
|
||||
../v1.1/scripts/mediapipe_processor_v1.11.py
|
||||
@@ -1,381 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Story Processor V2.0 — Dual Pipeline: Story-based + LLM-based Parent-Child Summarization
|
||||
|
||||
Pipeline 1 (Story): Template-based, instant, no LLM cost
|
||||
→ Parent story summary + Child story summary
|
||||
→ Embedding (Ollama nomic-embed) → pgvector
|
||||
→ BM25 (PostgreSQL tsvector) → full-text search
|
||||
|
||||
Pipeline 2 (LLM): LLM-based summarization (Gemma4/Qwen when resources allow)
|
||||
→ Parent LLM summary + Child LLM summary
|
||||
→ Embedding → pgvector + BM25
|
||||
|
||||
Both pipelines store into chunks table with distinct chunk_types:
|
||||
story_parent, story_child, llm_parent, llm_child
|
||||
|
||||
Usage:
|
||||
python parent_chunk_5w1h.py --file-uuid <uuid> --mode story [--embed]
|
||||
python parent_chunk_5w1h.py --file-uuid <uuid> --mode llm [--embed]
|
||||
"""
|
||||
|
||||
import json, os, sys, argparse, time, requests, psycopg2
|
||||
from collections import defaultdict
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
# Runtime configuration; each value can be overridden through the environment.
DB_URL = os.environ.get("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")
SCHEMA = os.environ.get("DATABASE_SCHEMA", "dev")
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output_dev")
EMBEDDING_URL = os.environ.get("EMBEDDING_URL", "http://localhost:11436/v1/embeddings")
|
||||
|
||||
def load_speaker_map(file_uuid: str) -> dict:
    """Load the speaker→identity mapping from the DB.

    Reads every row of ``identities`` whose metadata carries a
    ``speaker_id`` and maps that id to ``(name, 0.85)``.

    Args:
        file_uuid: Currently not used by the query; the mapping is not
            filtered per file.

    Returns:
        Dict of speaker_id → (identity name, confidence). Falls back to
        ``DEFAULT_SPEAKER_MAP`` when the lookup fails or returns no rows.
    """
    conn = None
    try:
        conn = psycopg2.connect(DB_URL)
        cur = conn.cursor()
        try:
            cur.execute("SET search_path TO %s, public", (SCHEMA,))
            cur.execute(
                "SELECT metadata->>'speaker_id', name FROM identities "
                "WHERE metadata->>'speaker_id' IS NOT NULL"
            )
            rows = cur.fetchall()
        finally:
            cur.close()
    except Exception as exc:
        # Best effort: a missing DB must not stop the pipeline, but say so.
        print(f"[STORY] speaker map lookup failed, using default: {exc}", file=sys.stderr)
        return DEFAULT_SPEAKER_MAP
    finally:
        # Close the connection on every path, including query failures.
        if conn is not None:
            conn.close()

    # 0.85 is the default confidence from MAR.
    spk_map = {spk_id: (name, 0.85) for spk_id, name in rows}
    return spk_map or DEFAULT_SPEAKER_MAP
|
||||
|
||||
# Fallback speaker map, used when the DB yields no speaker mapping.
DEFAULT_SPEAKER_MAP = {}

# Version tag per pipeline stage; stored with every chunk as "source_versions".
CURRENT_VERSIONS = dict(
    asr="faster-whisper/small/v1",
    asrx="speechbrain/ecapa-tdnn/v1",
    cut="pyscenedetect/default",
    yolo="yolov5-coreml/v2",
    face_detection="apple-vision/v2",
    face_embedding="coreml-facenet/v2",
    speaker_binding="mar-lip/v1",
    identity_clustering="cosine-threshold/v1",
    story_agent="template/v2.0",
    embedding_agent="nomic-embed-768d/v1",
)

# MOMENTRY_LLM_URL wins over MOMENTRY_LLM_SUMMARY_URL; the local server is the last resort.
LLM_URL = os.getenv(
    "MOMENTRY_LLM_URL",
    os.getenv("MOMENTRY_LLM_SUMMARY_URL", "http://127.0.0.1:8082/v1/chat/completions"),
)
LLM_MODEL = os.getenv("MOMENTRY_LLM_SUMMARY_MODEL", "gemma4")
|
||||
|
||||
|
||||
def load_data(file_uuid: str) -> dict:
    """Load the ASR, ASRX and CUT JSON outputs of one file from OUTPUT_DIR.

    Args:
        file_uuid: File identifier; inputs are read from
            ``<OUTPUT_DIR>/<file_uuid>.<name>.json``.

    Returns:
        Dict with keys "asr", "asrx" and "cut"; a value is the parsed JSON,
        or None when that file does not exist.
    """
    data = {}
    for name in ["asr", "asrx", "cut"]:
        path = os.path.join(OUTPUT_DIR, f"{file_uuid}.{name}.json")
        if os.path.exists(path):
            # Close the handle promptly and do not depend on the locale encoding.
            with open(path, encoding="utf-8") as f:
                data[name] = json.load(f)
        else:
            data[name] = None
    return data
|
||||
|
||||
|
||||
def _resolve_speaker(st, en, asrx_segs, speaker_map):
    """Return (speaker_id, speaker name, confidence) for the ASR span [st, en]."""
    spk_id = "unknown"
    for ax in asrx_segs:
        # The first ASRX segment that fully covers the sentence wins.
        if ax["start_time"] <= st and ax["end_time"] >= en:
            spk_id = ax.get("speaker_id", "unknown")
            break

    spk_info = speaker_map.get(spk_id)
    if spk_info:
        character, spk_conf = spk_info
    else:
        # No identity known for this speaker: fall back to the raw id.
        character, spk_conf = spk_id, 0.0
    return spk_id, character, spk_conf


def build_child_chunks(data: dict, file_uuid: str) -> List[dict]:
    """Group ASR sentences by CUT scene boundaries → parent/child structure.

    Args:
        data: Dict with "asr", "asrx" and "cut" entries (parsed JSON or None).
        file_uuid: File identifier; used for child chunk ids and passed to
            ``load_speaker_map``.

    Returns:
        One dict per scene that has at least one child, with start_time,
        end_time, duration, children and child_count. Within a scene the
        fully contained sentences come first, followed by the sentences
        that only overlap the scene (marked ``"overlap_type": "partial"``).
    """
    asr_segs = data["asr"].get("segments", []) if data["asr"] else []
    asrx_segs = data["asrx"].get("segments", []) if data["asrx"] else []
    cut_scenes = data["cut"].get("scenes", []) if data["cut"] else []

    # Dynamically load speaker→identity mapping from DB
    speaker_map = load_speaker_map(file_uuid)

    if not cut_scenes:
        # No scene detection available: fall back to fixed 60-second windows.
        max_t = max(
            (asr_segs[-1].get("end", 0) if asr_segs else 0),
            (asrx_segs[-1].get("end_time", 0) if asrx_segs else 0),
        )
        # Keep the first window always; drop later ones that would start at
        # or after max_t (they would be zero-length or inverted).
        cut_scenes = [
            {"start_time": t, "end_time": min(t + 60, max_t)}
            for t in range(0, int(max_t) + 60, 60)
            if t == 0 or t < max_t
        ]

    scenes = []
    for cs in cut_scenes:
        s, e = cs["start_time"], cs["end_time"]

        contained, partial = [], []
        for seg_idx, seg in enumerate(asr_segs):
            st, en = seg.get("start", 0), seg.get("end", 0)
            text = seg.get("text", "").strip()
            if not text:
                continue

            inside = st >= s and en <= e
            if not inside and not (st < e and en > s):
                continue  # no overlap with this scene at all

            spk_id, character, spk_conf = _resolve_speaker(st, en, asrx_segs, speaker_map)
            child = {
                "start": st, "end": en, "text": text,
                "speaker_id": spk_id, "speaker_name": character,
                "speaker_confidence": spk_conf,
                "chunk_id": f"{file_uuid}_{seg_idx}",
            }
            if inside:
                contained.append(child)
            else:
                # Sentence crosses a scene boundary: attach it to every scene it touches.
                child["overlap_type"] = "partial"
                partial.append(child)

        children = contained + partial
        if children:
            scenes.append({
                "start_time": s, "end_time": e, "duration": e - s,
                "children": children, "child_count": len(children),
            })
    return scenes
|
||||
|
||||
|
||||
# ===== Pipeline 1: Story (Template) Summaries =====
|
||||
|
||||
def generate_story_parent_summary(scene: dict) -> str:
    """Build the template (non-LLM) summary line for one scene.

    The line carries the scene's time span, the sorted cast, the total
    line and word counts, and the per-speaker line counts of the first
    three speakers in alphabetical order.
    """
    entries = scene["children"]

    line_counts = {}
    word_total = 0
    for entry in entries:
        name = entry["speaker_name"]
        line_counts[name] = line_counts.get(name, 0) + 1
        word_total += len(entry["text"].split())

    cast = sorted(line_counts)
    per_speaker = [f"{name} ({line_counts[name]} lines)" for name in cast]

    span = f"[{scene['start_time']:.0f}s-{scene['end_time']:.0f}s, {scene['duration']:.0f}s] "
    totals = f"Cast: {', '.join(cast)}. Total: {len(entries)} lines, {word_total} words. "
    top_speakers = f"Speakers: {' | '.join(per_speaker[:3])}"
    return span + totals + top_speakers
|
||||
|
||||
|
||||
def generate_story_child_summary(child: dict, parent_summary: str) -> str:
    """Format one sentence as ``[start-end] speaker: "text"``.

    ``parent_summary`` is accepted but not used.
    """
    span = f"[{child['start']:.0f}s-{child['end']:.0f}s] "
    quote = f"{child['speaker_name']}: \"{child['text']}\""
    return span + quote
|
||||
|
||||
|
||||
# ===== Pipeline 2: LLM Summaries (requires LLM server) =====
|
||||
|
||||
def generate_llm_parent_summary(scene: dict, max_scenes_processed: int) -> Optional[str]:
    """LLM-based parent summary for one scene.

    Sends at most the first 15 lines of the scene, each cut to 150
    characters, to the chat-completions endpoint at ``LLM_URL``.

    Args:
        scene: Scene dict with start_time, end_time and children.
        max_scenes_processed: Accepted but not used.

    Returns:
        The stripped summary text, or None when no LLM URL is configured
        or the request fails for any reason.
    """
    if not LLM_URL:
        return None

    dialogue = "\n".join(
        f"[{c['start']:.0f}s] {c['speaker_name']}: {c['text'][:150]}"
        for c in scene["children"][:15]
    )
    prompt = (
        "You are a film analyst. Summarize this scene in one flowing paragraph (60-100 words). "
        "Include: who is present, what they discuss, tone/mood.\n\n"
        f"Scene: {scene['start_time']:.0f}s - {scene['end_time']:.0f}s\n"
        f"Dialogue:\n{dialogue}\n\nSummary:"
    )
    payload = {
        "model": LLM_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 200, "temperature": 0.3,
    }
    try:
        resp = requests.post(LLM_URL, json=payload, timeout=60)
        # Report HTTP failures as such instead of a KeyError on the error body.
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f" ⚠️ LLM parent summary failed: {e}")
        return None
|
||||
|
||||
|
||||
def generate_llm_child_summary(child: dict, parent_summary: str) -> Optional[str]:
    """Child (sentence) summary for the LLM pipeline.

    No LLM call is made here: the sentence is formatted as
    ``[start-end] speaker: "text"``. ``parent_summary`` is not used.
    """
    start, end = child["start"], child["end"]
    speaker, text = child["speaker_name"], child["text"]
    return f"[{start:.0f}s-{end:.0f}s] {speaker}: \"{text}\""
|
||||
|
||||
|
||||
# ===== Embedding (Ollama nomic-embed) =====
|
||||
|
||||
def embed_text(text: str, max_retries: int = 3) -> Optional[List[float]]:
    """Get an embedding for ``text`` from the server at ``EMBEDDING_URL``.

    Args:
        text: Text to embed; sent as a one-element "input" list.
        max_retries: Maximum number of attempts.

    Returns:
        The embedding of the first returned item, or None when every
        attempt fails (exception, non-200 status, or empty "data").
    """
    last_problem = None
    for attempt in range(max_retries):
        if attempt:
            # Pause between attempts only, never after the last one.
            time.sleep(1)
        try:
            resp = requests.post(EMBEDDING_URL, json={
                "input": [text],
            }, timeout=30)
            if resp.status_code == 200:
                items = resp.json().get("data", [])
                if items:
                    return items[0]["embedding"]
                last_problem = "response contained no embeddings"
            else:
                last_problem = f"HTTP {resp.status_code}"
        except Exception as e:
            last_problem = e

    if last_problem is not None:
        # Report every kind of final failure, not only exceptions.
        print(f" ⚠️ Embedding failed: {last_problem}")
    return None
|
||||
|
||||
|
||||
# ===== DB Store (chunks table with embedding + BM25) =====
|
||||
|
||||
def _upsert_chunk(cur, row: tuple, embedding, refresh_parent_link: bool) -> None:
    """Insert one chunk row, or update it on a (file_uuid, old_chunk_id) conflict."""
    columns = [
        "chunk_id", "old_chunk_id", "file_uuid", "chunk_type", "chunk_index",
        "start_time", "end_time", "content", "text_content", "parent_chunk_id",
    ]
    placeholders = ["%s"] * 7 + ["%s::jsonb", "%s", "%s"]
    updates = ["content = EXCLUDED.content", "text_content = EXCLUDED.text_content"]
    if refresh_parent_link:
        updates.append("parent_chunk_id = EXCLUDED.parent_chunk_id")

    params = tuple(row)
    if embedding:
        # The embedding column is only written when an embedding exists.
        columns.append("embedding")
        placeholders.append("%s::vector")
        updates.append("embedding = EXCLUDED.embedding")
        params += (embedding,)

    # SCHEMA is interpolated as an identifier; it must come from trusted config.
    cur.execute(
        f"""
        INSERT INTO {SCHEMA}.chunk ({', '.join(columns)})
        VALUES ({', '.join(placeholders)})
        ON CONFLICT (file_uuid, old_chunk_id) DO UPDATE
        SET {', '.join(updates)}
        """,
        params,
    )


def store_chunks(file_uuid: str, scenes: List[dict], mode: str, do_embed: bool, conn):
    """Store parent + child summaries into the chunk table.

    Args:
        file_uuid: File the chunks belong to.
        scenes: Scene dicts with start_time, end_time and children.
        mode: "story" uses the template summaries; any other value uses the
            LLM summary functions. Also prefixes the chunk types and parent ids.
        do_embed: When true, an embedding is requested for every summary;
            a chunk whose embedding request fails is stored without one.
        conn: Open DB connection; committed once after all rows are written.

    Returns:
        Tuple (parent_count, child_count) of rows written. Scenes whose
        parent summary is empty are skipped together with their children.
    """
    parent_type = f"{mode}_parent"
    child_type = f"{mode}_child"
    parent_count = 0
    child_count = 0

    cur = conn.cursor()
    try:
        # New rows continue after the file's current highest chunk_index.
        cur.execute(
            f"SELECT COALESCE(MAX(chunk_index), 0) FROM {SCHEMA}.chunk WHERE file_uuid = %s",
            (file_uuid,),
        )
        next_index = (cur.fetchone()[0] or 0) + 1

        for scene in scenes:
            if mode == "story":
                parent_text = generate_story_parent_summary(scene)
            else:
                parent_text = generate_llm_parent_summary(scene, parent_count)
            if not parent_text:
                continue

            parent_id = f"{mode}_parent_{file_uuid}_{scene['start_time']:.0f}_{scene['end_time']:.0f}"
            parent_embedding = embed_text(parent_text) if do_embed else None
            parent_content = json.dumps({"summary": parent_text, "mode": mode, "type": "parent",
                                         "source_versions": CURRENT_VERSIONS})
            _upsert_chunk(
                cur,
                (parent_id, parent_id, file_uuid, parent_type, next_index,
                 scene["start_time"], scene["end_time"],
                 parent_content, parent_text, None),
                parent_embedding,
                refresh_parent_link=False,
            )
            next_index += 1
            parent_count += 1

            for child in scene["children"]:
                child_id = child["chunk_id"]
                if mode == "story":
                    child_text = generate_story_child_summary(child, parent_text)
                else:
                    child_text = generate_llm_child_summary(child, parent_text)

                child_embedding = embed_text(child_text) if do_embed else None
                child_content = json.dumps({"speaker": child["speaker_name"], "text": child["text"], "mode": mode,
                                            "speaker_confidence": child.get("speaker_confidence", 0),
                                            "source_versions": CURRENT_VERSIONS})
                # An existing child row is re-pointed at this parent on conflict.
                _upsert_chunk(
                    cur,
                    (child_id, child_id, file_uuid, child_type, next_index,
                     child["start"], child["end"],
                     child_content, child_text, parent_id),
                    child_embedding,
                    refresh_parent_link=True,
                )
                next_index += 1
                child_count += 1

        conn.commit()
    finally:
        # Close the cursor even when a statement fails.
        cur.close()
    return parent_count, child_count
|
||||
|
||||
|
||||
def main():
    """Command-line entry point of the story processor.

    Loads the ASR/ASRX/CUT outputs for ``--file-uuid``, groups them into
    scenes, stores the summaries in the DB unless ``--no-db`` is given,
    and writes the scenes to ``<OUTPUT_DIR>/<file_uuid>.story_<mode>.json``.
    """
    parser = argparse.ArgumentParser(description="Story Processor V2.0")
    parser.add_argument("--file-uuid", required=True)
    parser.add_argument("--mode", choices=["story", "llm"], default="story")
    parser.add_argument("--max-scenes", type=int, default=99999)
    parser.add_argument("--embed", action="store_true", help="Generate embeddings (Ollama)")
    parser.add_argument("--no-db", action="store_true", help="Skip DB storage")
    args = parser.parse_args()

    file_uuid = args.file_uuid
    print(f"[STORY] Mode: {args.mode}, Embed: {args.embed}")

    data = load_data(file_uuid)
    if not data["asr"]:
        print("[STORY] ❌ No ASR data")
        return

    scenes = build_child_chunks(data, file_uuid)[:args.max_scenes]
    total_children = sum(s["child_count"] for s in scenes)
    print(f"[STORY] {len(scenes)} scenes, {total_children} child chunks")

    if not args.no_db:
        conn = psycopg2.connect(DB_URL)
        try:
            pc, cc = store_chunks(file_uuid, scenes, args.mode, args.embed, conn)
            print(f"[STORY] DB: {pc} parent, {cc} child chunks ({args.mode})")
        finally:
            conn.close()

    # Save JSON output
    out_path = os.path.join(OUTPUT_DIR, f"{file_uuid}.story_{args.mode}.json")
    out_data = {"file_uuid": file_uuid, "mode": args.mode, "scenes": scenes}
    # ensure_ascii=False writes non-ASCII text verbatim, so the encoding is
    # pinned instead of inherited from the locale.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(out_data, f, indent=2, ensure_ascii=False, default=str)
    print(f"[STORY] ✅ {out_path}")


if __name__ == "__main__":
    main()
|
||||
@@ -1 +0,0 @@
|
||||
../v1.1/scripts/parent_chunk_5w1h_v1.11.py
|
||||
@@ -1,320 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Rebuild story chunk text_content and regenerates summaries using new ASRX speaker assignments.
|
||||
Then updates Qdrant momentry_dev_stories and sentence_story/sentence_summary collections.
|
||||
"""
|
||||
|
||||
import json, sys, time, urllib.request
|
||||
from urllib.request import Request, urlopen
|
||||
import psycopg2
|
||||
|
||||
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
|
||||
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
|
||||
QDRANT_URL = "http://localhost:6333"
|
||||
LLM_URL = "http://localhost:8082/v1/chat/completions"
|
||||
EMBED_URL = "http://localhost:11436/v1/embeddings"
|
||||
|
||||
def call_llm(dialogue_text):
    """Ask the chat-completions endpoint at LLM_URL for a 50-word summary.

    Args:
        dialogue_text: Dialogue to summarize; embedded verbatim in the prompt.

    Returns:
        The stripped summary text, or "" when the request or parsing fails.
    """
    prompt = f"Dialogue:\n{dialogue_text}\n\n50-word summary:"
    body = json.dumps({"model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
                       "messages": [{"role": "user", "content": prompt}],
                       "temperature": 0.1, "max_tokens": 100}).encode()
    req = Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        # The context manager closes the HTTP response on every path.
        with urlopen(req, timeout=120) as resp:
            reply = json.loads(resp.read())
        return reply["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f" LLM error: {e}")
        return ""
|
||||
|
||||
def call_embed(text):
    """Request an embedding for ``text`` from the endpoint at EMBED_URL.

    Returns:
        The embedding of the first returned item. On any failure a
        768-element zero vector is returned instead of raising.
    """
    body = json.dumps({"input": text}).encode()
    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        # The context manager closes the HTTP response on every path.
        with urlopen(req, timeout=30) as resp:
            reply = json.loads(resp.read())
        return reply["data"][0]["embedding"]
    except Exception as e:
        print(f" Embed error: {e}")
        return [0.0] * 768
|
||||
|
||||
print("=== Step 1: Load sentence chunks with new speaker info ===")
|
||||
conn = psycopg2.connect(DB_URL)
|
||||
cur = conn.cursor()
|
||||
|
||||
cur.execute("""
|
||||
SELECT chunk_index, text_content, metadata->>'new_speaker_name',
|
||||
metadata->>'speaker_name', content
|
||||
FROM dev.chunks
|
||||
WHERE file_uuid = %s AND chunk_type = 'sentence'
|
||||
ORDER BY chunk_index
|
||||
""", (UUID,))
|
||||
sentence_rows = cur.fetchall()
|
||||
print(f"Loaded {len(sentence_rows)} sentence chunks")
|
||||
|
||||
# Build lookup
|
||||
sentences = {}
|
||||
for r in sentence_rows:
|
||||
idx, old_text, new_name, old_name, content = r
|
||||
sentences[idx] = {
|
||||
"old_text": old_text or "",
|
||||
"new_name": new_name or old_name or "Unknown",
|
||||
"old_name": old_name or "Unknown",
|
||||
"content": content or {},
|
||||
}
|
||||
|
||||
# Rebuild sentence text_content with new speaker names
|
||||
print("\n=== Step 2: Rebuild sentence text_content ===")
|
||||
updated_sentences = 0
|
||||
for r in sentence_rows:
|
||||
idx, old_text, new_name, old_name, content = r
|
||||
new_name = new_name or old_name or "Unknown"
|
||||
|
||||
# Extract the text part (remove old speaker prefix if exists)
|
||||
raw_text = ""
|
||||
if content and isinstance(content, dict):
|
||||
raw_text = content.get("data", {}).get("text", "")
|
||||
if not raw_text and old_text:
|
||||
# Parse old format: [Speaker] text
|
||||
import re
|
||||
m = re.search(r'\]\s*(.*)', old_text)
|
||||
if m:
|
||||
raw_text = m.group(1)
|
||||
else:
|
||||
raw_text = old_text
|
||||
|
||||
new_text = f"[{new_name}] {raw_text}"
|
||||
|
||||
cur.execute("""
|
||||
UPDATE dev.chunks
|
||||
SET text_content = %s, updated_at = NOW()
|
||||
WHERE file_uuid = %s AND chunk_type = 'sentence' AND chunk_index = %s
|
||||
""", (new_text, UUID, idx))
|
||||
updated_sentences += 1
|
||||
|
||||
conn.commit()
|
||||
print(f"Updated {updated_sentences} sentence chunks text_content")
|
||||
|
||||
print("\n=== Step 3: Rebuild story chunk text_content ===")
|
||||
cur.execute("""
|
||||
SELECT id, chunk_id, chunk_index, child_chunk_ids, start_time, end_time,
|
||||
text_content, summary_text
|
||||
FROM dev.chunks
|
||||
WHERE file_uuid = %s AND chunk_type = 'story'
|
||||
ORDER BY chunk_index
|
||||
""", (UUID,))
|
||||
story_rows = cur.fetchall()
|
||||
print(f"Loaded {len(story_rows)} story chunks")
|
||||
|
||||
# Build child text per story chunk
|
||||
story_dialogue_texts = []
|
||||
for r in story_rows:
|
||||
db_id, cid, idx, child_ids, st, et, old_text, old_summary = r
|
||||
|
||||
dialogue_parts = []
|
||||
for child_cid in (child_ids or []):
|
||||
parts = child_cid.split("_")
|
||||
child_idx = int(parts[-1])
|
||||
if child_idx in sentences:
|
||||
s = sentences[child_idx]
|
||||
raw = ""
|
||||
if s["content"] and isinstance(s["content"], dict):
|
||||
raw = s["content"].get("data", {}).get("text", "")
|
||||
if not raw:
|
||||
import re
|
||||
m = re.search(r'\]\s*(.*)', s["old_text"])
|
||||
if m:
|
||||
raw = m.group(1)
|
||||
else:
|
||||
raw = s["old_text"]
|
||||
if raw:
|
||||
dialogue_parts.append(f'({s["new_name"]}) {raw}')
|
||||
|
||||
dialogue_text = " ".join(dialogue_parts)
|
||||
story_dialogue_texts.append((db_id, cid, idx, st, et, dialogue_text, old_summary))
|
||||
|
||||
print(f"Built {len(story_dialogue_texts)} story dialogue texts")
|
||||
|
||||
# Update DB with new text_content (dialogue only, not summary yet)
|
||||
for item in story_dialogue_texts:
|
||||
db_id, cid, idx, st, et, dialogue_text, old_summary = item
|
||||
cur.execute("""
|
||||
UPDATE dev.chunks
|
||||
SET text_content = %s, updated_at = NOW()
|
||||
WHERE id = %s
|
||||
""", (dialogue_text, db_id))
|
||||
|
||||
conn.commit()
|
||||
print("Updated story chunk dialogue texts")
|
||||
|
||||
print("\n=== Step 4: Generate LLM summaries (all 228 stories) ===")
|
||||
summaries = []
|
||||
for i, item in enumerate(story_dialogue_texts):
|
||||
db_id, cid, idx, st, et, dialogue_text, old_summary = item
|
||||
|
||||
if len(dialogue_text) < 10:
|
||||
summary = "[no dialogue]"
|
||||
embedding = [0.0] * 768
|
||||
else:
|
||||
print(f" [{i+1}/{len(story_dialogue_texts)}] {cid}: {len(dialogue_text)} chars", end="")
|
||||
try:
|
||||
summary = call_llm(dialogue_text[:3000])
|
||||
print(f" -> {len(summary)} chars")
|
||||
time.sleep(0.3)
|
||||
embedding = call_embed(summary)
|
||||
except Exception as e:
|
||||
print(f" ERROR: {e}")
|
||||
summary = "[error]"
|
||||
embedding = [0.0] * 768
|
||||
|
||||
# Update DB
|
||||
s_esc = summary.replace("'", "''")
|
||||
cur.execute(f"""
|
||||
UPDATE dev.chunks
|
||||
SET summary_text = '{s_esc}', updated_at = NOW()
|
||||
WHERE id = {db_id}
|
||||
""")
|
||||
|
||||
summaries.append({
|
||||
"db_id": db_id,
|
||||
"chunk_id": cid,
|
||||
"chunk_index": idx,
|
||||
"start_time": st,
|
||||
"end_time": et,
|
||||
"dialogue": dialogue_text,
|
||||
"summary": summary,
|
||||
"embedding": embedding,
|
||||
})
|
||||
|
||||
conn.commit()
|
||||
print(f"\nGenerated {len(summaries)} summaries")
|
||||
|
||||
print("\n=== Step 5: Rebuild Qdrant momentry_dev_stories ===")
|
||||
# Delete existing
|
||||
req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories", method="DELETE")
|
||||
try:
|
||||
urlopen(req)
|
||||
time.sleep(0.3)
|
||||
except:
|
||||
pass
|
||||
|
||||
# Recreate
|
||||
req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories",
|
||||
data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
|
||||
headers={"Content-Type": "application/json"}, method="PUT")
|
||||
urlopen(req)
|
||||
time.sleep(0.3)
|
||||
|
||||
# Upload dialogue points (0..227) and summary points (228..455)
|
||||
dialogue_points = []
|
||||
summary_points = []
|
||||
for s in summaries:
|
||||
idx = s["chunk_index"]
|
||||
dialogue_points.append({
|
||||
"id": idx + 1,
|
||||
"vector": [0.0] * 768,
|
||||
"payload": {
|
||||
"chunk_id": s["chunk_id"],
|
||||
"file_uuid": UUID,
|
||||
"start_time": s["start_time"],
|
||||
"end_time": s["end_time"],
|
||||
"type": "story_dialogue",
|
||||
"text": s["dialogue"][:500],
|
||||
}
|
||||
})
|
||||
summary_points.append({
|
||||
"id": idx + 1 + 228,
|
||||
"vector": s["embedding"],
|
||||
"payload": {
|
||||
"chunk_id": s["chunk_id"],
|
||||
"file_uuid": UUID,
|
||||
"start_time": s["start_time"],
|
||||
"end_time": s["end_time"],
|
||||
"type": "story_summary",
|
||||
"summary": s["summary"],
|
||||
}
|
||||
})
|
||||
|
||||
all_story_points = dialogue_points + summary_points
|
||||
|
||||
batch_size = 100
|
||||
for start in range(0, len(all_story_points), batch_size):
|
||||
batch = all_story_points[start:start+batch_size]
|
||||
req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories/points?wait=true",
|
||||
data=json.dumps({"points": batch}).encode(),
|
||||
headers={"Content-Type": "application/json"}, method="PUT")
|
||||
try:
|
||||
urlopen(req)
|
||||
except Exception as e:
|
||||
print(f" Batch {start}: {e}")
|
||||
if (start // batch_size) % 3 == 0:
|
||||
print(f" Uploaded {start + len(batch)}/{len(all_story_points)}")
|
||||
|
||||
print(f"Uploaded {len(all_story_points)} points to momentry_dev_stories")
|
||||
|
||||
print("\n=== Step 6: Populate sentence_story and sentence_summary ===")
|
||||
# These are the per-sentence template + summary collections
|
||||
# sentence_story: 3417 points, 768D, template payloads
|
||||
# sentence_summary: 3417 points, 768D, LLM summary payloads
|
||||
|
||||
for col_name in ["sentence_story", "sentence_summary"]:
|
||||
req = Request(f"{QDRANT_URL}/collections/{col_name}", method="DELETE")
|
||||
try:
|
||||
urlopen(req)
|
||||
time.sleep(0.2)
|
||||
except:
|
||||
pass
|
||||
|
||||
req = Request(f"{QDRANT_URL}/collections/{col_name}",
|
||||
data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
|
||||
headers={"Content-Type": "application/json"}, method="PUT")
|
||||
urlopen(req)
|
||||
time.sleep(0.2)
|
||||
|
||||
# Build points for sentence_story and sentence_summary
|
||||
story_sentence_points = []
|
||||
summary_sentence_points = []
|
||||
for idx in sorted(sentences.keys()):
|
||||
s = sentences[idx]
|
||||
raw_text = ""
|
||||
if s["content"] and isinstance(s["content"], dict):
|
||||
raw_text = s["content"].get("data", {}).get("text", "")
|
||||
|
||||
dialog_line = f'({s["new_name"]}) {raw_text}'
|
||||
|
||||
story_sentence_points.append({
|
||||
"id": idx + 1,
|
||||
"vector": [0.0] * 768,
|
||||
"payload": {
|
||||
"chunk_id": f"{UUID}_{idx}",
|
||||
"file_uuid": UUID,
|
||||
"start_time": 0,
|
||||
"end_time": 0,
|
||||
"text": dialog_line,
|
||||
"speaker_name": s["new_name"],
|
||||
"chunk_type": "sentence",
|
||||
}
|
||||
})
|
||||
|
||||
# Upload sentence_story (dialogue template)
|
||||
batch_size = 200
|
||||
for start in range(0, len(story_sentence_points), batch_size):
|
||||
batch = story_sentence_points[start:start+batch_size]
|
||||
req = Request(f"{QDRANT_URL}/collections/sentence_story/points?wait=true",
|
||||
data=json.dumps({"points": batch}).encode(),
|
||||
headers={"Content-Type": "application/json"}, method="PUT")
|
||||
try:
|
||||
urlopen(req)
|
||||
except Exception as e:
|
||||
print(f" sentence_story batch {start}: {e}")
|
||||
if (start // batch_size) % 5 == 0:
|
||||
print(f" Uploaded {start + len(batch)}/3417 sentence_story")
|
||||
|
||||
print("Uploaded sentence_story points")
|
||||
|
||||
# sentence_summary will be populated when we generate per-sentence summaries
|
||||
# For now, mark as TODO
|
||||
print("sentence_summary: SKIPPED (needs per-sentence LLM summaries)")
|
||||
|
||||
cur.close()
|
||||
conn.close()
|
||||
print("\n=== Done ===")
|
||||
@@ -1 +0,0 @@
|
||||
../v1.1/scripts/rebuild_story_content_v1.11.py
|
||||
@@ -1,197 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Regenerate parent chunk summaries using 5W1H multi-dimensional structure via gemma4.
|
||||
|
||||
5W1H Structure:
|
||||
- Who: Main characters/people involved
|
||||
- What: Key actions/events
|
||||
- When: Temporal context (sequence in story)
|
||||
- Where: Location/setting
|
||||
- Why: Motivation/conflict driving the scene
|
||||
- How: Emotional tone/manner of events
|
||||
"""
|
||||
|
||||
import json
|
||||
import requests
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
|
||||
DB_CONFIG = {"host": "localhost", "user": "accusys", "dbname": "momentry"}
|
||||
UUID = "384b0ff44aaaa1f1"
|
||||
LLAMA_URL = "http://127.0.0.1:8081/v1/chat/completions"
|
||||
|
||||
|
||||
def get_parent_with_children():
    """Fetch every parent chunk of ``UUID`` together with its child chunk texts.

    Returns:
        A list of dict-like rows (``RealDictCursor``), ordered by
        ``scene_order``. Each row carries the selected ``parent_chunks``
        columns (``summary_text`` is exposed as ``old_summary``) plus
        ``child_texts``: the ``text_content`` values of the joined ``chunks``
        rows, aggregated in ``start_time`` order. Because the join is a
        LEFT JOIN, a parent without children still produces a row.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        try:
            cur.execute(
                """
                SELECT pc.id, pc.scene_order, pc.start_time, pc.end_time,
                pc.start_frame, pc.end_frame, pc.fps, pc.summary_text as old_summary,
                pc.metadata,
                ARRAY_AGG(c.text_content ORDER BY c.start_time) as child_texts
                FROM parent_chunks pc
                LEFT JOIN chunks c ON c.parent_chunk_id = pc.id::varchar
                WHERE pc.uuid = %s
                GROUP BY pc.id, pc.scene_order, pc.start_time, pc.end_time,
                pc.start_frame, pc.end_frame, pc.fps, pc.summary_text, pc.metadata
                ORDER BY pc.scene_order
                """,
                (UUID,),
            )
            return cur.fetchall()
        finally:
            # Release the cursor even when the query fails.
            cur.close()
    finally:
        # Always return the connection, otherwise a failed query leaks it.
        conn.close()
|
||||
|
||||
|
||||
def call_gemma4(prompt, max_tokens=1500):
    """Send *prompt* to the llama-server chat-completions endpoint.

    Args:
        prompt: User message text sent as the only chat message.
        max_tokens: Upper bound on generated tokens forwarded to the server.

    Returns:
        The stripped ``content`` of the first returned choice, or ``""`` when
        the request fails, the server answers with a non-200 status, or the
        response carries no content. Failures are reported on stdout and never
        raised, so callers can treat an empty string as "no answer".
    """
    payload = {
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.3,
        "min_p": 0.1,
    }
    try:
        resp = requests.post(LLAMA_URL, json=payload, timeout=180)
        if resp.status_code != 200:
            # Previously dropped silently; say why no answer came back.
            print(f" ⚠️ llama-server returned HTTP {resp.status_code}")
            return ""
        # Tolerate an empty "choices" list and a null "content" field.
        choices = resp.json().get("choices") or [{}]
        content = choices[0].get("message", {}).get("content") or ""
        return content.strip()
    except Exception as e:
        # Deliberate best-effort: any transport/decoding problem yields "".
        print(f" ⚠️ llama-server error: {e}")
    return ""
|
||||
|
||||
|
||||
def generate_5w1h_summary(parent, scene_num, total_scenes=17):
    """Generate a 5W1H structured summary for one parent chunk via gemma4.

    Args:
        parent: Mapping with ``child_texts`` (list of dialogue strings, may be
            ``None`` or contain empty entries), ``start_time`` and ``end_time``.
        scene_num: Scene number shown to the model in the prompt.
        total_scenes: Total number of scenes shown in the prompt. Defaults to
            17, the value that used to be hard-coded.

    Returns:
        The parsed JSON object from the model's reply, or ``None`` when there
        is no dialogue, the model returns nothing, or no valid JSON object can
        be extracted from the reply.
    """
    texts = [t for t in (parent["child_texts"] or []) if t]
    if not texts:
        return None

    # Use only first 3 and last 3 dialogue lines for context (much faster)
    if len(texts) > 6:
        sample_texts = texts[:3] + ["..."] + texts[-3:]
    else:
        sample_texts = texts
    combined = "\n".join(sample_texts)[:1500]
    duration = parent["end_time"] - parent["start_time"]

    prompt = f"""You are a film scene analyst. Analyze this scene and provide 5W1H analysis.

Scene {scene_num}/{total_scenes} | {duration:.0f}s | {len(texts)} dialogue lines

Key dialogue:
{combined}

Respond with ONLY this JSON:
{{"summary_5lines":"...","who":"...","what":"...","when":"...","where":"...","why":"...","how":"...","characters":[],"tone":[],"key_events":[]}}
IMPORTANT: "summary_5lines" must be EXACTLY 5 lines describing the scene. Each line should be a complete sentence separated by \\n."""

    response = call_gemma4(prompt, max_tokens=2000)
    if not response:
        return None

    # Simple JSON extraction: take everything from the first "{" to the last "}"
    start = response.find("{")
    end = response.rfind("}") + 1
    if start < 0 or end <= start:
        return None
    try:
        return json.loads(response[start:end])
    except ValueError:
        # json.JSONDecodeError subclasses ValueError; malformed reply -> no analysis
        return None
|
||||
|
||||
|
||||
def update_parent_chunk(parent, analysis):
    """Persist a 5W1H analysis onto one ``parent_chunks`` row.

    Args:
        parent: Row mapping providing ``id``, ``metadata`` (dict or falsy) and
            ``child_texts``.
        analysis: Parsed model output; missing keys fall back to empty
            strings/lists.

    Returns:
        ``False`` when *analysis* is empty (nothing is written), ``True`` after
        the row has been updated and committed.
    """
    if not analysis:
        return False

    # Create structured summary text (5 lines)
    structured_text = f"{analysis.get('summary_5lines', '')}"

    # Work on a copy so the caller's row is not modified as a side effect.
    metadata = dict(parent["metadata"]) if parent["metadata"] else {}
    metadata["auto_generated_by"] = "gemma4"
    metadata["chunk_count"] = len(parent["child_texts"] or [])
    metadata["structured_summary"] = {
        "summary_5lines": analysis.get("summary_5lines", ""),
        "who": analysis.get("who", ""),
        "what": analysis.get("what", ""),
        "when": analysis.get("when", ""),
        "where": analysis.get("where", ""),
        "why": analysis.get("why", ""),
        "how": analysis.get("how", ""),
        "characters": analysis.get("characters", []),
        "tone": analysis.get("tone", []),
        "key_events": analysis.get("key_events", []),
    }

    conn = psycopg2.connect(**DB_CONFIG)
    try:
        cur = conn.cursor()
        try:
            cur.execute(
                """
                UPDATE parent_chunks
                SET summary_text = %s,
                metadata = %s::jsonb
                WHERE id = %s
                """,
                (structured_text, json.dumps(metadata, ensure_ascii=False), parent["id"]),
            )
            conn.commit()
        finally:
            cur.close()
    finally:
        # Close the connection even when the UPDATE fails.
        conn.close()
    return True
|
||||
|
||||
|
||||
def _preview(value, limit):
    """Return *value* as text cut to *limit* characters for log output."""
    text = value if isinstance(value, str) else str(value)
    return text[:limit]


def main():
    """Regenerate and store 5W1H summaries for every parent chunk of ``UUID``.

    Loads the parent chunks, asks the model for an analysis of each scene,
    prints a short preview, writes successful analyses back to the database
    and finally reports how many rows were updated.
    """
    print(f"🎬 Regenerating 5W1H summaries for {UUID}")
    print(f" Using llama.cpp server at {LLAMA_URL}")
    print("=" * 70)

    parents = get_parent_with_children()
    print(f"📥 Found {len(parents)} parent chunks")

    success_count = 0
    for parent in parents:
        duration = parent["end_time"] - parent["start_time"]
        text_count = len(parent["child_texts"] or [])
        print(
            f"\n🎬 Scene {parent['scene_order']}: {parent['start_time']:.0f}s-{parent['end_time']:.0f}s ({duration:.0f}s, {text_count} chunks)"
        )
        if parent["old_summary"]:
            print(f" Old: {parent['old_summary'][:80]}...")

        analysis = generate_5w1h_summary(parent, parent["scene_order"])

        if not analysis:
            print(" ❌ Failed to generate analysis")
            continue

        # The model's JSON values are not guaranteed to be strings (they may be
        # null or lists), so previews go through _preview instead of slicing.
        print(f" ✅ Summary: {_preview(analysis.get('summary_5lines', 'N/A'), 100)}...")
        print(f" 👤 Who: {_preview(analysis.get('who', 'N/A'), 60)}")
        print(f" 📍 Where: {_preview(analysis.get('where', 'N/A'), 60)}")
        print(f" 💡 Why: {_preview(analysis.get('why', 'N/A'), 60)}")

        if update_parent_chunk(parent, analysis):
            success_count += 1

    print(f"\n{'=' * 70}")
    print(
        f"✅ Updated {success_count}/{len(parents)} parent chunks with 5W1H summaries"
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1 +0,0 @@
|
||||
../v1.1/scripts/regenerate_parent_5w1h_v1.11.py
|
||||
@@ -39,140 +39,8 @@ def get_conn():
|
||||
|
||||
|
||||
def merge_traces_within_cuts(face_data: dict, cut_scenes: list) -> dict:
|
||||
"""Merge traces within the same cut if they have similar embeddings (same person re-appeared)."""
|
||||
frames = face_data.get("frames", {})
|
||||
if not frames:
|
||||
return face_data
|
||||
|
||||
# Map each frame to its scene/cut number
|
||||
frame_to_scene = {}
|
||||
for s in cut_scenes:
|
||||
for f in range(s["start_frame"], s["end_frame"] + 1):
|
||||
frame_to_scene[f] = s["scene_number"]
|
||||
|
||||
# Collect per-trace data: scene numbers, embeddings, face positions
|
||||
trace_frames = defaultdict(list)
|
||||
trace_embeddings = defaultdict(list)
|
||||
trace_poses = {}
|
||||
|
||||
for fnum_str, frm_data in frames.items():
|
||||
fnum = int(fnum_str)
|
||||
for face in frm_data.get("faces", []):
|
||||
tid = face.get("trace_id")
|
||||
if tid is None:
|
||||
continue
|
||||
trace_frames[tid].append(fnum)
|
||||
emb = face.get("embedding")
|
||||
if emb is not None:
|
||||
trace_embeddings[tid].append(emb)
|
||||
if tid not in trace_poses:
|
||||
trace_poses[tid] = (
|
||||
face.get("x", 0),
|
||||
face.get("y", 0),
|
||||
face.get("width", 0),
|
||||
face.get("height", 0),
|
||||
)
|
||||
|
||||
if len(trace_embeddings) < 2:
|
||||
return face_data
|
||||
|
||||
# Compute centroid per trace
|
||||
trace_centroids = {}
|
||||
for tid, embs in trace_embeddings.items():
|
||||
centroid = np.mean(embs, axis=0)
|
||||
norm = np.linalg.norm(centroid)
|
||||
trace_centroids[tid] = centroid / norm if norm > 0 else centroid
|
||||
|
||||
# Determine which scene each trace belongs to (majority of frames)
|
||||
trace_scene = {}
|
||||
for tid, fns in trace_frames.items():
|
||||
scene_votes = defaultdict(int)
|
||||
for fn in fns:
|
||||
scene = frame_to_scene.get(fn, -1)
|
||||
scene_votes[scene] += 1
|
||||
trace_scene[tid] = max(scene_votes, key=scene_votes.get) if scene_votes else -1
|
||||
|
||||
# Within each scene, merge traces with similar centroids
|
||||
scene_traces = defaultdict(list)
|
||||
for tid, scene in trace_scene.items():
|
||||
if scene >= 0 and tid in trace_centroids:
|
||||
scene_traces[scene].append(tid)
|
||||
|
||||
merged = 0
|
||||
next_new_id = max(trace_frames.keys()) + 1 if trace_frames else 0
|
||||
SIMILARITY_THRESHOLD = 0.75
|
||||
|
||||
for scene, tids in scene_traces.items():
|
||||
if len(tids) < 2:
|
||||
continue
|
||||
used = set()
|
||||
for i in range(len(tids)):
|
||||
if tids[i] in used:
|
||||
continue
|
||||
keep_tid = tids[i]
|
||||
for j in range(i + 1, len(tids)):
|
||||
if tids[j] in used:
|
||||
continue
|
||||
sim = float(np.dot(trace_centroids[tids[i]], trace_centroids[tids[j]]))
|
||||
if sim >= SIMILARITY_THRESHOLD:
|
||||
# Merge tids[j] into keep_tid
|
||||
for fnum_str, frm_data in frames.items():
|
||||
for face in frm_data.get("faces", []):
|
||||
if face.get("trace_id") == tids[j]:
|
||||
face["trace_id"] = keep_tid
|
||||
used.add(tids[j])
|
||||
merged += 1
|
||||
|
||||
# If any merges happened, rebuild trace metadata
|
||||
if merged > 0:
|
||||
# Rebuild traces dict
|
||||
new_traces = {}
|
||||
new_trace_frames = defaultdict(list)
|
||||
for fnum_str, frm_data in frames.items():
|
||||
fnum = int(fnum_str)
|
||||
for face in frm_data.get("faces", []):
|
||||
tid = face.get("trace_id")
|
||||
if tid is not None:
|
||||
new_trace_frames[tid].append(
|
||||
{
|
||||
"frame": fnum,
|
||||
"face_index": 0,
|
||||
"bbox": {
|
||||
"x": face.get("x", 0),
|
||||
"y": face.get("y", 0),
|
||||
"width": face.get("width", 0),
|
||||
"height": face.get("height", 0),
|
||||
},
|
||||
"confidence": face.get("confidence", 0.0),
|
||||
}
|
||||
)
|
||||
|
||||
for tid, path in new_trace_frames.items():
|
||||
if len(path) >= 1:
|
||||
frames_sorted = sorted(set(p["frame"] for p in path))
|
||||
new_traces[str(tid)] = {
|
||||
"trace_id": tid,
|
||||
"start_frame": frames_sorted[0],
|
||||
"end_frame": frames_sorted[-1],
|
||||
"duration_frames": frames_sorted[-1] - frames_sorted[0] + 1,
|
||||
"duration_seconds": (frames_sorted[-1] - frames_sorted[0])
|
||||
/ face_data.get("metadata", {}).get("fps", 25.0),
|
||||
"total_appearances": len(path),
|
||||
"path": path,
|
||||
}
|
||||
|
||||
face_data["traces"] = new_traces
|
||||
face_data["metadata"]["trace_stats"] = {
|
||||
"total_traces": len(new_traces),
|
||||
"active_traces": len(new_traces),
|
||||
"long_traces": len(
|
||||
[t for t in new_traces.values() if t["duration_frames"] >= 2]
|
||||
),
|
||||
}
|
||||
print(
|
||||
f"[TRACE] Post-merge: {merged} traces merged, {len(new_traces)} total traces"
|
||||
)
|
||||
|
||||
"""Merge traces within the same cut - DISABLED (no embeddings)."""
|
||||
# TODO: Reimplement with Qdrant _faces collection
|
||||
return face_data
|
||||
|
||||
|
||||
@@ -235,57 +103,12 @@ def run_face_tracker(
|
||||
|
||||
print(f"[TRACE] Processing {len(face_data.get('frames', {}))} frames")
|
||||
|
||||
# Load embeddings from DB for the face tracker
|
||||
# Embeddings no longer loaded from DB - use IoU-only tracking
|
||||
file_uuid = (
|
||||
face_json_path.split("/")[-1]
|
||||
.replace(".face.json", "")
|
||||
.replace("_traced.json", "")
|
||||
)
|
||||
try:
|
||||
conn = get_conn()
|
||||
cur = conn.cursor()
|
||||
cur.execute(
|
||||
f"""
|
||||
SELECT frame_number, x, y, width, height, embedding
|
||||
FROM {SCHEMA}.face_detections
|
||||
WHERE file_uuid = %s AND embedding IS NOT NULL
|
||||
""",
|
||||
(file_uuid,),
|
||||
)
|
||||
emb_rows = cur.fetchall()
|
||||
conn.close()
|
||||
# Build lookup: frame_number → list of (bbox, embedding)
|
||||
emb_map = {}
|
||||
for fn, x, y, w, h, emb in emb_rows:
|
||||
emb_map.setdefault(fn, []).append(((x, y, w, h), emb))
|
||||
print(f"[TRACE] Loaded {len(emb_rows)} embeddings from DB")
|
||||
|
||||
# Attach embeddings to face data
|
||||
attached = 0
|
||||
for fnum_str, frm_data in face_data.get("frames", {}).items():
|
||||
fnum = int(fnum_str)
|
||||
for face in frm_data.get("faces", []):
|
||||
x, y, w, h = (
|
||||
face.get("x", 0),
|
||||
face.get("y", 0),
|
||||
face.get("width", 0),
|
||||
face.get("height", 0),
|
||||
)
|
||||
candidates = emb_map.get(fnum, [])
|
||||
# Find matching embedding by bbox proximity
|
||||
for (ex, ey, ew, eh), emb in candidates:
|
||||
if (
|
||||
abs(x - ex) < 10
|
||||
and abs(y - ey) < 10
|
||||
and abs(w - ew) < 10
|
||||
and abs(h - eh) < 10
|
||||
):
|
||||
face["embedding"] = emb
|
||||
attached += 1
|
||||
break
|
||||
print(f"[TRACE] Attached {attached} embeddings to faces")
|
||||
except Exception as e:
|
||||
print(f"[TRACE] WARNING: Could not load embeddings: {e}")
|
||||
|
||||
# Load cut boundaries from cut.json (same directory as face.json)
|
||||
cut_boundaries = None
|
||||
@@ -301,7 +124,7 @@ def run_face_tracker(
|
||||
print(f"[TRACE] Loaded {len(cut_boundaries)} cut boundaries")
|
||||
|
||||
face_data = track_faces(
|
||||
face_data, use_embedding=True, cut_boundaries=cut_boundaries
|
||||
face_data, use_embedding=False, cut_boundaries=cut_boundaries
|
||||
)
|
||||
|
||||
# Merge traces within same cut (same person re-appearing after occlusion/pose change)
|
||||
@@ -309,7 +132,7 @@ def run_face_tracker(
|
||||
face_data = merge_traces_within_cuts(face_data, cut_scenes)
|
||||
|
||||
metadata = face_data.get("metadata", {})
|
||||
metadata["tracking_method"] = "iou_embedding"
|
||||
metadata["tracking_method"] = "iou_only"
|
||||
metadata["tracked_at"] = datetime.now().isoformat()
|
||||
face_data["metadata"] = metadata
|
||||
|
||||
@@ -350,22 +173,19 @@ def store_traced_faces(file_uuid: str, traced_json_path: str, schema: str = SCHE
|
||||
if face_id is None:
|
||||
face_id = f"face_{trace_id}"
|
||||
attributes = face.get("attributes")
|
||||
embedding = face.get("embedding")
|
||||
|
||||
bbox = json.dumps({"x": x, "y": y, "width": w, "height": h})
|
||||
embed_vec = embedding if embedding and len(embedding) > 0 else None
|
||||
|
||||
try:
|
||||
cur.execute(
|
||||
f"""
|
||||
UPDATE {schema}.face_detections
|
||||
SET trace_id = %s, embedding = %s, face_id = %s
|
||||
SET trace_id = %s, face_id = %s
|
||||
WHERE file_uuid = %s AND frame_number = %s
|
||||
AND x = %s AND y = %s AND width = %s AND height = %s
|
||||
""",
|
||||
(
|
||||
trace_id,
|
||||
embed_vec,
|
||||
face_id,
|
||||
file_uuid,
|
||||
frame_num,
|
||||
|
||||
@@ -1,87 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Story Embedding Pipeline:
|
||||
1. Read story chunks → LLM summary (Gemma4)
|
||||
2. Embed summary (EmbeddingGemma)
|
||||
3. Store in chunks table + Qdrant
|
||||
"""
|
||||
|
||||
import json, urllib.request, subprocess, sys, time, os
|
||||
|
||||
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
|
||||
PSQL = ["/Users/accusys/pgsql/18.3/bin/psql", "-U", "accusys", "-d", "momentry", "-t", "-A"]
|
||||
LLM_URL = "http://localhost:8082/v1/chat/completions"
|
||||
EMBED_URL = "http://localhost:11436/v1/embeddings"
|
||||
QDRANT_URL = "http://localhost:6333"
|
||||
QDRANT_COL = "momentry_dev_stories"
|
||||
|
||||
def psql(sql):
    """Run one SQL statement through the psql CLI and return its output.

    Args:
        sql: SQL text handed to ``psql -c``.

    Returns:
        psql's standard output with surrounding whitespace removed. A failing
        statement still returns whatever stdout was produced (usually an empty
        string), but the failure is reported on stderr instead of being
        silently indistinguishable from "no rows".
    """
    r = subprocess.run(PSQL + ["-c", sql], capture_output=True, text=True, timeout=30)
    if r.returncode != 0:
        print(f"psql failed (exit {r.returncode}): {r.stderr.strip()[:200]}", file=sys.stderr)
    return r.stdout.strip()
|
||||
|
||||
def call_llm(dialogue):
    """Ask the chat-completions endpoint at ``LLM_URL`` for a 50-word summary.

    Args:
        dialogue: Dialogue text embedded into the prompt.

    Returns:
        The stripped ``content`` of the first choice in the JSON response.

    Raises:
        urllib.error.URLError: If the request fails or times out (120 s).
        KeyError, IndexError: If the response lacks the expected fields.
    """
    prompt = f"Dialogue: {dialogue}\n\n50-word summary:"
    body = json.dumps({"model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
                       "messages": [{"role": "user", "content": prompt}],
                       "temperature": 0.1, "max_tokens": 100}).encode()
    req = urllib.request.Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
    # Context manager guarantees the HTTP response is closed, even on errors.
    with urllib.request.urlopen(req, timeout=120) as resp:
        reply = json.loads(resp.read())
    return reply["choices"][0]["message"]["content"].strip()
|
||||
|
||||
def call_embed(text):
    """Request an embedding vector for *text* from the service at ``EMBED_URL``.

    Args:
        text: Text sent as the ``input`` field of the request.

    Returns:
        The ``embedding`` value of the first item in the response's ``data``.

    Raises:
        urllib.error.URLError: If the request fails or times out (30 s).
        KeyError, IndexError: If the response lacks the expected fields.
    """
    body = json.dumps({"input": text}).encode()
    req = urllib.request.Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    # Context manager guarantees the HTTP response is closed, even on errors.
    with urllib.request.urlopen(req, timeout=30) as resp:
        reply = json.loads(resp.read())
    return reply["data"][0]["embedding"]
|
||||
|
||||
# Step 0: Ensure Qdrant collection exists (768 dims)
subprocess.run(["curl", "-s", "-X", "PUT", f"{QDRANT_URL}/collections/{QDRANT_COL}",
                "-H", "Content-Type: application/json",
                "-d", '{"vectors":{"size":768,"distance":"Cosine"}}'], capture_output=True)

# Step 1: Get all story chunks that need summaries
# psql is run with -t -A, so every row arrives as one '|'-separated line.
lines = [row for row in psql(f"SELECT chunk_id, chunk_index, start_time, end_time, text_content FROM dev.chunks WHERE file_uuid='{UUID}' AND chunk_type='story' AND (summary_text IS NULL OR summary_text = '') ORDER BY chunk_index").split('\n') if row.strip() and '|' in row]

print(f"Chunks to process: {len(lines)}")
total = len(lines)
errors = 0
qdrant_errors = 0

for i, line in enumerate(lines):
    # Split at most 4 times so '|' characters inside the dialogue survive.
    parts = line.split('|', 4)
    cid, idx, st, et, dialogue = parts[0].strip(), int(parts[1]), float(parts[2]), float(parts[3]), parts[4] if len(parts) > 4 else ""

    if len(dialogue) < 10:
        summary = "[no dialogue]"
        embedding = [0.0] * 768
    else:
        try:
            summary = call_llm(dialogue)
            time.sleep(0.3)
            embedding = call_embed(summary)
        except Exception as e:
            print(f"[{i+1}/{total}] Error: {cid} - {e}")
            errors += 1
            summary = "[error]"
            embedding = [0.0] * 768

    # Update DB. The SQL goes through the psql CLI, so values are embedded as
    # literals; double every single quote in both interpolated values.
    s_esc = summary.replace("'", "''")
    cid_esc = cid.replace("'", "''")
    psql(f"UPDATE dev.chunks SET summary_text='{s_esc}', updated_at=CURRENT_TIMESTAMP WHERE chunk_id='{cid_esc}'")

    # Store in Qdrant
    point = json.dumps({"points": [{"id": idx + 1, "vector": embedding,
                                    "payload": {"chunk_id": cid, "file_uuid": UUID, "start_time": st, "end_time": et,
                                                "summary": summary, "type": "story_summary"}
                                    }]}).encode()
    req = urllib.request.Request(f"{QDRANT_URL}/collections/{QDRANT_COL}/points?wait=true",
                                 data=point, headers={"Content-Type": "application/json"}, method="PUT")
    try:
        with urllib.request.urlopen(req, timeout=10):
            pass
    except Exception as e:
        # Still best-effort (the run continues), but no longer invisible, and
        # KeyboardInterrupt is no longer swallowed as a bare except did.
        qdrant_errors += 1
        print(f"[{i+1}/{total}] Qdrant upsert failed: {cid} - {e}")

    if (i+1) % 20 == 0:
        print(f"[{i+1}/{total}] {errors} errors so far")

print(f"\nDone. Processed: {total}, Errors: {errors}")
if qdrant_errors:
    print(f"Qdrant upsert failures: {qdrant_errors}")
print(f"Qdrant: {QDRANT_COL}")
|
||||
@@ -1 +0,0 @@
|
||||
../v1.1/scripts/story_embed_v1.11.py
|
||||
@@ -1,230 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Story Pipeline Full — Speaker + Story + Summary
|
||||
Step 1: Update sentence chunks with speaker name
|
||||
Step 2: Rebuild story chunks + re-embed
|
||||
Step 3: LLM summary × 228 + embed
|
||||
"""
|
||||
|
||||
import json, urllib.request, subprocess, sys, time, os
|
||||
|
||||
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
|
||||
DIR = "/Users/accusys/momentry/output_dev"
|
||||
PSQL = ["/Users/accusys/pgsql/18.3/bin/psql", "-U", "accusys", "-d", "momentry", "-t", "-A"]
|
||||
LLM_URL = "http://localhost:8082/v1/chat/completions"
|
||||
EMBED_URL = "http://localhost:11436/v1/embeddings"
|
||||
QDRANT_URL = "http://localhost:6333/collections/momentry_dev_stories/points"
|
||||
|
||||
def psql(sql):
    """Run one SQL statement through the psql CLI and return its output.

    Args:
        sql: SQL text handed to ``psql -c``.

    Returns:
        psql's standard output with surrounding whitespace removed. A failing
        statement still returns whatever stdout was produced (usually an empty
        string), but the failure is reported on stderr instead of being
        silently indistinguishable from "no rows".
    """
    r = subprocess.run(PSQL + ["-c", sql], capture_output=True, text=True, timeout=30)
    if r.returncode != 0:
        print(f"psql failed (exit {r.returncode}): {r.stderr.strip()[:200]}", file=sys.stderr)
    return r.stdout.strip()
|
||||
|
||||
def psql_file(path):
    """Execute the SQL script at *path* with psql and return its exit status.

    When psql's stderr mentions ``ERROR``, the first 200 characters of it are
    printed; the return code is handed back either way.
    """
    completed = subprocess.run(
        [*PSQL, "-f", path],
        capture_output=True,
        text=True,
        timeout=60,
    )
    err_output = completed.stderr
    if err_output and "ERROR" in err_output:
        print(f"SQL Error: {err_output[:200]}")
    return completed.returncode
|
||||
|
||||
def embed_text(text):
    """Request an embedding for *text* from the service at ``EMBED_URL``.

    Args:
        text: Text to embed; only its first 1024 characters are sent.

    Returns:
        The ``embedding`` value of the first item in the response's ``data``.

    Raises:
        urllib.error.URLError: If the request fails or times out (30 s).
        KeyError, IndexError: If the response lacks the expected fields.
    """
    body = json.dumps({"input": text[:1024]}).encode()
    req = urllib.request.Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    # Context manager guarantees the HTTP response is closed, even on errors.
    with urllib.request.urlopen(req, timeout=30) as resp:
        reply = json.loads(resp.read())
    return reply["data"][0]["embedding"]
|
||||
|
||||
def llm_summary(dialogue):
    """Ask the chat-completions endpoint at ``LLM_URL`` to summarize *dialogue*.

    Args:
        dialogue: Dialogue text embedded into the summarization prompt.

    Returns:
        The stripped ``content`` of the first choice in the JSON response.

    Raises:
        urllib.error.URLError: If the request fails or times out (120 s).
        KeyError, IndexError: If the response lacks the expected fields.
    """
    body = json.dumps({
        "model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
        "messages": [{"role": "user", "content": f"Summarize concisely:\n{dialogue}\n\n50-word summary:"}],
        "temperature": 0.1, "max_tokens": 100,
    }).encode()
    req = urllib.request.Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
    # Context manager guarantees the HTTP response is closed, even on errors.
    with urllib.request.urlopen(req, timeout=120) as resp:
        reply = json.loads(resp.read())
    return reply["choices"][0]["message"]["content"].strip()
|
||||
|
||||
fps = 25.0
|
||||
FILE_ID = 242
|
||||
|
||||
# ═══════════════════════════════════════════════════
|
||||
# Step 0: Load ASR + ASRX + speaker map
|
||||
# ═══════════════════════════════════════════════════
|
||||
print("=" * 60)
|
||||
print("Step 0: Loading data...")
|
||||
asr = json.load(open(f"{DIR}/{UUID}.asr.json"))
|
||||
segs = asr["segments"]
|
||||
asrx = json.load(open(f"{DIR}/{UUID}.asrx.json"))
|
||||
asrx_segs = asrx["segments"]
|
||||
|
||||
# Speaker map from identity_bindings
|
||||
r = psql("SELECT ib.identity_value, i.name FROM dev.identity_bindings ib JOIN dev.identities i ON i.id=ib.identity_id WHERE ib.identity_type='speaker'")
|
||||
speaker_map = {}
|
||||
for line in r.strip().split('\n'):
|
||||
if line.strip() and '|' in line:
|
||||
p = line.split('|')
|
||||
speaker_map[p[0].strip()] = p[1].strip()
|
||||
speaker_map["SPEAKER_0"] = "Speaker_0" # Fallback for unbounded
|
||||
|
||||
# ═══════════════════════════════════════════════════
|
||||
# Step 1: Update sentence chunks with speaker
|
||||
# ═══════════════════════════════════════════════════
|
||||
print("\n" + "=" * 60)
|
||||
print("Step 1: Updating sentence chunks with speaker...")
|
||||
|
||||
sql = ["BEGIN;"]
|
||||
chunk_meta = {} # idx → {speaker_id, speaker_name}
|
||||
|
||||
for idx, seg in enumerate(segs):
|
||||
st, et = seg["start"], seg["end"]
|
||||
text = seg["text"].strip()
|
||||
if not text:
|
||||
continue
|
||||
|
||||
# Find overlapping ASRX segment → speaker_id
|
||||
spk_id = "SPEAKER_0"
|
||||
for ax in asrx_segs:
|
||||
if ax.get("start_time", 0) <= st and ax.get("end_time", 0) >= et:
|
||||
spk_id = ax.get("speaker_id", "SPEAKER_0")
|
||||
break
|
||||
|
||||
spk_name = speaker_map.get(spk_id, spk_id)
|
||||
new_text = f"[{spk_name}] {text}"
|
||||
meta = json.dumps({"speaker_id": spk_id, "speaker_name": spk_name})
|
||||
esc = new_text.replace("'", "''")
|
||||
|
||||
sql.append(f"UPDATE dev.chunks SET text_content='{esc}', metadata='{meta}'::jsonb WHERE file_uuid='{UUID}' AND chunk_id='{UUID}_{idx}';")
|
||||
chunk_meta[idx] = {"speaker_id": spk_id, "speaker_name": spk_name}
|
||||
|
||||
sql.append("COMMIT;")
|
||||
with open("/tmp/s1_speaker.sql", "w") as f:
|
||||
f.write("\n".join(sql))
|
||||
|
||||
psql_file("/tmp/s1_speaker.sql")
|
||||
print(f" Updated {len(chunk_meta)} sentence chunks with speaker")
|
||||
|
||||
# ═══════════════════════════════════════════════════
|
||||
# Step 2: Rebuild story chunks + re-embed
|
||||
# ═══════════════════════════════════════════════════
|
||||
print("\n" + "=" * 60)
|
||||
print("Step 2: Rebuilding story chunks...")
|
||||
|
||||
# Delete old story chunks
|
||||
psql(f"DELETE FROM dev.chunks WHERE file_uuid='{UUID}' AND chunk_type='story';")
|
||||
|
||||
# Recreate
|
||||
CHUNK_SIZE = 15
|
||||
sql2 = ["BEGIN;"]
|
||||
story_meta = []
|
||||
|
||||
for i in range(0, len(segs), CHUNK_SIZE):
|
||||
group = segs[i:i+CHUNK_SIZE]
|
||||
st, et = group[0]["start"], group[-1]["end"]
|
||||
idx = i // CHUNK_SIZE
|
||||
chunk_id = f"{UUID}_story_{idx}"
|
||||
|
||||
# Build speaker text from individual sentences
|
||||
texts = []
|
||||
speakers_used = {}
|
||||
for j, seg in enumerate(group):
|
||||
seg_idx = i + j
|
||||
if seg_idx in chunk_meta:
|
||||
cm = chunk_meta[seg_idx]
|
||||
text = seg["text"].strip()
|
||||
if text:
|
||||
texts.append(f"[{cm['speaker_name']}] {text}")
|
||||
speakers_used[cm['speaker_name']] = speakers_used.get(cm['speaker_name'], 0) + 1
|
||||
|
||||
dialogue = " ".join(texts)
|
||||
child_ids = ", ".join([f"'{UUID}_{j}'" for j in range(i, min(i+CHUNK_SIZE, len(segs)))])
|
||||
words = sum(len(t.split()) for t in texts)
|
||||
|
||||
meta = json.dumps({"method": "fixed_15", "seg_count": len(group), "words": words, "speakers": speakers_used})
|
||||
esc = dialogue.replace("'", "''")
|
||||
|
||||
sql2.append(f"""INSERT INTO dev.chunks (file_id,file_uuid,chunk_id,old_chunk_id,chunk_index,chunk_type,start_time,end_time,fps,start_frame,end_frame,text_content,content,metadata,frame_count,child_chunk_ids)
|
||||
VALUES ({FILE_ID},'{UUID}','{chunk_id}','{chunk_id}',{idx},'story',{st},{et},{fps},{int(st*fps)},{int(et*fps)},'{esc}','{{"type":"story_parent"}}'::jsonb,'{meta}'::jsonb,{int((et-st)*fps)},ARRAY[{child_ids}]);""")
|
||||
|
||||
story_meta.append({"idx": idx, "st": st, "et": et, "dialogue": dialogue, "words": words, "speakers": speakers_used})
|
||||
|
||||
sql2.append("COMMIT;")
|
||||
with open("/tmp/s2_story.sql", "w") as f:
|
||||
f.write("\n".join(sql2))
|
||||
psql_file("/tmp/s2_story.sql")
|
||||
print(f" Created {len(story_meta)} story chunks")
|
||||
|
||||
# Embed + upsert to Qdrant
|
||||
print("\n Embedding story chunks...")
|
||||
points_dialogue = []
|
||||
for sm in story_meta:
|
||||
if len(sm["dialogue"]) < 10:
|
||||
continue
|
||||
vec = embed_text(sm["dialogue"])
|
||||
points_dialogue.append({"id": sm["idx"] + 1, "vector": vec, "payload": {
|
||||
"chunk_id": f"{UUID}_story_{sm['idx']}", "file_uuid": UUID,
|
||||
"start_time": sm["st"], "end_time": sm["et"], "type": "story_dialogue"
|
||||
}})
|
||||
|
||||
for i in range(0, len(points_dialogue), 100):
|
||||
batch = points_dialogue[i:i+100]
|
||||
data = json.dumps({"points": batch, "wait": True}).encode()
|
||||
req = urllib.request.Request(f"{QDRANT_URL}?wait=true", data=data, headers={"Content-Type": "application/json"}, method="PUT")
|
||||
urllib.request.urlopen(req, timeout=30)
|
||||
print(f" Qdrant: {len(points_dialogue)} dialogue vectors")
|
||||
|
||||
# ═══════════════════════════════════════════════════
|
||||
# Step 3: LLM summaries + embed
|
||||
# ═══════════════════════════════════════════════════
|
||||
print("\n" + "=" * 60)
|
||||
print("Step 3: LLM summaries...")
|
||||
|
||||
points_summary = []
|
||||
summary_sql = ["BEGIN;"]
|
||||
|
||||
for i, sm in enumerate(story_meta):
|
||||
if len(sm["dialogue"]) < 10:
|
||||
continue
|
||||
|
||||
try:
|
||||
summary = llm_summary(sm["dialogue"])
|
||||
time.sleep(0.3)
|
||||
vec = embed_text(summary)
|
||||
time.sleep(0.1)
|
||||
except Exception as e:
|
||||
print(f" Error on story {sm['idx']}: {e}")
|
||||
summary = "[error]"
|
||||
vec = [0.0] * 768
|
||||
|
||||
s_esc = summary.replace("'", "''")
|
||||
summary_sql.append(f"UPDATE dev.chunks SET summary_text='{s_esc}', updated_at=CURRENT_TIMESTAMP WHERE file_uuid='{UUID}' AND chunk_id='{UUID}_story_{sm['idx']}';")
|
||||
|
||||
points_summary.append({"id": 100000 + sm["idx"] + 1, "vector": vec, "payload": {
|
||||
"chunk_id": f"{UUID}_story_{sm['idx']}", "file_uuid": UUID,
|
||||
"start_time": sm["st"], "end_time": sm["et"],
|
||||
"summary": summary, "type": "story_summary"
|
||||
}})
|
||||
|
||||
if (i + 1) % 50 == 0:
|
||||
print(f" {i+1}/{len(story_meta)}")
|
||||
|
||||
# Update DB with summaries
|
||||
summary_sql.append("COMMIT;")
|
||||
with open("/tmp/s3_summary.sql", "w") as f:
|
||||
f.write("\n".join(summary_sql))
|
||||
psql_file("/tmp/s3_summary.sql")
|
||||
|
||||
# Upsert summary vectors to Qdrant
|
||||
for i in range(0, len(points_summary), 100):
|
||||
batch = points_summary[i:i+100]
|
||||
data = json.dumps({"points": batch, "wait": True}).encode()
|
||||
req = urllib.request.Request(f"{QDRANT_URL}?wait=true", data=data, headers={"Content-Type": "application/json"}, method="PUT")
|
||||
urllib.request.urlopen(req, timeout=30)
|
||||
|
||||
print(f" Qdrant: {len(points_summary)} summary vectors")
|
||||
|
||||
# ═══════════════════════════════════════════════════
|
||||
# Step 4: Verify
|
||||
# ═══════════════════════════════════════════════════
|
||||
print("\n" + "=" * 60)
|
||||
print("Done.")
|
||||
r1 = psql(f"SELECT count(*) FROM dev.chunks WHERE file_uuid='{UUID}' AND chunk_type='sentence' AND text_content LIKE '[%'")
|
||||
r2 = psql(f"SELECT count(*) FROM dev.chunks WHERE file_uuid='{UUID}' AND chunk_type='story'")
|
||||
r3 = psql(f"SELECT count(*) FROM dev.chunks WHERE file_uuid='{UUID}' AND chunk_type='story' AND summary_text IS NOT NULL")
|
||||
print(f"Sentence chunks with speaker: {r1}")
|
||||
print(f"Story chunks: {r2}")
|
||||
print(f"Story chunks with summary: {r3}")
|
||||
@@ -1 +0,0 @@
|
||||
../v1.1/scripts/story_pipeline_full_v1.11.py
|
||||
@@ -1,325 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Story Processor - Generate parent-child chunk hierarchy for RAG
|
||||
Uses LOCAL video analysis (ASR, YOLO, OCR, Scene) to create parent chunks.
|
||||
NO cloud API calls - fully offline processing
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import os
|
||||
import argparse
|
||||
from typing import Dict, List, Any
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from redis_publisher import RedisPublisher
|
||||
|
||||
|
||||
def extract_video_metadata(video_path: str) -> Dict[str, Any]:
    """Return ffprobe's JSON description of a video, or ``{}`` on failure.

    Runs ``ffprobe -print_format json -show_format -show_streams`` on
    *video_path* and parses its standard output.

    Args:
        video_path: Path of the video file to probe.

    Returns:
        The parsed ffprobe JSON document, or an empty dict when ffprobe
        cannot be started, exits with a non-zero status, or prints output
        that is not valid JSON.
    """
    import subprocess

    cmd = [
        "ffprobe",
        "-v",
        "quiet",
        "-print_format",
        "json",
        "-show_format",
        "-show_streams",
        video_path,
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode == 0:
            return json.loads(result.stdout)
    except Exception as exc:
        # Metadata is best-effort: never fail the caller, but say why the
        # probe produced nothing instead of hiding the error completely.
        print(
            f"WARNING: ffprobe metadata extraction failed for {video_path!r}: {exc}",
            file=sys.stderr,
        )
    return {}
|
||||
|
||||
|
||||
def generate_parent_child_chunks(
    asr_data: Dict,
    cut_data: Dict,
    yolo_data: Dict,
    ocr_data: Dict,
    scene_data: Dict,
    parent_chunk_size: int = 5,
) -> Dict:
    """Build a parent/child chunk hierarchy from local analysis results.

    Every ASR segment and every CUT scene becomes a child chunk. Children of
    each source are then grouped, ``parent_chunk_size`` at a time and in input
    order, into "story" parent chunks whose text comes from
    ``generate_narrative`` / ``generate_scene_narrative``. No LLM or network
    call is made.

    Args:
        asr_data: Dict with a ``"segments"`` list (``start``, ``end``,
            ``text``, ``confidence`` per segment).
        cut_data: Dict with a ``"scenes"`` list (``scene_number``,
            ``start_time``, ``end_time``, ``duration`` per scene).
        yolo_data: Dict with a ``"frames"`` list; each frame has a
            ``timestamp`` and an ``objects`` list carrying ``class_name``.
        ocr_data: Accepted for interface compatibility; not used here.
        scene_data: Accepted for interface compatibility; not used here.
        parent_chunk_size: Number of child chunks grouped under one parent.

    Returns:
        Dict with ``"child_chunks"``, ``"parent_chunks"`` and ``"stats"``.
    """
    child_chunks = []
    parent_chunks = []

    # One child chunk per ASR segment.
    for seg in asr_data.get("segments", []):
        child_chunks.append(
            {
                "chunk_id": f"asr_{seg.get('start', 0):.1f}_{seg.get('end', 0):.1f}",
                "chunk_type": "asr",
                "source": "asr",
                "start_time": seg.get("start", 0),
                "end_time": seg.get("end", 0),
                "text_content": seg.get("text", ""),
                "content": {
                    "text": seg.get("text", ""),
                    "confidence": seg.get("confidence", 0),
                },
                "child_chunk_ids": [],
                "parent_chunk_id": None,
            }
        )

    # One child chunk per CUT scene.
    for scene in cut_data.get("scenes", []):
        child_chunks.append(
            {
                "chunk_id": f"cut_{scene.get('scene_number', 0)}",
                "chunk_type": "cut",
                "source": "cut",
                "start_time": scene.get("start_time", 0),
                "end_time": scene.get("end_time", 0),
                "text_content": f"Scene {scene.get('scene_number', 0)}",
                "content": {
                    "scene_number": scene.get("scene_number", 0),
                    "duration": scene.get("duration", 0),
                },
                "child_chunk_ids": [],
                "parent_chunk_id": None,
            }
        )

    asr_child_ids = [c["chunk_id"] for c in child_chunks if c["source"] == "asr"]
    cut_child_ids = [c["chunk_id"] for c in child_chunks if c["source"] == "cut"]

    # Index children by id once instead of rescanning the whole list for each
    # id. The first chunk with a given id wins, which matches a linear search
    # that stops at the first hit.
    children_by_id = {}
    for child in child_chunks:
        children_by_id.setdefault(child["chunk_id"], child)

    # NOTE(review): only the first 50 YOLO frames are ever consulted, so
    # objects in later frames never reach a parent chunk - confirm this cap
    # is intentional.
    yolo_frames = yolo_data.get("frames", [])[:50]

    def objects_between(start_time, end_time):
        """Class names of all objects in frames inside [start_time, end_time]."""
        found = []
        for frame in yolo_frames:
            if start_time <= frame.get("timestamp", 0) <= end_time:
                for obj in frame.get("objects", []):
                    found.append(obj.get("class_name", "unknown"))
        return found

    # Group ASR children into story parents.
    for i in range(0, len(asr_child_ids), parent_chunk_size):
        batch = asr_child_ids[i : i + parent_chunk_size]
        members = [children_by_id[child_id] for child_id in batch]

        batch_texts = [m["text_content"] for m in members if m["text_content"]]
        batch_times = [(m["start_time"], m["end_time"]) for m in members]
        start_time = batch_times[0][0] if batch_times else 0
        end_time = batch_times[-1][1] if batch_times else 0

        batch_objects = objects_between(start_time, end_time)
        narrative = generate_narrative(batch_texts, batch_objects, start_time, end_time)

        parent_chunk = {
            "chunk_id": f"story_asr_{i // parent_chunk_size:04d}",
            "chunk_type": "story",
            "source": "story_asr",
            "start_time": start_time,
            "end_time": end_time,
            "text_content": narrative,
            "content": {
                "description": narrative,
                "child_count": len(batch),
                "speech_preview": " ".join(batch_texts[:3]) if batch_texts else None,
                # First-seen order keeps the output stable between runs.
                "detected_objects": list(dict.fromkeys(batch_objects))[:5],
            },
            "child_chunk_ids": batch,
            "parent_chunk_id": None,
        }
        parent_chunks.append(parent_chunk)

        for member in members:
            member["parent_chunk_id"] = parent_chunk["chunk_id"]

    # Group CUT children into story parents.
    for i in range(0, len(cut_child_ids), parent_chunk_size):
        batch = cut_child_ids[i : i + parent_chunk_size]
        members = [children_by_id[child_id] for child_id in batch]

        batch_times = [(m["start_time"], m["end_time"]) for m in members]
        start_time = batch_times[0][0] if batch_times else 0
        end_time = batch_times[-1][1] if batch_times else 0

        batch_objects = objects_between(start_time, end_time)
        narrative = generate_scene_narrative(
            batch_objects, start_time, end_time, len(batch)
        )

        parent_chunk = {
            "chunk_id": f"story_cut_{i // parent_chunk_size:04d}",
            "chunk_type": "story",
            "source": "story_cut",
            "start_time": start_time,
            "end_time": end_time,
            "text_content": narrative,
            "content": {
                "description": narrative,
                "child_count": len(batch),
                "scenes": batch,
                "detected_objects": list(dict.fromkeys(batch_objects))[:5],
            },
            "child_chunk_ids": batch,
            "parent_chunk_id": None,
        }
        parent_chunks.append(parent_chunk)

        for member in members:
            member["parent_chunk_id"] = parent_chunk["chunk_id"]

    return {
        "child_chunks": child_chunks,
        "parent_chunks": parent_chunks,
        "stats": {
            "total_child_chunks": len(child_chunks),
            "total_parent_chunks": len(parent_chunks),
            "asr_children": len(asr_child_ids),
            "cut_children": len(cut_child_ids),
        },
    }
|
||||
|
||||
|
||||
def generate_narrative(
    texts: List[str], objects: List[str], start: float, end: float
) -> str:
    """Describe a time range from local speech snippets and detected objects.

    Args:
        texts: Speech snippets; the first five are joined and the result is
            cut to 150 characters (followed by ``...``) when longer.
        objects: Detected object class names; duplicates are dropped and the
            first five distinct names are listed in first-seen order.
        start: Start of the range in seconds.
        end: End of the range in seconds.

    Returns:
        ``"[<start>s-<end>s] Speech: ... | Visuals: ..."`` with only the parts
        that have data, or a generic ``"Video segment from ..."`` sentence
        when both inputs are empty.
    """
    if not texts and not objects:
        return f"Video segment from {start:.1f}s to {end:.1f}s"

    parts = []
    if texts:
        combined = " ".join(texts[:5])
        if len(combined) > 150:
            combined = combined[:150] + "..."
        parts.append(f"Speech: {combined}")

    if objects:
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # same input always yields the same text (set order varies per run).
        unique_objs = list(dict.fromkeys(objects))[:5]
        parts.append(f"Visuals: {', '.join(unique_objs)}")

    return f"[{start:.0f}s-{end:.0f}s] {' | '.join(parts)}"
|
||||
|
||||
|
||||
def generate_scene_narrative(
    objects: List[str], start: float, end: float, scene_count: int
) -> str:
    """Describe a group of scenes from locally detected objects.

    Args:
        objects: Detected object class names; duplicates are dropped and the
            first five distinct names are listed in first-seen order.
        start: Start of the range in seconds.
        end: End of the range in seconds.
        scene_count: Number of scenes covered by the range.

    Returns:
        A one-line description, with a ``Visuals:`` list when any object was
        detected.
    """
    # First-seen order keeps the sentence stable between runs; iterating a
    # set of strings does not.
    unique_objects = list(dict.fromkeys(objects))[:5]

    if unique_objects:
        obj_str = ", ".join(unique_objects)
        return f"[{start:.0f}s-{end:.0f}s] {scene_count} scenes. Visuals: {obj_str}."
    return f"[{start:.0f}s-{end:.0f}s] {scene_count} video scenes."
|
||||
|
||||
|
||||
def run_story(
    video_path: str, output_path: str, uuid: str = "", parent_chunk_size: int = 5
):
    """Generate the story chunk hierarchy for one video and write it as JSON.

    Sibling analysis files are looked up next to *output_path* as
    ``<name>.<kind>.json`` (kind in asr, cut, yolo, ocr, scene), where
    ``<name>`` is the output file name up to its first dot. Missing files are
    treated as empty results.

    Args:
        video_path: Video file, used only for ffprobe metadata.
        output_path: Destination JSON file.
        uuid: When non-empty, progress is reported through ``RedisPublisher``.
        parent_chunk_size: Number of child chunks grouped under one parent.

    Returns:
        The result dict that was written to *output_path*.
    """
    publisher = RedisPublisher(uuid) if uuid else None
    if publisher:
        publisher.info("story", "STORY_START")

    base_path = os.path.dirname(output_path)
    uuid_name = os.path.basename(output_path).split(".")[0]

    asr_data = {"segments": []}
    cut_data = {"scenes": []}
    yolo_data = {"frames": []}
    ocr_data = {"frames": []}
    scene_data = {"scenes": []}

    for name, data_var in [
        ("asr", asr_data),
        ("cut", cut_data),
        ("yolo", yolo_data),
        ("ocr", ocr_data),
        ("scene", scene_data),
    ]:
        path = os.path.join(base_path, f"{uuid_name}.{name}.json")
        if os.path.exists(path):
            # JSON interchange files are UTF-8; do not depend on the locale.
            with open(path, encoding="utf-8") as f:
                data_var.update(json.load(f))

    result = generate_parent_child_chunks(
        asr_data, cut_data, yolo_data, ocr_data, scene_data, parent_chunk_size
    )

    result["video_metadata"] = extract_video_metadata(video_path)
    result["processing"] = {
        "method": "local_aggregation",
        "cloud_api_used": False,
        "parent_chunk_size": parent_chunk_size,
    }

    # ensure_ascii=False emits raw non-ASCII text, so the file must be opened
    # as UTF-8 or the write can fail under a non-UTF-8 default encoding.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    if publisher:
        publisher.complete(
            "story",
            f"{result['stats']['total_parent_chunks']} parent, {result['stats']['total_child_chunks']} child chunks (LOCAL)",
        )

    return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse arguments, run the pipeline, print totals.
    cli = argparse.ArgumentParser(
        description="Story Processor - Parent-Child Chunk Hierarchy (LOCAL ONLY)"
    )
    cli.add_argument("video_path", help="Path to video file")
    cli.add_argument("output_path", help="Output JSON path")
    cli.add_argument("--uuid", help="UUID for progress tracking", default="")
    cli.add_argument(
        "--parent-chunk-size",
        type=int,
        default=5,
        help="Number of child chunks per parent",
    )
    opts = cli.parse_args()

    result = run_story(
        opts.video_path, opts.output_path, opts.uuid, opts.parent_chunk_size
    )
    totals = result["stats"]
    print(
        f"Story generated: {totals['total_parent_chunks']} parent, "
        f"{totals['total_child_chunks']} child chunks (LOCAL)"
    )
|
||||
@@ -1,848 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Story Processor - AI-Driven Processor Contract Version 1.0
|
||||
|
||||
Compliant with AI-Driven Processor Contract v1.0
|
||||
Effective Date: 2025-03-27
|
||||
|
||||
Features:
|
||||
1. Standardized command-line interface
|
||||
2. Redis progress reporting
|
||||
3. Signal handling (SIGTERM, SIGINT)
|
||||
4. Health check mode
|
||||
5. Resource monitoring
|
||||
6. Contract-compliant JSON output
|
||||
7. Unified configuration
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import os
|
||||
import argparse
|
||||
import signal
|
||||
import time
|
||||
import traceback
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any, List
|
||||
|
||||
# Redis Publisher for progress reporting
|
||||
try:
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from redis_publisher import RedisPublisher
|
||||
|
||||
REDIS_AVAILABLE = True
|
||||
except ImportError:
|
||||
REDIS_AVAILABLE = False
|
||||
print(
|
||||
"WARNING: RedisPublisher not available, progress reporting disabled",
|
||||
file=sys.stderr,
|
||||
)
|
||||
|
||||
# Identification reported in health checks and in every result payload.
CONTRACT_VERSION = "1.0"
PROCESSOR_NAME = "/Users/accusys/momentry_core_0.1/scripts/story_processor_contract_v1.py"
PROCESSOR_VERSION = "1.0.0"
MODEL_NAME = "gpt-4"
MODEL_VERSION = "latest"

# Defaults shared by the command line and the processing functions.
DEFAULT_TIMEOUT = 3600  # seconds; one hour for story generation
DEFAULT_PARENT_CHUNK_SIZE = 5
DEFAULT_MIN_CHILD_CHUNKS = 3
DEFAULT_MAX_CHILD_CHUNKS = 10
DEFAULT_SUMMARY_LENGTH = 150
DEFAULT_MODEL = "openai"  # one of: openai, local, template
DEFAULT_MODEL_NAME = "gpt-4"
DEFAULT_TEMPERATURE = 0.7
DEFAULT_MAX_TOKENS = 500
|
||||
|
||||
|
||||
# Signal handling with timeout support
|
||||
class SignalHandler:
    """Record SIGTERM/SIGINT so long-running work can stop gracefully.

    Creating an instance installs ``handle_signal`` for both signals. The
    handler only sets flags; callers are expected to poll ``should_stop()``.
    """

    def __init__(self):
        self.should_exit = False
        self.exit_code = 0
        for signum in (signal.SIGTERM, signal.SIGINT):
            signal.signal(signum, self.handle_signal)

    def handle_signal(self, signum, frame):
        """Note that a termination signal arrived and derive the exit code."""
        print(f"\n收到信号 {signum},正在优雅关闭...")
        self.should_exit = True
        # Shell convention: exit status of a signalled process is 128 + signal.
        self.exit_code = 128 + signum

    def should_stop(self):
        """Return True once a termination signal has been received."""
        return self.should_exit
|
||||
|
||||
|
||||
# Timeout manager
|
||||
class TimeoutManager:
    """Track how much of a fixed time budget is left.

    The clock starts when the instance is created and is measured with
    ``time.time()``.
    """

    def __init__(self, timeout_seconds: int):
        self.timeout_seconds = timeout_seconds
        self.start_time = time.time()
        self.timer = None

    def check_timeout(self) -> bool:
        """Return True when more than ``timeout_seconds`` have elapsed."""
        return (time.time() - self.start_time) > self.timeout_seconds

    def get_remaining_time(self) -> float:
        """Return the seconds left in the budget, never below zero."""
        spent = time.time() - self.start_time
        return max(0, self.timeout_seconds - spent)

    def format_remaining_time(self) -> str:
        """Return the remaining time as ``HH:MM:SS`` (fractions dropped)."""
        whole_seconds = int(self.get_remaining_time())
        hours, leftover = divmod(whole_seconds, 3600)
        minutes, seconds = divmod(leftover, 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
|
||||
|
||||
|
||||
# Health check functions
|
||||
def check_environment() -> Dict[str, Any]:
    """Report the processor identity and the status of its dependencies.

    Returns:
        Dict with a timestamp, the processor/contract/model identification
        constants and a ``"checks"`` list holding one entry each for
        ``openai``, ``redis`` and ``python``. Optional packages that are not
        importable are reported with status ``"optional"``.
    """
    # openai is optional: absence is reported, not treated as an error.
    try:
        import openai
    except ImportError:
        openai_check = {"name": "openai", "status": "optional", "version": None}
    else:
        openai_check = {
            "name": "openai",
            "status": "available",
            "version": openai.__version__,
        }

    redis_check = {
        "name": "redis",
        "status": "available" if REDIS_AVAILABLE else "optional",
        "version": None,
    }

    py = sys.version_info
    python_check = {
        "name": "python",
        "status": "available",
        "version": f"{py.major}.{py.minor}.{py.micro}",
    }

    return {
        "timestamp": datetime.now().isoformat(),
        "processor_name": PROCESSOR_NAME,
        "processor_version": PROCESSOR_VERSION,
        "contract_version": CONTRACT_VERSION,
        "model_name": MODEL_NAME,
        "model_version": MODEL_VERSION,
        "checks": [openai_check, redis_check, python_check],
    }
|
||||
|
||||
|
||||
def check_input_files(input_files: Dict[str, str]) -> Dict[str, Any]:
    """Check that each input file exists and holds JSON of the expected shape.

    Args:
        input_files: Mapping of file type (``"asr"``, ``"cut"``, ``"yolo"``,
            ``"ocr"`` or anything else) to a file path. Empty paths are
            reported as missing.

    Returns:
        Mapping of file type to a report dict. Every report has ``"exists"``
        and ``"valid"``; successful reads add ``"size"`` and ``"data_keys"``,
        failures add ``"error"``. A file is valid when its top level is a
        JSON object that contains the key required for its type; unknown
        types only need to be a JSON object.
    """
    # Top-level key each known file type must contain.
    required_key = {
        "asr": "segments",
        "cut": "scenes",
        "yolo": "detections",
        "ocr": "texts",
    }
    results = {}

    for file_type, file_path in input_files.items():
        if not file_path:
            results[file_type] = {
                "exists": False,
                "valid": False,
                "error": "No path provided",
            }
            continue

        if not os.path.exists(file_path):
            results[file_type] = {
                "exists": False,
                "valid": False,
                "error": "File not found",
            }
            continue

        try:
            # JSON interchange files are UTF-8; relying on the locale default
            # can reject valid files that contain non-ASCII text.
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            is_object = isinstance(data, dict)
            needed = required_key.get(file_type)
            valid = is_object and (needed is None or needed in data)

            results[file_type] = {
                "exists": True,
                "valid": valid,
                "size": os.path.getsize(file_path),
                "data_keys": list(data.keys()) if is_object else [],
            }
        except json.JSONDecodeError as e:
            results[file_type] = {
                "exists": True,
                "valid": False,
                "error": f"Invalid JSON: {e}",
            }
        except Exception as e:
            # Any other read problem is reported in the result, not raised.
            results[file_type] = {"exists": True, "valid": False, "error": str(e)}

    return results
|
||||
|
||||
|
||||
def load_input_data(input_files: Dict[str, str]) -> Dict[str, Any]:
    """Load each input JSON file, mapping unreadable ones to ``None``.

    Args:
        input_files: Mapping of file type to file path.

    Returns:
        Mapping of file type to the parsed JSON value, or ``None`` when the
        path is empty, the file does not exist, or it cannot be read or
        parsed.
    """
    data = {}

    for file_type, file_path in input_files.items():
        if not file_path or not os.path.exists(file_path):
            data[file_type] = None
            continue

        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data[file_type] = json.load(f)
        except Exception:
            # Loading stays best-effort, but a bare ``except`` would also
            # swallow KeyboardInterrupt and SystemExit.
            data[file_type] = None

    return data
|
||||
|
||||
|
||||
def generate_parent_child_chunks(
    asr_data: Dict,
    cut_data: Dict,
    yolo_data: Dict,
    ocr_data: Dict,
    parent_chunk_size: int = DEFAULT_PARENT_CHUNK_SIZE,
    min_child_chunks: int = DEFAULT_MIN_CHILD_CHUNKS,
    max_child_chunks: int = DEFAULT_MAX_CHILD_CHUNKS,
    summary_length: int = DEFAULT_SUMMARY_LENGTH,
    model: str = DEFAULT_MODEL,
    **kwargs,
) -> List[Dict[str, Any]]:
    """Generate the parent-child chunk hierarchy for RAG.

    When CUT scenes are available, each scene becomes a parent whose children
    are the ASR segments, YOLO detections and OCR texts that fall inside it.
    Without scenes, ASR segments are grouped into consecutive 30-second
    windows instead. Groups with fewer than ``min_child_chunks`` children are
    dropped.

    Args:
        asr_data: Dict with a ``"segments"`` list, or a falsy value.
        cut_data: Dict with a ``"scenes"`` list, or a falsy value.
        yolo_data: Dict with a ``"detections"`` list, or a falsy value.
        ocr_data: Dict with a ``"texts"`` list, or a falsy value.
        parent_chunk_size: Maximum number of children stored on each parent.
        min_child_chunks: Minimum children a group needs to yield a parent.
        max_child_chunks: Per-source cap on children collected for a group.
        summary_length: Summaries are cut to this many characters when > 0.
        model: ``"openai"``, ``"local"``, or anything else for the template
            summary. Only the scene-based path honours it; the time-window
            path always uses the template summary.
        **kwargs: Forwarded to the openai/local summary generators.

    Returns:
        List of parent chunk dicts, each embedding its child chunks.
    """

    def asr_child(segment):
        """Child chunk for one ASR segment."""
        return {
            "type": "asr",
            "content": segment.get("text", ""),
            "start_time": segment.get("start", 0),
            "end_time": segment.get("end", 0),
            "confidence": segment.get("confidence", 0),
            "metadata": {"speaker": segment.get("speaker")},
        }

    def clip(summary):
        """Apply the summary length limit (disabled when it is not positive)."""
        return summary[:summary_length] if summary_length > 0 else summary

    parent_chunks = []

    asr_segments = asr_data.get("segments", []) if asr_data else []
    scenes = cut_data.get("scenes", []) if cut_data else []
    yolo_detections = yolo_data.get("detections", []) if yolo_data else []
    ocr_texts = ocr_data.get("texts", []) if ocr_data else []

    if scenes:
        # Scene-based grouping.
        for scene in scenes:
            scene_start = scene.get("start_time", 0)
            scene_end = scene.get("end_time", 0)
            scene_duration = scene.get("duration", 0)

            # NOTE(review): both bounds are inclusive, so an item stamped
            # exactly on a scene boundary lands in two scenes - confirm.
            scene_asr_segments = [
                segment
                for segment in asr_segments
                if scene_start <= segment.get("start", 0) <= scene_end
            ]
            scene_yolo_detections = [
                detection
                for detection in yolo_detections
                if scene_start <= detection.get("timestamp", 0) <= scene_end
            ]
            scene_ocr_texts = [
                text
                for text in ocr_texts
                if scene_start <= text.get("timestamp", 0) <= scene_end
            ]

            # The cap applies per source, so a scene can hold up to three
            # times max_child_chunks children.
            child_chunks = [
                asr_child(segment) for segment in scene_asr_segments[:max_child_chunks]
            ]
            for detection in scene_yolo_detections[:max_child_chunks]:
                child_chunks.append(
                    {
                        "type": "yolo",
                        "content": f"Detected {detection.get('class', 'object')} with confidence {detection.get('confidence', 0):.2f}",
                        "timestamp": detection.get("timestamp", 0),
                        "confidence": detection.get("confidence", 0),
                        "metadata": {
                            "class": detection.get("class"),
                            "bbox": detection.get("bbox"),
                        },
                    }
                )
            for text in scene_ocr_texts[:max_child_chunks]:
                child_chunks.append(
                    {
                        "type": "ocr",
                        "content": text.get("text", ""),
                        "timestamp": text.get("timestamp", 0),
                        "confidence": text.get("confidence", 0),
                        "metadata": {
                            "bbox": text.get("bbox"),
                            "language": text.get("language"),
                        },
                    }
                )

            if len(child_chunks) < min_child_chunks:
                continue

            if model == "openai":
                parent_summary = generate_openai_summary(child_chunks, scene, **kwargs)
            elif model == "local":
                parent_summary = generate_local_summary(child_chunks, scene, **kwargs)
            else:
                parent_summary = generate_template_summary(child_chunks, scene)

            parent_chunks.append(
                {
                    "parent_id": len(parent_chunks) + 1,
                    "scene_id": scene.get("scene_id", 0),
                    "start_time": scene_start,
                    "end_time": scene_end,
                    "duration": scene_duration,
                    "summary": clip(parent_summary),
                    "child_count": len(child_chunks),
                    # First-seen order keeps the output stable between runs;
                    # iterating a set of strings does not.
                    "child_types": list(
                        dict.fromkeys(chunk["type"] for chunk in child_chunks)
                    ),
                    # Only the first parent_chunk_size children are stored.
                    "child_chunks": child_chunks[:parent_chunk_size],
                }
            )

    elif asr_segments:
        # No scenes: group ASR segments into fixed time windows.
        time_window = 30  # seconds
        # The end of the last segment bounds how many windows are produced.
        last_end = asr_segments[-1].get("end", 0)
        current_window = 0

        while current_window * time_window < last_end:
            window_start = current_window * time_window
            window_end = (current_window + 1) * time_window

            window_segments = [
                segment
                for segment in asr_segments
                if window_start <= segment.get("start", 0) < window_end
            ]

            if len(window_segments) >= min_child_chunks:
                child_chunks = [
                    asr_child(segment) for segment in window_segments[:max_child_chunks]
                ]

                parent_summary = generate_template_summary(
                    child_chunks,
                    {
                        "start_time": window_start,
                        "end_time": window_end,
                        "duration": time_window,
                    },
                )

                parent_chunks.append(
                    {
                        "parent_id": len(parent_chunks) + 1,
                        "time_window": current_window,
                        "start_time": window_start,
                        "end_time": window_end,
                        "duration": time_window,
                        "summary": clip(parent_summary),
                        "child_count": len(child_chunks),
                        "child_types": ["asr"],
                        "child_chunks": child_chunks[:parent_chunk_size],
                    }
                )

            current_window += 1

    return parent_chunks
|
||||
|
||||
|
||||
def generate_openai_summary(child_chunks: List[Dict], scene: Dict, **kwargs) -> str:
    """Summarize a scene with the OpenAI chat completions API.

    Args:
        child_chunks: Child chunk dicts with ``"type"`` and ``"content"``;
            only the first ten are sent as context.
        scene: Scene dict; its ``"duration"`` is quoted in the prompt.
        **kwargs: Optional ``model_name``, ``max_tokens`` and ``temperature``
            overrides for the API call.

    Returns:
        The model's reply, or an explanatory message when the ``openai``
        package is missing or the request fails. Errors are never raised.
    """
    # Prefix put in front of each chunk's content, by chunk type.
    labels = (("asr", "Speech"), ("yolo", "Visual"), ("ocr", "Text"))

    try:
        import openai

        # Keep the prompt small: at most ten chunks contribute context.
        context_lines = []
        for chunk in child_chunks[:10]:
            for kind, label in labels:
                if chunk["type"] == kind:
                    context_lines.append(f"{label}: {chunk['content']}")
                    break
        context = "\n".join(context_lines)

        prompt = f"""Summarize this video scene ({scene.get("duration", 0):.1f} seconds) based on the following elements:

{context}

Provide a concise narrative summary that connects the speech, visual elements, and text into a coherent description."""

        system_message = {
            "role": "system",
            "content": "You are a video analysis assistant that creates coherent narrative summaries from multiple data sources.",
        }
        user_message = {"role": "user", "content": prompt}

        response = openai.chat.completions.create(
            model=kwargs.get("model_name", DEFAULT_MODEL_NAME),
            messages=[system_message, user_message],
            max_tokens=kwargs.get("max_tokens", DEFAULT_MAX_TOKENS),
            temperature=kwargs.get("temperature", DEFAULT_TEMPERATURE),
        )
        return response.choices[0].message.content

    except ImportError:
        return "OpenAI not available for summary generation"
    except Exception as e:
        return f"Summary generation error: {str(e)}"
|
||||
|
||||
|
||||
def generate_local_summary(child_chunks: List[Dict], scene: Dict, **kwargs) -> str:
    """Placeholder summary for the not-yet-implemented local model.

    Args:
        child_chunks: Child chunk dicts carrying a ``"type"`` key.
        scene: Scene dict; its ``"duration"`` is quoted in the text.
        **kwargs: Accepted for signature parity with the OpenAI generator;
            ignored.

    Returns:
        A sentence with the number of chunks per type and a note that no
        local model is available.
    """
    kinds = [chunk["type"] for chunk in child_chunks]
    asr_count = kinds.count("asr")
    yolo_count = kinds.count("yolo")
    ocr_count = kinds.count("ocr")
    duration = scene.get("duration", 0)

    return f"Scene ({duration:.1f}s) with {asr_count} speech segments, {yolo_count} visual detections, and {ocr_count} text elements. Local summary model not implemented."
|
||||
|
||||
|
||||
def generate_template_summary(child_chunks: List[Dict], scene: Dict) -> str:
    """Build a fixed-template summary of a scene from its child chunks.

    Args:
        child_chunks: Child chunk dicts with ``"type"`` and ``"content"``;
            ``"yolo"`` chunks also carry ``"metadata"`` with a ``"class"``.
        scene: Scene dict; its ``"duration"`` is quoted in the text.

    Returns:
        Sentences joined by spaces: the duration, then, for each chunk type
        present, a count line (plus up to two 50-character speech samples
        and up to three distinct object classes).
    """
    asr_chunks = [chunk for chunk in child_chunks if chunk["type"] == "asr"]
    yolo_chunks = [chunk for chunk in child_chunks if chunk["type"] == "yolo"]
    ocr_count = sum(1 for chunk in child_chunks if chunk["type"] == "ocr")

    asr_samples = [chunk["content"][:50] for chunk in asr_chunks[:2]]

    # The chunk builder stores "class" even when the detection had none, so
    # the value can be None: ``or`` covers both a missing key and None, which
    # would otherwise break the join below. dict.fromkeys de-duplicates in
    # first-seen order so the text is stable between runs.
    yolo_classes = list(
        dict.fromkeys(
            chunk["metadata"].get("class") or "object" for chunk in yolo_chunks
        )
    )

    summary_parts = [f"Scene duration: {scene.get('duration', 0):.1f} seconds."]

    if asr_chunks:
        summary_parts.append(f"Contains {len(asr_chunks)} speech segments.")
        if asr_samples:
            summary_parts.append(f"Sample speech: {'; '.join(asr_samples)}...")

    if yolo_chunks:
        summary_parts.append(
            f"Detected {len(yolo_chunks)} objects including: {', '.join(yolo_classes[:3])}."
        )

    if ocr_count > 0:
        summary_parts.append(f"Extracted {ocr_count} text elements from the video.")

    return " ".join(summary_parts)
|
||||
|
||||
|
||||
# Main processing function
|
||||
def process_story(
    asr_path: str,
    cut_path: str,
    yolo_path: str,
    ocr_path: str,
    output_path: str,
    uuid: str = "",
    parent_chunk_size: int = DEFAULT_PARENT_CHUNK_SIZE,
    min_child_chunks: int = DEFAULT_MIN_CHILD_CHUNKS,
    max_child_chunks: int = DEFAULT_MAX_CHILD_CHUNKS,
    summary_length: int = DEFAULT_SUMMARY_LENGTH,
    model: str = DEFAULT_MODEL,
    model_name: str = DEFAULT_MODEL_NAME,
    temperature: float = DEFAULT_TEMPERATURE,
    max_tokens: int = DEFAULT_MAX_TOKENS,
    timeout: int = DEFAULT_TIMEOUT,
) -> Dict[str, Any]:
    """Process video analysis data to create a parent-child chunk hierarchy.

    Checks the input files, loads them, generates the parent chunks, computes
    chunk statistics, records timing and resource usage, and writes the whole
    result dict as JSON to ``output_path``.

    Args:
        asr_path: Path to the ASR JSON file; a valid one is mandatory.
        cut_path: Path to the CUT JSON file.
        yolo_path: Path to the YOLO JSON file.
        ocr_path: Path to the OCR JSON file.
        output_path: Where the result JSON is written.
        uuid: When non-empty and Redis is available, progress events are
            published under this id.
        parent_chunk_size, min_child_chunks, max_child_chunks, summary_length,
        model, model_name, temperature, max_tokens: Recorded in the result and
            forwarded unchanged to ``generate_parent_child_chunks``.
        timeout: Passed to ``TimeoutManager`` and recorded in the result.

    Returns:
        The result dict. ``result["success"]`` is True when chunk generation
        finished; otherwise ``result["error"]`` describes the failure.
        ``Exception`` subclasses and ``KeyboardInterrupt`` raised during
        processing are caught and recorded rather than propagated.
    """

    # Initialize
    signal_handler = SignalHandler()
    timeout_manager = TimeoutManager(timeout)

    # Progress publishing is optional and best effort.
    publisher = None
    if REDIS_AVAILABLE and uuid:
        try:
            publisher = RedisPublisher(uuid)
        except Exception:
            # A broken publisher must not stop processing, but unlike a bare
            # ``except:`` this lets KeyboardInterrupt/SystemExit propagate.
            publisher = None

    def publish(stage: str, message: str, data: "Dict[str, Any] | None" = None) -> None:
        """Send a progress event; a no-op when no publisher is configured."""
        if publisher:
            publisher.info(PROCESSOR_NAME, stage, message, data)

    publish("STORY_START", "开始生成故事层次结构")

    input_files = {
        "asr": asr_path,
        "cut": cut_path,
        "yolo": yolo_path,
        "ocr": ocr_path,
    }

    result = {
        "processor_name": PROCESSOR_NAME,
        "processor_version": PROCESSOR_VERSION,
        "contract_version": CONTRACT_VERSION,
        "model_name": MODEL_NAME,
        "model_version": MODEL_VERSION,
        # Independent copy so the helpers below cannot alter what is reported.
        "input_files": dict(input_files),
        "output_path": output_path,
        "uuid": uuid,
        "timestamp": datetime.now().isoformat(),
        "parameters": {
            "parent_chunk_size": parent_chunk_size,
            "min_child_chunks": min_child_chunks,
            "max_child_chunks": max_child_chunks,
            "summary_length": summary_length,
            "model": model,
            "model_name": model_name,
            "temperature": temperature,
            "max_tokens": max_tokens,
            "timeout": timeout,
        },
        "success": False,
        "error": None,
        "parent_chunks": [],
        "chunk_statistics": {},
        "processing_time": 0,
        "resource_usage": {},
    }

    start_time = time.time()

    try:
        # Timeout and stop signal are checked once, before any work starts.
        if timeout_manager.check_timeout():
            raise TimeoutError(f"超时 ({timeout} 秒)")

        if signal_handler.should_stop():
            raise KeyboardInterrupt("收到停止信号")

        # Check input files
        publish("STORY_CHECK_FILES", "检查输入文件")

        file_checks = check_input_files(input_files)
        result["file_checks"] = file_checks

        # ASR is the only mandatory input.
        if not file_checks.get("asr", {}).get("valid", False):
            raise ValueError("缺少有效的 ASR 数据文件")

        publish("STORY_FILES_VALID", "输入文件检查通过")

        # Load input data
        publish("STORY_LOAD_DATA", "加载输入数据")
        input_data = load_input_data(input_files)
        publish("STORY_DATA_LOADED", "数据加载完成")

        # Generate parent-child chunks
        publish("STORY_GENERATE_CHUNKS", "生成父-子块层次结构")

        parent_chunks = generate_parent_child_chunks(
            asr_data=input_data.get("asr"),
            cut_data=input_data.get("cut"),
            yolo_data=input_data.get("yolo"),
            ocr_data=input_data.get("ocr"),
            parent_chunk_size=parent_chunk_size,
            min_child_chunks=min_child_chunks,
            max_child_chunks=max_child_chunks,
            summary_length=summary_length,
            model=model,
            model_name=model_name,
            temperature=temperature,
            max_tokens=max_tokens,
        )

        result["parent_chunks"] = parent_chunks
        result["parent_chunk_count"] = len(parent_chunks)

        # Calculate statistics
        total_child_chunks = sum(chunk.get("child_count", 0) for chunk in parent_chunks)
        child_types = {}
        for chunk in parent_chunks:
            for child_type in chunk.get("child_types", []):
                child_types[child_type] = child_types.get(child_type, 0) + 1

        result["chunk_statistics"] = {
            "total_parent_chunks": len(parent_chunks),
            "total_child_chunks": total_child_chunks,
            # Guard against division by zero when nothing was generated.
            "avg_children_per_parent": total_child_chunks / len(parent_chunks)
            if parent_chunks
            else 0,
            "child_type_distribution": child_types,
        }

        result["success"] = True
        publish("STORY_COMPLETE", f"完成: {len(parent_chunks)} 个父块")

    except TimeoutError as e:
        result["error"] = f"处理超时: {e}"
        publish("STORY_TIMEOUT", f"超时: {e}")
    except KeyboardInterrupt:
        result["error"] = "处理被用户中断"
        publish("STORY_INTERRUPTED", "处理被中断")
    except ImportError as e:
        result["error"] = f"依赖缺失: {e}"
        publish("STORY_MISSING_DEPS", f"缺少依赖: {e}")
    except Exception as e:
        result["error"] = f"处理错误: {str(e)}"
        publish("STORY_ERROR", f"错误: {str(e)}")
        traceback.print_exc()

    # Calculate processing time
    processing_time = time.time() - start_time
    result["processing_time"] = processing_time

    # Add resource usage (psutil is optional)
    try:
        import psutil

        process = psutil.Process()
        memory_info = process.memory_info()
        result["resource_usage"] = {
            "cpu_percent": process.cpu_percent(),
            "memory_mb": memory_info.rss / (1024 * 1024),
            "user_time": process.cpu_times().user,
            "system_time": process.cpu_times().system,
        }
    except ImportError:
        result["resource_usage"] = {"error": "psutil not available"}

    # Save result. The payload holds non-ASCII text and is dumped with
    # ensure_ascii=False, so the file is written as UTF-8 explicitly instead
    # of relying on the platform's default encoding.
    try:
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        publish("STORY_SAVED", f"结果保存到: {output_path}")
    except Exception as e:
        # NOTE(review): "success" is left untouched here, so a run can report
        # success=True together with a save error - confirm this is intended.
        result["error"] = f"保存结果失败: {str(e)}"
        publish("STORY_SAVE_ERROR", f"保存失败: {str(e)}")

    return result
|
||||
|
||||
def main():
    """Command-line entry point for the story processor.

    Parses the arguments, then either runs the environment health check or
    processes the inputs and prints a short report.

    Returns:
        0 when processing succeeded (or every health check is "available" or
        "optional"), 1 otherwise.
    """
    parser = argparse.ArgumentParser(
        description=f"{PROCESSOR_NAME.upper()} Processor v{PROCESSOR_VERSION} - Parent-Child Chunk Generation"
    )

    # Options are declared as data and registered in this exact order, which
    # is also the order shown by --help.
    cli_options = [
        ("--asr", {"help": "Path to ASR JSON file", "required": True}),
        ("--cut", {"help": "Path to CUT JSON file", "default": ""}),
        ("--yolo", {"help": "Path to YOLO JSON file", "default": ""}),
        ("--ocr", {"help": "Path to OCR JSON file", "default": ""}),
        ("--output", {"help": "Path to output JSON file", "required": True}),
        ("--uuid", {"help": "UUID for progress tracking", "default": ""}),
        (
            "--parent-chunk-size",
            {
                "help": f"Maximum child chunks per parent (default: {DEFAULT_PARENT_CHUNK_SIZE})",
                "type": int,
                "default": DEFAULT_PARENT_CHUNK_SIZE,
            },
        ),
        (
            "--min-child-chunks",
            {
                "help": f"Minimum child chunks to create parent (default: {DEFAULT_MIN_CHILD_CHUNKS})",
                "type": int,
                "default": DEFAULT_MIN_CHILD_CHUNKS,
            },
        ),
        (
            "--max-child-chunks",
            {
                "help": f"Maximum child chunks per parent (default: {DEFAULT_MAX_CHILD_CHUNKS})",
                "type": int,
                "default": DEFAULT_MAX_CHILD_CHUNKS,
            },
        ),
        (
            "--summary-length",
            {
                "help": f"Maximum summary length in characters (default: {DEFAULT_SUMMARY_LENGTH})",
                "type": int,
                "default": DEFAULT_SUMMARY_LENGTH,
            },
        ),
        (
            "--model",
            {
                "help": f"Summary model to use (default: {DEFAULT_MODEL})",
                "default": DEFAULT_MODEL,
                "choices": ["openai", "local", "template"],
            },
        ),
        (
            "--model-name",
            {
                "help": f"Model name for OpenAI (default: {DEFAULT_MODEL_NAME})",
                "default": DEFAULT_MODEL_NAME,
            },
        ),
        (
            "--temperature",
            {
                "help": f"Temperature for generation (default: {DEFAULT_TEMPERATURE})",
                "type": float,
                "default": DEFAULT_TEMPERATURE,
            },
        ),
        (
            "--max-tokens",
            {
                "help": f"Maximum tokens per summary (default: {DEFAULT_MAX_TOKENS})",
                "type": int,
                "default": DEFAULT_MAX_TOKENS,
            },
        ),
        (
            "--timeout",
            {
                "help": f"Timeout in seconds (default: {DEFAULT_TIMEOUT})",
                "type": int,
                "default": DEFAULT_TIMEOUT,
            },
        ),
        (
            "--health-check",
            {"help": "Run health check and exit", "action": "store_true"},
        ),
    ]
    for flag, spec in cli_options:
        parser.add_argument(flag, **spec)

    opts = parser.parse_args()

    # Health check mode: print the report and exit without processing.
    if opts.health_check:
        report = check_environment()
        print(json.dumps(report, indent=2, ensure_ascii=False))
        all_ok = all(
            entry["status"] in ["available", "optional"] for entry in report["checks"]
        )
        return 0 if all_ok else 1

    # Normal processing mode
    outcome = process_story(
        asr_path=opts.asr,
        cut_path=opts.cut,
        yolo_path=opts.yolo,
        ocr_path=opts.ocr,
        output_path=opts.output,
        uuid=opts.uuid,
        parent_chunk_size=opts.parent_chunk_size,
        min_child_chunks=opts.min_child_chunks,
        max_child_chunks=opts.max_child_chunks,
        summary_length=opts.summary_length,
        model=opts.model,
        model_name=opts.model_name,
        temperature=opts.temperature,
        max_tokens=opts.max_tokens,
        timeout=opts.timeout,
    )

    # Report the failure first so the success path reads straight through.
    if not outcome.get("success", False):
        print(f"❌ {PROCESSOR_NAME.upper()} 处理失败")
        print(f" 错误: {outcome.get('error', '未知错误')}")
        return 1

    chunk_stats = outcome.get("chunk_statistics", {})
    print(f"✅ {PROCESSOR_NAME.upper()} 处理成功")
    print(f" 父块数: {outcome.get('parent_chunk_count', 0)}")
    print(f" 子块总数: {chunk_stats.get('total_child_chunks', 0)}")
    print(f" 平均子块/父块: {chunk_stats.get('avg_children_per_parent', 0):.1f}")
    print(f" 处理时间: {outcome.get('processing_time', 0):.1f} 秒")
    print(f" 输出文件: {opts.output}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
@@ -1 +0,0 @@
|
||||
../v1.1/scripts/story_processor_contract_v1_v1.11.py
|
||||
@@ -1 +0,0 @@
|
||||
../v1.1/scripts/story_processor_v1.11.py
|
||||
@@ -1,121 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Test Parent Chunk Summary Generation (Gemma 4)
|
||||
"""
|
||||
|
||||
import json
|
||||
import ollama
|
||||
import time
|
||||
|
||||
# Fixed inputs for this experiment: the processed video id, the ASR
# transcript produced for it, and the model tag used for generation.
UUID = "384b0ff44aaaa1f1"
ASR_PATH = "output/{0}/{0}.asr.json".format(UUID)
MODEL = "gemma4:latest"

# Prompt sent to the model. The only placeholder is ``{context}``, which
# run_test() fills with the dialogue text via str.format().
PARENT_SUMMARY_PROMPT = """
You are an expert film analyst. Analyze the following movie dialogue segment (approx 60 seconds).
Your task is to generate a structured JSON summary containing:
1. **narrative_summary**: A one-sentence summary of the main event/plot point.
2. **entities**: Key information extracted:
- `who`: List of characters involved.
- `where`: Inferred location (e.g., "Apartment", "Train").
- `objects`: Key props mentioned (e.g., "Ticket", "Money").
3. **emotional_arc**: The emotional transition:
- `start_mood`: Mood at the beginning.
- `end_mood`: Mood at the end.
4. **plot_sequence**:
- `scene_type`: Type of scene (e.g., "Confrontation", "Romance", "Discovery").
- `key_action`: The main action taking place.

**IMPORTANT RULES:**
- Output **ONLY** valid JSON.
- Do NOT include "Thinking Process" or markdown formatting.
- If information is unknown, use "Unknown".
- Context: This is from the movie "Charade" (1963).

Dialogue:
{context}
"""
|
||||
|
||||
def load_sample(start_index, count=20):
    """Load a slice of dialogue to simulate a Parent Chunk.

    Reads the ASR JSON at ``ASR_PATH`` and joins the ``"text"`` of ``count``
    consecutive entries of its ``"segments"`` list, starting at
    ``start_index``.

    Args:
        start_index: Index of the first segment to include.
        count: Maximum number of segments to include.

    Returns:
        The selected segment texts joined by single spaces. On any failure
        (missing file, invalid JSON, unexpected structure) the string
        ``"Error: <message>"`` is returned instead of raising, so callers
        receive the error text in place of dialogue.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(ASR_PATH, "r", encoding="utf-8") as f:
            data = json.load(f)

        segments = data.get("segments", [])
        selected = segments[start_index : start_index + count]
        text = " ".join(s.get("text", "") for s in selected)
        print(f"📂 Loaded Sample {start_index}: {len(selected)} segments.")
        return text
    except Exception as e:
        return f"Error: {e}"
|
||||
|
||||
def run_test(name, context_text):
    """Send one dialogue sample to the model and check that it answers in JSON.

    Args:
        name: Label printed for this test.
        context_text: Dialogue text substituted into ``PARENT_SUMMARY_PROMPT``.

    Returns:
        True when the (cleaned) reply parses as JSON, False when parsing
        fails or when any exception is raised while calling the model.
    """
    print(f"\n🧪 Testing: {name}")
    print("-" * 50)
    print(f"📖 Input Preview: {context_text[:100]}...")

    filled_prompt = PARENT_SUMMARY_PROMPT.format(context=context_text)

    try:
        t0 = time.time()
        reply = ollama.chat(
            model=MODEL, messages=[{"role": "user", "content": filled_prompt}]
        )
        elapsed = time.time() - t0

        raw = reply["message"]["content"]

        # Strip a ```json fence if the model added one; otherwise, if a
        # "Thinking..." preamble is present, keep only what follows the last
        # "..." (deliberately crude).
        if "```json" in raw:
            raw = raw.split("```json")[1].split("```")[0]
        elif "Thinking..." in raw:
            raw = raw.split("...")[-1]

        try:
            parsed = json.loads(raw.strip())
        except json.JSONDecodeError:
            print(f"⚠️ JSON Parse Failed ({elapsed:.2f}s)")
            print(raw[:500])
            return False

        print(f"✅ Success ({elapsed:.2f}s)")
        print(json.dumps(parsed, indent=2))
        return True

    except Exception as err:
        print(f"❌ API Error: {err}")
        return False
|
||||
|
||||
def main():
    """Run the three parent-chunk summary scenarios in sequence.

    Each scenario loads a dialogue slice with ``load_sample`` and passes it to
    ``run_test``; a two-second pause separates consecutive scenarios.
    """
    print(f"🚀 Starting Parent Chunk Summary Tests on '{UUID}'")

    # (segment offset, label): early dialogue for entities and narrative, a
    # middle section for the emotional arc, and a later section to probe the
    # plot sequence and robustness.
    scenarios = (
        (10, "Test 1: Early Plot (Entities & Narrative)"),
        (50, "Test 2: Conflict (Emotional Arc)"),
        (150, "Test 3: Late Plot (Sequence)"),
    )

    for position, (offset, label) in enumerate(scenarios):
        if position:
            time.sleep(2)  # Cool down between model calls
        sample_text = load_sample(start_index=offset)
        run_test(label, sample_text)


if __name__ == "__main__":
    main()
@@ -1 +0,0 @@
|
||||
../v1.1/scripts/test_parent_chunk_generation_v1.11.py
|
||||
Reference in New Issue
Block a user