refactor: remove face embedding architecture - single Qdrant _faces collection

- Delete FaceEmbeddingDb module (face_embedding_db.rs)
- Stub match_faces_iterative, generate_seed_embeddings, tmdb_match_handler
- Remove sync_trace_embeddings, populate_face_embeddings_to_qdrant
- Remove embedding from face.json output (face_processor.py)
- Remove embedding from PG UPDATE (store_traced_faces.py)
- Remove workspace traces staging (checkin.rs, qdrant_workspace.rs)
- Fix tests: add pose_angle to Face, hand_nodes to TkgResult

Disabled functions (need to be reimplemented with _faces):
- match_faces_iterative (identity agent)
- generate_seed_embeddings (TMDb seeds)
- tmdb_match_handler (TMDb matching)
- cluster_face_embeddings, search_similar_faces
- merge_traces_within_cuts
This commit is contained in:
Accusys
2026-06-24 22:27:09 +08:00
parent 360cb991e1
commit 074cdcdbed
60 changed files with 657 additions and 9454 deletions

View File

@@ -1,200 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
POC: MediaPipe Face Detection vs Apple Vision Framework vs InsightFace
Tests face detection on video frames and reports:
- Detection count
- Bounding box quality
- Landmarks (468 face mesh)
- Processing speed
"""
import sys
import json
import os
import time
import subprocess
import argparse
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
def extract_frames(video_path, sample_interval=30, max_frames=50):
    """Sample frames from a video into a fresh temporary directory via ffmpeg.

    Args:
        video_path: Path of the video handed to ffmpeg's ``-i``.
        sample_interval: Keep every Nth decoded frame (ffmpeg ``select`` filter).
        max_frames: Upper bound on the number of frame paths returned.

    Returns:
        ``(tmpdir, frame_paths)``. The caller owns ``tmpdir`` and is expected
        to remove it when done.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
        OSError: ffmpeg could not be launched.
    """
    import shutil
    import tempfile

    tmpdir = tempfile.mkdtemp(prefix="face_test_")
    pattern = os.path.join(tmpdir, "frame_%05d.jpg")
    cmd = [
        "ffmpeg", "-y", "-v", "quiet", "-i", video_path,
        "-vf", f"select=not(mod(n\\,{sample_interval}))",
        "-vsync", "vfr", "-q:v", "5", pattern,
    ]
    try:
        subprocess.run(cmd, check=True)
    except BaseException:
        # The caller never receives tmpdir on failure, so nobody else could
        # clean it up; remove it here before propagating the error.
        shutil.rmtree(tmpdir, ignore_errors=True)
        raise

    names = sorted(f for f in os.listdir(tmpdir) if f.endswith(".jpg"))[:max_frames]
    return tmpdir, [os.path.join(tmpdir, name) for name in names]
def _ensure_mediapipe_model(model_dir, filename, url, message):
    """Return the local path of a model file, downloading it on first use."""
    import urllib.request

    model_path = os.path.join(model_dir, filename)
    if not os.path.exists(model_path):
        print(message)
        urllib.request.urlretrieve(url, model_path)
    return model_path


def test_mediapipe(frame_paths, fps):
    """Run MediaPipe face detection and face landmarking over extracted frames.

    Args:
        frame_paths: Paths of JPEG frames to analyse.
        fps: Accepted for signature parity with the other test functions;
            not used by this function.

    Returns:
        ``None`` when MediaPipe cannot be imported, otherwise a dict with
        ``frames_processed``, ``frames_with_faces``, ``total_faces``,
        ``time_sec`` (detection pass only) and ``landmarks_per_face``.
    """
    try:
        # NOTE(review): import path follows the MediaPipe Tasks Python guide;
        # confirm against the installed mediapipe version.
        import mediapipe as mp
        from mediapipe.tasks import python as mp_python
        from mediapipe.tasks.python import vision
    except ImportError:
        print("[MediaPipe] Not available, skipping")
        return None
    import cv2

    model_dir = os.path.join(os.path.dirname(__file__), "models")
    os.makedirs(model_dir, exist_ok=True)

    def load_image(path):
        """Read a frame and wrap it as an SRGB mp.Image, or None if unreadable."""
        bgr = cv2.imread(path)
        if bgr is None:
            return None
        # OpenCV decodes to BGR; the image is declared as SRGB below, so the
        # channel order has to be swapped first.
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        return mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb)

    # --- Face detection over every frame ---
    model_url = "https://storage.googleapis.com/mediapipe-models/face_detector/blaze_face_short_range/float16/latest/face_detector.task"
    detector_path = _ensure_mediapipe_model(
        model_dir, "face_detector.task", model_url,
        f"[MediaPipe] Downloading model: {model_url}",
    )
    total_faces = 0
    frames_with_faces = 0
    t0 = time.time()
    detector_options = vision.FaceDetectorOptions(
        base_options=mp_python.BaseOptions(model_asset_path=detector_path),
        running_mode=vision.RunningMode.IMAGE,
    )
    with vision.FaceDetector.create_from_options(detector_options) as detector:
        for path in frame_paths:
            image = load_image(path)
            if image is None:
                continue
            detections = detector.detect(image).detections
            if detections:
                frames_with_faces += 1
                total_faces += len(detections)
    elapsed = time.time() - t0
    print(f"[MediaPipe] Detection: {len(frame_paths)} frames, {frames_with_faces} with faces, {total_faces} faces, {elapsed:.2f}s")

    # --- Face landmarks on the first 10 frames only ---
    landmark_url = "https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/latest/face_landmarker.task"
    landmark_path = _ensure_mediapipe_model(
        model_dir, "face_landmarker.task", landmark_url,
        f"[MediaPipe] Downloading landmark model...",
    )
    landmarks_total = 0
    faces_measured = 0
    landmarker_options = vision.FaceLandmarkerOptions(
        base_options=mp_python.BaseOptions(model_asset_path=landmark_path),
        running_mode=vision.RunningMode.IMAGE,
        output_face_blendshapes=False,
        output_facial_transformation_matrixes=False,
    )
    with vision.FaceLandmarker.create_from_options(landmarker_options) as landmarker:
        for path in frame_paths[:10]:
            image = load_image(path)
            if image is None:
                continue
            for face in landmarker.detect(image).face_landmarks or []:
                faces_measured += 1
                landmarks_total += len(face)

    # Average over every face seen, not just the faces of the last frame
    # (which may not exist if no frame could be read).
    per_face = landmarks_total // faces_measured if faces_measured else 0
    print(f"[MediaPipe] Face Mesh (10 frames): {landmarks_total} total landmarks (~{per_face} per face)")

    return {
        "frames_processed": len(frame_paths),
        "frames_with_faces": frames_with_faces,
        "total_faces": total_faces,
        "time_sec": elapsed,
        # Measured value when any face was landmarked; otherwise the nominal
        # face-mesh size the original report used.
        "landmarks_per_face": per_face or 468,
    }
def test_vision_framework(frame_paths, fps):
    """Run the Swift ``face_compare_test`` binary and time it.

    Args:
        frame_paths: Extracted frame paths; the first one is used to derive
            the directory argument passed to the binary.
        fps: Accepted for signature parity; not used by this function.

    Returns:
        ``{"time_sec": elapsed}`` on completion, or ``None`` when the binary
        is missing, there are no frames, or the run times out.
    """
    swift_face = os.path.join(os.path.dirname(__file__),
                              "swift_processors/.build/debug/face_compare_test")
    if not os.path.exists(swift_face):
        print("[Vision] Binary not found, skipping")
        return None
    if not frame_paths:
        # frame_paths[0] below would raise IndexError.
        print("[Vision] No frames extracted, skipping")
        return None

    print(f"[Vision] Running face compare test...")
    # NOTE(review): the original author flagged that this derived path
    # "won't work for single files" — confirm what the binary expects.
    source_dir = frame_paths[0].rsplit("/", 2)[0].replace("/frames", "")
    t0 = time.time()
    try:
        result = subprocess.run(
            [swift_face, source_dir,
             "--sample-interval", "1", "--max-frames", str(len(frame_paths))],
            capture_output=True, text=True, timeout=120,
        )
    except subprocess.TimeoutExpired:
        print("[Vision] Timed out after 120s, skipping")
        return None
    elapsed = time.time() - t0
    print(result.stdout[-500:])
    return {"time_sec": elapsed}
def main():
    """CLI entry point: extract frames, run both detectors, print a comparison."""
    import shutil

    parser = argparse.ArgumentParser()
    parser.add_argument("video_path")
    parser.add_argument("--sample-interval", type=int, default=30)
    parser.add_argument("--max-frames", type=int, default=50)
    args = parser.parse_args()

    print(f"Testing: {args.video_path}")

    # Extract frames
    tmpdir, frames = extract_frames(args.video_path, args.sample_interval, args.max_frames)
    try:
        print(f"Extracted {len(frames)} frames")

        # MediaPipe
        print("\n=== MediaPipe ===")
        mp_result = test_mediapipe(frames, 24)

        # Vision Framework
        print("\n=== Apple Vision Framework ===")
        test_vision_framework(frames, 24)

        # Summary
        print("\n=== Comparison ===")
        if mp_result:
            print(f"MediaPipe: {mp_result['total_faces']} faces in {mp_result['frames_with_faces']} frames, {mp_result['time_sec']:.2f}s")
            print(f" Landmarks: {mp_result['landmarks_per_face']} per face")
        print(f"Vision Framework: (see above)")
    finally:
        # Cleanup runs even when one of the tests raises, so extracted
        # frames do not pile up in the temp directory.
        shutil.rmtree(tmpdir, ignore_errors=True)


if __name__ == "__main__":
    main()

View File

@@ -1 +0,0 @@
../v1.1/scripts/face_mediapipe_test_v1.11.py

View File

@@ -225,8 +225,9 @@ class FaceProcessorVision:
if face_img.size == 0:
continue
# CoreML embedding
emb = self.extract_face_embedding(face_img)
# CoreML embedding - TODO: push to Qdrant _faces collection instead
# emb = self.extract_face_embedding(face_img)
emb = None
if emb is not None:
embed_count += 1
@@ -240,7 +241,6 @@ class FaceProcessorVision:
faces.append({
"x": x, "y": y, "width": w, "height": h,
"confidence": face.get("confidence", 0.5),
"embedding": emb,
"pose_angle": {
"angle": pose_angle,
"roll": pose_info.get("roll", 0),
@@ -262,20 +262,17 @@ class FaceProcessorVision:
if len(face_data["frames"]) % 100 == 0:
elapsed = time.time() - t0
print(f"[FACE_V2] {len(face_data['frames'])} frames, {embed_count} embeddings, {elapsed:.0f}s")
print(f"[FACE_V2] {len(face_data['frames'])} frames, {elapsed:.0f}s")
if self.publisher:
pct = int(len(face_data["frames"]) * 100 / max(len(frames), 1))
if pct > last_pct:
last_pct = pct
self.publisher.progress("face", len(face_data["frames"]), len(frames),
f"{embed_count} faces", embed_count, "faces")
"", 0, "faces")
self.video.release()
# Finalize
face_data["metadata"]["status"] = "completed"
face_data["metadata"]["total_embeddings"] = embed_count
face_data["metadata"]["embedder"] = "coreml_facenet"
# Convert dict frames to list for Rust FaceResult format
frames_list = []

View File

@@ -1,228 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
Regenerate ALL parent chunks for 384b0ff44aaaa1f1 using gemma4
Groups ASR chunks into ~17 logical scenes and generates summaries.
"""
import json
import subprocess
import psycopg2
import psycopg2.extras
# Keyword arguments passed to psycopg2.connect() by every DB helper in this script.
DB_CONFIG = {"host": "localhost", "user": "accusys", "dbname": "momentry"}
# The single video whose chunks this script reads and re-parents.
UUID = "384b0ff44aaaa1f1"
# Ollama generate endpoint and model name used by call_gemma4().
OLLAMA_URL = "http://localhost:11434/api/generate"
MODEL = "gemma4:latest"
# Target ~17 scenes across 6865s = ~400s per scene
# But use natural breaks (gaps in dialogue) to split
SCENE_TARGET_COUNT = 17
def get_chunks():
    """Fetch all sentence chunks of ``UUID`` ordered by start time.

    Returns:
        A list of dict-like rows (``RealDictCursor``) with the columns
        selected below.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        try:
            cur.execute(
                """
                SELECT id, chunk_id, start_time, end_time, start_frame, end_frame,
                text_content, fps
                FROM chunks
                WHERE uuid = %s AND chunk_type = 'sentence'
                ORDER BY start_time
                """,
                (UUID,),
            )
            return cur.fetchall()
        finally:
            cur.close()
    finally:
        # Release the connection even when the query raises.
        conn.close()
def call_gemma4(prompt, max_tokens=300):
    """Send ``prompt`` to the Ollama endpoint via curl and return the reply text.

    Best-effort: any failure (curl error, timeout, unparsable output) is
    reported on stdout where applicable and results in an empty string.
    """
    request = {
        "model": MODEL,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.3, "num_predict": max_tokens},
    }
    reply = ""
    try:
        curl_cmd = ["curl", "-s", OLLAMA_URL, "-d", json.dumps(request)]
        completed = subprocess.run(curl_cmd, capture_output=True, text=True, timeout=180)
        if completed.returncode == 0:
            parsed = json.loads(completed.stdout)
            reply = parsed.get("response", "").strip()
    except Exception as e:
        print(f" ⚠️ Ollama error: {e}")
        reply = ""
    return reply
def find_scene_boundaries(chunks, target_count=SCENE_TARGET_COUNT):
    """Split time-ordered chunks into scenes at the largest dialogue gaps.

    Args:
        chunks: Rows with ``start_time`` and ``end_time``, ordered by time.
        target_count: Desired number of scenes. Values below 1 are treated
            as 1 (a single scene containing everything).

    Returns:
        A list of scenes, each a non-empty list of consecutive chunks. Empty
        input yields ``[]``; at most ``min(target_count, len(chunks))``
        scenes are produced.
    """
    if not chunks:
        return []

    # A non-positive target would otherwise become a negative slice bound
    # below (e.g. gaps[:-1]) and split at nearly every gap.
    max_splits = max(target_count - 1, 0)

    # (index of the chunk that starts after the gap, gap length)
    gaps = [
        (i, chunks[i]["start_time"] - chunks[i - 1]["end_time"])
        for i in range(1, len(chunks))
    ]
    # Largest gaps first; the sort is stable, so ties keep the earlier gap.
    gaps.sort(key=lambda g: g[1], reverse=True)
    split_indices = sorted(index for index, _ in gaps[:max_splits])

    scenes = []
    start = 0
    for split in split_indices:
        scenes.append(chunks[start:split])
        start = split
    scenes.append(chunks[start:])
    return scenes
def generate_summary(scene_chunks, scene_num):
    """Return a one-sentence summary of a scene's dialogue, produced by gemma4.

    Scenes without any dialogue text get a fixed placeholder; if the model
    returns nothing, the start of the dialogue is used instead.
    """
    dialogue = []
    for chunk in scene_chunks:
        if chunk["text_content"]:
            dialogue.append(chunk["text_content"])
    if len(dialogue) == 0:
        return f"Scene {scene_num}: No dialogue"

    # Cap the transcript so the prompt stays bounded.
    combined = " ".join(dialogue)[:3000]
    first_chunk, last_chunk = scene_chunks[0], scene_chunks[-1]
    duration = last_chunk["end_time"] - first_chunk["start_time"]
    prompt = f"""You are a professional film scene analyst. Given the following dialogue transcript from a movie scene, write a concise one-sentence English summary.
Duration: {duration:.0f} seconds
Dialogue:
{combined}
Provide ONLY the summary sentence, nothing else. Focus on plot events and character actions."""
    model_reply = call_gemma4(prompt, max_tokens=250)
    if model_reply:
        return model_reply
    # Fallback: use first few words of dialogue
    return f"Scene {scene_num}: {' '.join(dialogue[:3])[:80]}..."
def insert_parent_chunks(scenes):
    """Insert one ``parent_chunks`` row per scene and link its child chunks.

    For each scene a summary is generated, a parent row is inserted, and the
    scene's rows in ``chunks`` get ``parent_chunk_id`` set to the new id.
    Work is committed every five scenes and at the end, so a failure part-way
    through leaves the already-committed batches in place.

    Args:
        scenes: List of non-empty lists of chunk rows (see
            ``find_scene_boundaries``).

    Returns:
        Number of parent chunks inserted.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    inserted = 0
    try:
        cur = conn.cursor()
        for i, scene_chunks in enumerate(scenes):
            first, last = scene_chunks[0], scene_chunks[-1]
            start_time = first["start_time"]
            end_time = last["end_time"]
            start_frame = int(first["start_frame"])
            end_frame = int(last["end_frame"])
            # Fall back to 59.94 when the row carries no fps value.
            fps = float(first["fps"]) if first["fps"] else 59.94
            chunk_count = len(scene_chunks)
            print(
                f" Scene {i}: {start_time:.0f}s-{end_time:.0f}s ({chunk_count} chunks, {end_time - start_time:.0f}s)"
            )

            # Generate summary
            summary = generate_summary(scene_chunks, i)
            print(f" 📝 {summary[:100]}...")

            # Insert parent chunk
            cur.execute(
                """
                INSERT INTO parent_chunks (
                uuid, scene_order, start_time, end_time,
                start_frame, end_frame, fps, summary_text,
                metadata, rule_3_markers, created_at
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, NOW())
                RETURNING id
                """,
                (
                    UUID,
                    i,
                    start_time,
                    end_time,
                    start_frame,
                    end_frame,
                    fps,
                    summary,
                    json.dumps({"auto_generated_by": "gemma4", "chunk_count": chunk_count}),
                    json.dumps({}),
                ),
            )
            parent_id = cur.fetchone()[0]

            # Update chunks with parent_chunk_id
            chunk_ids = [c["chunk_id"] for c in scene_chunks]
            cur.execute(
                """
                UPDATE chunks
                SET parent_chunk_id = %s::varchar
                WHERE uuid = %s AND chunk_id = ANY(%s)
                """,
                (str(parent_id), UUID, chunk_ids),
            )
            inserted += 1

            if i % 5 == 4 or i == len(scenes) - 1:
                conn.commit()
                print(f" ✅ Committed scenes 0-{i}")

        conn.commit()
        cur.close()
    finally:
        # Always release the connection; closing it without a commit
        # discards whatever the current batch had not yet committed.
        conn.close()
    return inserted
def main():
    """Regenerate parent chunks for ``UUID``: fetch, split, summarise, verify."""
    print(f"🎬 Regenerating parent chunks for {UUID}")
    print(f" Using model: {MODEL}")
    print("=" * 70)

    # Step 1: Get all chunks
    print("\n📥 Fetching ASR chunks...")
    chunks = get_chunks()
    print(f" Found {len(chunks)} sentence chunks")
    if chunks:
        print(f" Time range: 0-{chunks[-1]['end_time']:.0f}s")

    # Step 2: Find scene boundaries
    print(f"\n🔍 Finding {SCENE_TARGET_COUNT} scene boundaries...")
    scenes = find_scene_boundaries(chunks, SCENE_TARGET_COUNT)
    print(f" Created {len(scenes)} scenes")
    for i, s in enumerate(scenes):
        print(
            f" Scene {i}: {s[0]['start_time']:.0f}s-{s[-1]['end_time']:.0f}s ({len(s)} chunks)"
        )

    # Step 3: Generate summaries and insert
    print("\n🤖 Generating summaries with gemma4...")
    inserted = insert_parent_chunks(scenes)
    print(f"\n{'=' * 70}")
    print(f"✅ Created {inserted} parent chunks")

    # Step 4: Verify
    print("\n📊 Verification:")
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        cur = conn.cursor()
        cur.execute("SELECT COUNT(*) FROM parent_chunks WHERE uuid = %s", (UUID,))
        print(f" parent_chunks: {cur.fetchone()[0]}")
        cur.execute(
            "SELECT COUNT(*) FROM chunks WHERE uuid = %s AND parent_chunk_id IS NULL AND chunk_type = 'sentence'",
            (UUID,),
        )
        print(f" orphan chunks: {cur.fetchone()[0]}")
        cur.close()
    finally:
        # Release the connection even if a verification query fails.
        conn.close()


if __name__ == "__main__":
    main()

View File

@@ -1 +0,0 @@
../v1.1/scripts/generate_parent_chunks_gemma4_v1.11.py

View File

@@ -1,711 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
MediaPipe Holistic Processor - Full body keypoint extraction
Purpose:
1. Extract Face Mesh (468 keypoints) → eye/mouth actions
2. Extract Pose (33 keypoints) → arm/leg/feet actions
3. Extract Hands (21 keypoints × 2) → hand gestures
Output structure:
{
"metadata": {...},
"frames": {
"frame_num": {
"persons": [
{
"person_id": 0,
"bbox": {...},
"face_mesh": {
"landmarks": [[x,y,z], ...], # 468 points
"eye_features": {...},
"mouth_features": {...},
},
"pose": {
"landmarks": [[x,y,z,visibility], ...], # 33 points
"arm_features": {...},
"leg_features": {...},
},
"hands": {
"left": {
"landmarks": [[x,y,z], ...], # 21 points
"gesture": "...",
},
"right": {
"landmarks": [[x,y,z], ...], # 21 points
"gesture": "...",
},
},
}
]
}
}
}
"""
import json
import argparse
import cv2
import numpy as np
import mediapipe as mp
from typing import Dict
class MediaPipeHolisticProcessor:
    """
    Process video with MediaPipe Holistic (Face + Pose + Hands).

    Each processed frame yields one "person" record with optional
    ``face_mesh``, ``pose`` and ``hands`` sections plus a bounding box
    derived from the visible pose landmarks. Landmark coordinates are
    converted to pixels; derived ratios and angles are computed on the
    normalized coordinates.
    """

    # Landmark index tables. They do not depend on constructor arguments, so
    # they live on the class; ``self.NAME`` lookups keep working unchanged.

    # Eye landmark indices (Face Mesh)
    LEFT_EYE_INDICES = [33, 133, 159, 145, 158, 144]  # 6 points
    RIGHT_EYE_INDICES = [362, 263, 386, 374, 385, 373]
    # Iris indices
    LEFT_IRIS_CENTER = 468
    RIGHT_IRIS_CENTER = 473
    # Mouth indices
    MOUTH_TOP = 13
    MOUTH_BOTTOM = 14
    MOUTH_LEFT = 61
    MOUTH_RIGHT = 291
    # Pose key indices
    POSE_KEYPOINTS = {
        "nose": 0,
        "left_shoulder": 11,
        "right_shoulder": 12,
        "left_elbow": 13,
        "right_elbow": 14,
        "left_wrist": 15,
        "right_wrist": 16,
        "left_hip": 23,
        "right_hip": 24,
        "left_knee": 25,
        "right_knee": 26,
        "left_ankle": 27,
        "right_ankle": 28,
    }
    # Hand key indices
    HAND_KEYPOINTS = {
        "wrist": 0,
        "thumb_cmc": 1,
        "thumb_mcp": 2,
        "thumb_ip": 3,
        "thumb_tip": 4,
        "index_mcp": 5,
        "index_pip": 6,
        "index_dip": 7,
        "index_tip": 8,
        "middle_mcp": 9,
        "middle_pip": 10,
        "middle_dip": 11,
        "middle_tip": 12,
        "ring_mcp": 13,
        "ring_pip": 14,
        "ring_dip": 15,
        "ring_tip": 16,
        "pinky_mcp": 17,
        "pinky_pip": 18,
        "pinky_dip": 19,
        "pinky_tip": 20,
    }

    def __init__(
        self,
        model_complexity: int = 1,  # 0, 1, 2
        refine_face_landmarks: bool = True,
        enable_segmentation: bool = False,
        min_detection_confidence: float = 0.5,
        min_tracking_confidence: float = 0.5,
    ):
        """
        Initialize MediaPipe Holistic

        Args:
            model_complexity: 0 (lite), 1 (full), 2 (heavy)
            refine_face_landmarks: Enable iris detection
            enable_segmentation: Enable segmentation mask
            min_detection_confidence: Detection confidence threshold
            min_tracking_confidence: Tracking confidence threshold
        """
        self.mp_holistic = mp.solutions.holistic
        self.mp_drawing = mp.solutions.drawing_utils
        self.mp_drawing_styles = mp.solutions.drawing_styles

        # Remembered so process_video() can report the real configuration.
        self.model_complexity = model_complexity
        self.refine_face_landmarks = refine_face_landmarks

        self.holistic = self.mp_holistic.Holistic(
            static_image_mode=False,  # Video mode
            model_complexity=model_complexity,
            smooth_landmarks=True,  # Smooth landmarks across frames
            enable_segmentation=enable_segmentation,
            smooth_segmentation=True,
            refine_face_landmarks=refine_face_landmarks,
            min_detection_confidence=min_detection_confidence,
            min_tracking_confidence=min_tracking_confidence,
        )

    def process_frame(self, frame: np.ndarray) -> Dict:
        """
        Process single frame

        Args:
            frame: BGR image

        Returns:
            Dict with person_id, bbox, face_mesh, pose and hands data;
            sections without a detection stay ``None``.
        """
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.holistic.process(frame_rgb)

        person_data = {
            "person_id": 0,
            "bbox": None,
            "face_mesh": None,
            "pose": None,
            "hands": {"left": None, "right": None},
        }

        height, width = frame.shape[:2]

        # Extract face mesh
        if results.face_landmarks:
            person_data["face_mesh"] = self._extract_face_mesh(results.face_landmarks, width, height)

        # Extract pose
        if results.pose_landmarks:
            person_data["pose"] = self._extract_pose(results.pose_landmarks, width, height)

        # Extract hands
        if results.left_hand_landmarks:
            person_data["hands"]["left"] = self._extract_hand(results.left_hand_landmarks, "left", width, height)
        if results.right_hand_landmarks:
            person_data["hands"]["right"] = self._extract_hand(results.right_hand_landmarks, "right", width, height)

        # Calculate bbox from pose landmarks whose visibility exceeds 0.5
        if results.pose_landmarks:
            visible = [lm for lm in results.pose_landmarks.landmark if lm.visibility > 0.5]
            if visible:
                x_min = min(lm.x for lm in visible)
                x_max = max(lm.x for lm in visible)
                y_min = min(lm.y for lm in visible)
                y_max = max(lm.y for lm in visible)
                person_data["bbox"] = {
                    "x": int(x_min * width),
                    "y": int(y_min * height),
                    "width": int((x_max - x_min) * width),
                    "height": int((y_max - y_min) * height),
                }

        return person_data

    @staticmethod
    def _joint_angle(points, a_idx: int, b_idx: int, c_idx: int) -> float:
        """Return the 2-D angle in degrees at ``points[b_idx]`` between a and c.

        Returns 0.0 when either arm of the angle has zero length. The cosine
        is clipped to [-1, 1] so rounding error on collinear points cannot
        push ``arccos`` out of its domain and produce NaN.
        """
        a, b, c = points[a_idx], points[b_idx], points[c_idx]
        v1 = np.array([a.x - b.x, a.y - b.y])
        v2 = np.array([c.x - b.x, c.y - b.y])
        denom = np.linalg.norm(v1) * np.linalg.norm(v2)
        if denom == 0:
            return 0.0
        cosine = np.clip(np.dot(v1, v2) / denom, -1.0, 1.0)
        return float(np.degrees(np.arccos(cosine)))

    def _extract_face_mesh(self, face_landmarks, width: int, height: int) -> Dict:
        """
        Extract face mesh landmarks and calculate features

        Args:
            face_landmarks: MediaPipe face landmarks
            width: Frame width in pixels
            height: Frame height in pixels

        Returns:
            Dict with landmarks (in pixels), eye_features, mouth_features
        """
        points = face_landmarks.landmark
        landmarks = [[int(lm.x * width), int(lm.y * height), lm.z] for lm in points]

        def planar_distance(a, b):
            return np.linalg.norm([a.x - b.x, a.y - b.y])

        # Eye Aspect Ratio (EAR): mean vertical opening over horizontal width
        def calculate_ear(eye_indices):
            p1, p2, p3, p4, p5, p6 = (points[i] for i in eye_indices)
            vertical_1 = planar_distance(p3, p5)
            vertical_2 = planar_distance(p4, p6)
            horizontal = planar_distance(p1, p2)
            return (vertical_1 + vertical_2) / (2 * horizontal) if horizontal > 0 else 0

        left_ear = calculate_ear(self.LEFT_EYE_INDICES)
        right_ear = calculate_ear(self.RIGHT_EYE_INDICES)
        avg_ear = (left_ear + right_ear) / 2

        # Iris position (only present when refined landmarks are enabled)
        left_iris_x = None
        right_iris_x = None
        if len(points) > 477:
            left_iris = points[self.LEFT_IRIS_CENTER]
            right_iris = points[self.RIGHT_IRIS_CENTER]

            # Normalize iris position relative to the eye's centre and width
            left_eye_center_x = (points[33].x + points[133].x) / 2
            right_eye_center_x = (points[362].x + points[263].x) / 2
            left_eye_width = abs(points[33].x - points[133].x)
            right_eye_width = abs(points[362].x - points[263].x)

            left_iris_x = (left_iris.x - left_eye_center_x) / left_eye_width if left_eye_width > 0 else 0
            right_iris_x = (right_iris.x - right_eye_center_x) / right_eye_width if right_eye_width > 0 else 0

        # Eye action detection
        if avg_ear < 0.15:
            eye_action = "closed"
        elif avg_ear > 0.4:
            eye_action = "wide_open"
        elif 0.15 <= avg_ear < 0.25:
            eye_action = "squint"
        else:
            eye_action = "normal"

        # Gaze direction. Compare against None explicitly: an offset of
        # exactly 0 is a valid (perfectly centred) measurement, not "missing".
        gaze_direction = "center"
        if left_iris_x is not None and right_iris_x is not None:
            avg_iris_x = (left_iris_x + right_iris_x) / 2
            if avg_iris_x < -0.2:
                gaze_direction = "left"
            elif avg_iris_x > 0.2:
                gaze_direction = "right"

        # Mouth Aspect Ratio (MAR)
        mouth_top = points[self.MOUTH_TOP]
        mouth_bottom = points[self.MOUTH_BOTTOM]
        mouth_left = points[self.MOUTH_LEFT]
        mouth_right = points[self.MOUTH_RIGHT]

        mouth_height = planar_distance(mouth_top, mouth_bottom)
        mouth_width = planar_distance(mouth_left, mouth_right)
        mar = mouth_height / mouth_width if mouth_width > 0 else 0

        # Mouth corner distance (for smile detection)
        mouth_center_y = (mouth_top.y + mouth_bottom.y) / 2
        corner_lift = (mouth_center_y - mouth_left.y) + (mouth_center_y - mouth_right.y)

        # Mouth action detection
        if mar > 0.7:
            mouth_action = "yawn"
        elif mar > 0.5:
            mouth_action = "open"
        elif mar < 0.2:
            mouth_action = "smile" if corner_lift > 0.02 else "closed"
        else:
            mouth_action = "slightly_open"

        return {
            "landmarks": landmarks,
            "num_landmarks": len(landmarks),
            "eye_features": {
                "left_ear": round(left_ear, 4),
                "right_ear": round(right_ear, 4),
                "avg_ear": round(avg_ear, 4),
                "left_iris_x": round(left_iris_x, 4) if left_iris_x is not None else None,
                "right_iris_x": round(right_iris_x, 4) if right_iris_x is not None else None,
                "eye_action": eye_action,
                "gaze_direction": gaze_direction,
            },
            "mouth_features": {
                "mar": round(mar, 4),
                "mouth_height": round(mouth_height, 4),
                "mouth_width": round(mouth_width, 4),
                "corner_lift": round(corner_lift, 4),
                "mouth_action": mouth_action,
            },
        }

    def _extract_pose(self, pose_landmarks, width: int, height: int) -> Dict:
        """
        Extract pose landmarks and calculate features

        Args:
            pose_landmarks: MediaPipe pose landmarks
            width: Frame width in pixels
            height: Frame height in pixels

        Returns:
            Dict with landmarks (in pixels), arm_features, leg_features
        """
        points = pose_landmarks.landmark
        landmarks = [
            [int(lm.x * width), int(lm.y * height), lm.z, lm.visibility]
            for lm in points
        ]

        # Arm features
        left_elbow_angle = self._joint_angle(points, 11, 13, 15)  # shoulder-elbow-wrist
        right_elbow_angle = self._joint_angle(points, 12, 14, 16)

        left_wrist, left_elbow, left_shoulder = points[15], points[13], points[11]
        right_wrist, right_elbow, right_shoulder = points[16], points[14], points[12]

        # Arm raised: wrist above elbow above shoulder (y increases downward)
        left_arm_raised = left_wrist.y < left_elbow.y < left_shoulder.y
        right_arm_raised = right_wrist.y < right_elbow.y < right_shoulder.y

        def arm_action(raised, elbow_angle, side):
            if raised:
                return f"raise_{side}"
            if elbow_angle > 150:
                return f"extend_{side}"
            if elbow_angle < 90:
                return f"fold_{side}"
            return f"neutral_{side}"

        left_arm_action = arm_action(left_arm_raised, left_elbow_angle, "left")
        right_arm_action = arm_action(right_arm_raised, right_elbow_angle, "right")

        # Cross arms detection
        cross_arms = left_wrist.x > right_wrist.x and right_wrist.x < left_shoulder.x

        # Leg features
        left_knee_angle = self._joint_angle(points, 23, 25, 27)  # hip-knee-ankle
        right_knee_angle = self._joint_angle(points, 24, 26, 28)

        left_hip, left_knee, left_ankle = points[23], points[25], points[27]
        right_hip, right_knee, right_ankle = points[24], points[26], points[28]

        hip_avg_y = (left_hip.y + right_hip.y) / 2
        knee_avg_y = (left_knee.y + right_knee.y) / 2

        # Standing: hip < knee < ankle (y increases downward)
        standing = left_hip.y < left_knee.y < left_ankle.y and right_hip.y < right_knee.y < right_ankle.y
        # Sitting: hip ≈ knee height
        sitting = abs(hip_avg_y - knee_avg_y) < 0.1

        # Leg action detection (sitting takes precedence over standing)
        leg_action = "unknown"
        if sitting:
            leg_action = "sit"
        elif standing:
            if left_knee_angle < 120 or right_knee_angle < 120:
                leg_action = "knee_bend"
            else:
                leg_action = "stand"

        return {
            "landmarks": landmarks,
            "num_landmarks": len(landmarks),
            "arm_features": {
                "left_elbow_angle": round(left_elbow_angle, 2),
                "right_elbow_angle": round(right_elbow_angle, 2),
                "left_arm_raised": left_arm_raised,
                "right_arm_raised": right_arm_raised,
                "left_arm_action": left_arm_action,
                "right_arm_action": right_arm_action,
                "cross_arms": cross_arms,
            },
            "leg_features": {
                "left_knee_angle": round(left_knee_angle, 2),
                "right_knee_angle": round(right_knee_angle, 2),
                "standing": standing,
                "sitting": sitting,
                "leg_action": leg_action,
            },
        }

    def _extract_hand(self, hand_landmarks, hand_type: str, width: int, height: int) -> Dict:
        """
        Extract hand landmarks and detect gesture

        Args:
            hand_landmarks: MediaPipe hand landmarks
            hand_type: "left" or "right"
            width: Frame width in pixels
            height: Frame height in pixels

        Returns:
            Dict with landmarks (in pixels), finger extension flags and gesture
        """
        points = hand_landmarks.landmark
        landmarks = [[int(lm.x * width), int(lm.y * height), lm.z] for lm in points]

        def is_finger_extended(tip_idx, pip_idx):
            # Finger is extended if tip is higher (lower y) than pip
            return points[tip_idx].y < points[pip_idx].y

        thumb_extended = is_finger_extended(4, 3)
        index_extended = is_finger_extended(8, 6)
        middle_extended = is_finger_extended(12, 10)
        ring_extended = is_finger_extended(16, 14)
        pinky_extended = is_finger_extended(20, 18)

        extensions = {
            "thumb": thumb_extended,
            "index": index_extended,
            "middle": middle_extended,
            "ring": ring_extended,
            "pinky": pinky_extended,
        }

        # Gesture detection
        gesture = "unknown"
        num_extended = sum(extensions.values())

        if num_extended == 5:
            gesture = "open_hand"
        elif num_extended == 0:
            gesture = "fist"
        elif thumb_extended and num_extended == 1:
            gesture = "thumbs_up"
        elif index_extended and middle_extended and num_extended == 2:
            gesture = "peace_sign"
        elif index_extended and num_extended == 1:
            gesture = "pointing"
        elif thumb_extended and index_extended and not any([middle_extended, ring_extended, pinky_extended]):
            # Check thumb-index distance for OK gesture
            thumb_tip = points[4]
            index_tip = points[8]
            distance = np.linalg.norm([thumb_tip.x - index_tip.x, thumb_tip.y - index_tip.y])
            gesture = "ok_sign" if distance < 0.05 else "grab"

        return {
            "landmarks": landmarks,
            "num_landmarks": len(landmarks),
            "finger_extensions": extensions,
            "num_fingers_extended": num_extended,
            "gesture": gesture,
            "hand_type": hand_type,
        }

    def process_video(
        self,
        video_path: str,
        output_path: str,
        sample_interval: int = 1,
        uuid: str = "",
    ) -> Dict:
        """
        Process entire video

        Args:
            video_path: Path to video file
            output_path: Path to output JSON
            sample_interval: Process every N frames
            uuid: UUID for progress reporting (currently not used here)

        Returns:
            Dict with all processed data, or ``{}`` if the video cannot be
            opened. Progress markers are written to stderr.
        """
        # The file's top-level imports do not include sys, so bring it into
        # scope here; every progress line below writes to sys.stderr.
        import sys

        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            print(f"MEDIAPIPE_ERROR:Cannot open video: {video_path}", file=sys.stderr)
            return {}

        try:
            fps = cap.get(cv2.CAP_PROP_FPS)
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

            print(f"MEDIAPIPE_START", file=sys.stderr)
            print(f"MEDIAPIPE_INFO:FPS={fps},total={total_frames},interval={sample_interval}", file=sys.stderr)

            output_data = {
                "metadata": {
                    "video_path": video_path,
                    "fps": fps,
                    "width": width,
                    "height": height,
                    "total_frames": total_frames,
                    "sample_interval": sample_interval,
                    "processor": "mediapipe_holistic",
                    # Report the configuration this instance was built with.
                    "model_complexity": self.model_complexity,
                    "refine_face_landmarks": self.refine_face_landmarks,
                },
                "frames": {},
            }

            # NOTE(review): frame numbers are 1-based here, so the first
            # frame gets timestamp 1/fps rather than 0 — confirm consumers
            # expect that before changing it.
            frame_count = 0
            processed_count = 0

            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                frame_count += 1
                if frame_count % sample_interval != 0:
                    continue

                # Process frame
                person_data = self.process_frame(frame)

                # Only save if landmarks detected
                hands = person_data["hands"]
                if person_data["face_mesh"] or person_data["pose"] or hands["left"] or hands["right"]:
                    timestamp = frame_count / fps if fps > 0 else 0
                    output_data["frames"][str(frame_count)] = {
                        "frame_number": frame_count,
                        "timestamp": round(timestamp, 3),
                        "persons": [person_data],
                    }
                    processed_count += 1

                    if processed_count % 100 == 0:
                        print(f"MEDIAPIPE_FRAME:{processed_count}", file=sys.stderr)
        finally:
            # Release the capture even when a frame fails to process.
            cap.release()

        # Update metadata
        output_data["metadata"]["processed_frames"] = processed_count

        # Save output
        with open(output_path, "w") as f:
            json.dump(output_data, f, indent=2)

        print(f"MEDIAPIPE_COMPLETE:{processed_count}", file=sys.stderr)
        return output_data

    def close(self):
        """Close MediaPipe model"""
        self.holistic.close()
def _print_test_frame_results(person_data):
    """Print a human-readable summary of one process_frame() result."""
    print("\n=== Results ===")

    if person_data["face_mesh"]:
        face = person_data["face_mesh"]
        print(f"\nFace Mesh: {face['num_landmarks']} landmarks")
        print(f" Eye: {face['eye_features']['eye_action']} (EAR: {face['eye_features']['avg_ear']})")
        print(f" Gaze: {face['eye_features']['gaze_direction']}")
        print(f" Mouth: {face['mouth_features']['mouth_action']} (MAR: {face['mouth_features']['mar']})")

    if person_data["pose"]:
        pose = person_data["pose"]
        print(f"\nPose: {pose['num_landmarks']} keypoints")
        print(f" Left arm: {pose['arm_features']['left_arm_action']} (angle: {pose['arm_features']['left_elbow_angle']}°)")
        print(f" Right arm: {pose['arm_features']['right_arm_action']} (angle: {pose['arm_features']['right_elbow_angle']}°)")
        print(f" Cross arms: {pose['arm_features']['cross_arms']}")
        print(f" Leg: {pose['leg_features']['leg_action']}")

    if person_data["hands"]["left"]:
        hand = person_data["hands"]["left"]
        print(f"\nLeft hand: {hand['num_landmarks']} keypoints")
        print(f" Gesture: {hand['gesture']}")
        print(f" Fingers extended: {hand['num_fingers_extended']}")

    if person_data["hands"]["right"]:
        hand = person_data["hands"]["right"]
        print(f"\nRight hand: {hand['num_landmarks']} keypoints")
        print(f" Gesture: {hand['gesture']}")
        print(f" Fingers extended: {hand['num_fingers_extended']}")


def main():
    """CLI entry point: process a whole video, or inspect a single frame.

    Video and output paths may be given positionally or via ``--video`` /
    ``--output``. With ``--test-frame N`` only frame N (1-based) is processed
    and its results are printed; nothing is written, so the output path is
    optional in that mode.
    """
    parser = argparse.ArgumentParser(description="MediaPipe Holistic Processor")
    parser.add_argument("video_path", nargs="?", help="Path to video file (positional)")
    parser.add_argument("output_path", nargs="?", help="Path to output JSON (positional)")
    parser.add_argument("--video", help="Path to video file")
    parser.add_argument("--output", help="Path to output JSON")
    parser.add_argument("--sample-interval", type=int, default=1, help="Process every N frames")
    parser.add_argument("--model-complexity", type=int, default=1, choices=[0, 1, 2], help="Model complexity")
    parser.add_argument("--test-frame", type=int, help="Test single frame only")
    parser.add_argument("--uuid", default="", help="UUID for progress reporting")
    args = parser.parse_args()

    # Resolve positional vs flagged args
    video_path = args.video or args.video_path
    output_path = args.output or args.output_path
    # Single-frame test mode never writes a file, so only the full-video
    # mode needs an output path.
    if not video_path or not (output_path or args.test_frame):
        parser.error("video_path and output_path are required")

    print("=" * 70)
    print("MediaPipe Holistic Processor")
    print("=" * 70)

    processor = MediaPipeHolisticProcessor(
        model_complexity=args.model_complexity,
        refine_face_landmarks=True,
    )
    try:
        if args.test_frame:
            # Test single frame
            print(f"\nTesting frame {args.test_frame}...")
            cap = cv2.VideoCapture(video_path)
            cap.set(cv2.CAP_PROP_POS_FRAMES, args.test_frame - 1)
            ret, frame = cap.read()
            cap.release()

            if ret:
                _print_test_frame_results(processor.process_frame(frame))
            else:
                print("❌ Cannot read frame")
        else:
            # Process entire video
            processor.process_video(
                video_path,
                output_path,
                args.sample_interval,
                uuid=args.uuid,
            )
    finally:
        # Release the MediaPipe model even if processing raises.
        processor.close()


if __name__ == "__main__":
    main()

View File

@@ -1 +0,0 @@
../v1.1/scripts/mediapipe_holistic_processor_v1.11.py

View File

@@ -1 +0,0 @@
../v1.1/scripts/mediapipe_processor_v1.11.py

View File

@@ -1,381 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
Story Processor V2.0 — Dual Pipeline: Story-based + LLM-based Parent-Child Summarization
Pipeline 1 (Story): Template-based, instant, no LLM cost
→ Parent story summary + Child story summary
→ Embedding (Ollama nomic-embed) → pgvector
→ BM25 (PostgreSQL tsvector) → full-text search
Pipeline 2 (LLM): LLM-based summarization (Gemma4/Qwen when resources allow)
→ Parent LLM summary + Child LLM summary
→ Embedding → pgvector + BM25
Both pipelines store into chunks table with distinct chunk_types:
story_parent, story_child, llm_parent, llm_child
Usage:
python parent_chunk_5w1h.py --file-uuid <uuid> --mode story [--embed]
python parent_chunk_5w1h.py --file-uuid <uuid> --mode llm [--embed]
"""
import json, os, sys, argparse, time, requests, psycopg2
from collections import defaultdict
from typing import Dict, List, Optional
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
# PostgreSQL connection string; override with the DATABASE_URL env var.
DB_URL = os.getenv("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")
# Schema used for search_path and to qualify the chunk table; override with DATABASE_SCHEMA.
SCHEMA = os.getenv("DATABASE_SCHEMA", "dev")
# Directory holding the per-file "<uuid>.<name>.json" inputs and the story JSON output;
# override with MOMENTRY_OUTPUT_DIR.
OUTPUT_DIR = os.getenv("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output_dev")
# Embeddings endpoint that embed_text() POSTs to; override with EMBEDDING_URL.
EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://localhost:11436/v1/embeddings")
def load_speaker_map(file_uuid: str) -> dict:
    """Load the speaker-id → identity mapping from the ``identities`` table.

    Args:
        file_uuid: Accepted for call-site symmetry; the query is not filtered
            by file, so the mapping returned is global.

    Returns:
        ``{speaker_id: (identity_name, confidence)}`` where confidence is the
        fixed default 0.85. Falls back to ``DEFAULT_SPEAKER_MAP`` when the
        table yields no rows or the lookup fails for any reason.
    """
    conn = None
    try:
        conn = psycopg2.connect(DB_URL)
        cur = conn.cursor()
        cur.execute("SET search_path TO %s, public", (SCHEMA,))
        cur.execute(
            "SELECT metadata->>'speaker_id', name FROM identities "
            "WHERE metadata->>'speaker_id' IS NOT NULL"
        )
        # 0.85 is the default confidence assigned to every DB-provided binding.
        spk_map = {spk_id: (name, 0.85) for spk_id, name in cur.fetchall()}
        cur.close()
    except Exception as exc:
        # Best-effort lookup: the pipeline still works with raw speaker ids,
        # but say why the mapping is missing instead of failing silently.
        print(f" ⚠️ Speaker map unavailable, using default: {exc}")
        return DEFAULT_SPEAKER_MAP
    finally:
        # Always release the connection, including when a query raised.
        if conn is not None:
            conn.close()
    return spk_map if spk_map else DEFAULT_SPEAKER_MAP
# Default fallback (used when DB has no speaker mapping)
DEFAULT_SPEAKER_MAP = {}
# Version tag per pipeline stage; stored verbatim under "source_versions" in
# every chunk's JSON content by store_chunks().
CURRENT_VERSIONS = {
    "asr": "faster-whisper/small/v1",
    "asrx": "speechbrain/ecapa-tdnn/v1",
    "cut": "pyscenedetect/default",
    "yolo": "yolov5-coreml/v2",
    "face_detection": "apple-vision/v2",
    "face_embedding": "coreml-facenet/v2",
    "speaker_binding": "mar-lip/v1",
    "identity_clustering": "cosine-threshold/v1",
    "story_agent": "template/v2.0",
    "embedding_agent": "nomic-embed-768d/v1",
}
# Chat-completions endpoint for the LLM pipeline: MOMENTRY_LLM_URL wins, then
# MOMENTRY_LLM_SUMMARY_URL, then the local default.
LLM_URL = os.getenv("MOMENTRY_LLM_URL", os.getenv("MOMENTRY_LLM_SUMMARY_URL", "http://127.0.0.1:8082/v1/chat/completions"))
# Model name sent in the chat-completions request; override with MOMENTRY_LLM_SUMMARY_MODEL.
LLM_MODEL = os.getenv("MOMENTRY_LLM_SUMMARY_MODEL", "gemma4")
def load_data(file_uuid: str) -> dict:
    """Load the ASR, ASRX and CUT JSON outputs for one file.

    Args:
        file_uuid: Identifier used in the ``<uuid>.<name>.json`` file names
            under ``OUTPUT_DIR``.

    Returns:
        A dict with keys ``"asr"``, ``"asrx"`` and ``"cut"``; each value is the
        parsed JSON document, or ``None`` when that file does not exist.
    """
    data = {}
    for name in ["asr", "asrx", "cut"]:
        path = os.path.join(OUTPUT_DIR, f"{file_uuid}.{name}.json")
        if not os.path.exists(path):
            data[name] = None
            continue
        # Context manager closes the handle deterministically; JSON is UTF-8
        # by specification, so do not depend on the locale's default encoding.
        with open(path, encoding="utf-8") as fh:
            data[name] = json.load(fh)
    return data
def build_child_chunks(data: dict, file_uuid: str) -> List[dict]:
    """Bucket ASR sentences into scenes, producing a parent/child structure.

    Scenes come from the CUT data; when none are available, consecutive
    60-second windows covering the transcript are used instead. For each scene
    the children list holds first every non-empty ASR segment fully contained
    in the scene, then every non-empty segment that merely overlaps it (tagged
    ``"overlap_type": "partial"``). Scenes without any child are dropped.

    Args:
        data: Mapping with ``"asr"``, ``"asrx"`` and ``"cut"`` entries (each a
            parsed JSON dict or ``None``).
        file_uuid: Used to build child ``chunk_id`` values and passed to the
            speaker-map lookup.

    Returns:
        A list of scene dicts with ``start_time``, ``end_time``, ``duration``,
        ``children`` and ``child_count``.
    """
    def _section(name: str, key: str) -> list:
        payload = data[name]
        return payload.get(key, []) if payload else []

    asr_segs = _section("asr", "segments")
    asrx_segs = _section("asrx", "segments")
    cut_scenes = _section("cut", "scenes")

    # Dynamically load speaker→identity mapping from DB
    speaker_map = load_speaker_map(file_uuid)

    if not cut_scenes:
        # No scene detection available: fall back to fixed 60 s windows.
        last_asr_end = asr_segs[-1].get("end", 0) if asr_segs else 0
        last_asrx_end = asrx_segs[-1].get("end_time", 0) if asrx_segs else 0
        max_t = max(last_asr_end, last_asrx_end)
        cut_scenes = [
            {"start_time": t, "end_time": min(t + 60, max_t)}
            for t in range(0, int(max_t) + 60, 60)
        ]

    def _make_child(seg_idx: int, st, en, text: str) -> dict:
        # The first diarization segment that fully covers the sentence wins.
        spk_id = "unknown"
        for ax in asrx_segs:
            if ax["start_time"] <= st and ax["end_time"] >= en:
                spk_id = ax.get("speaker_id", "unknown")
                break
        # Unmapped speakers keep their raw id with zero confidence.
        character, spk_conf = speaker_map.get(spk_id) or (spk_id, 0.0)
        return {
            "start": st, "end": en, "text": text,
            "speaker_id": spk_id, "speaker_name": character,
            "speaker_confidence": spk_conf,
            "chunk_id": f"{file_uuid}_{seg_idx}",
        }

    scenes = []
    for cs in cut_scenes:
        s, e = cs["start_time"], cs["end_time"]
        contained = []
        partial = []
        for seg_idx, seg in enumerate(asr_segs):
            st, en = seg.get("start", 0), seg.get("end", 0)
            text = seg.get("text", "").strip()
            if not text:
                continue
            if st >= s and en <= e:
                contained.append(_make_child(seg_idx, st, en, text))
            elif st < e and en > s:
                # Boundary overlap: the sentence straddles a scene edge.
                child = _make_child(seg_idx, st, en, text)
                child["overlap_type"] = "partial"
                partial.append(child)
        # Fully contained sentences come first, boundary overlaps after them.
        children = contained + partial
        if children:
            scenes.append({
                "start_time": s, "end_time": e, "duration": e - s,
                "children": children, "child_count": len(children),
            })
    return scenes
# ===== Pipeline 1: Story (Template) Summaries =====
def generate_story_parent_summary(scene: dict) -> str:
    """Build the template (non-LLM) summary line for one scene.

    The line carries the scene's time range and duration, the alphabetically
    sorted cast, total line and word counts, and per-speaker line counts for
    at most the first three speakers (alphabetical order).
    """
    children = scene["children"]

    # Lines spoken per speaker, plus the scene-wide word count.
    line_counts = {}
    total_words = 0
    for child in children:
        name = child["speaker_name"]
        line_counts[name] = line_counts.get(name, 0) + 1
        total_words += len(child["text"].split())

    cast = sorted(line_counts)
    speakers = [f"{name} ({line_counts[name]} lines)" for name in cast]

    header = f"[{scene['start_time']:.0f}s-{scene['end_time']:.0f}s, {scene['duration']:.0f}s] "
    totals = f"Cast: {', '.join(cast)}. Total: {len(children)} lines, {total_words} words. "
    return header + totals + f"Speakers: {' | '.join(speakers[:3])}"
def generate_story_child_summary(child: dict, parent_summary: str) -> str:
    """Render one sentence as ``[<start>s-<end>s] <speaker>: "<text>"``.

    ``parent_summary`` is accepted for signature parity with the LLM variant
    and is not used.
    """
    time_range = f"[{child['start']:.0f}s-{child['end']:.0f}s] "
    quoted_line = f"{child['speaker_name']}: \"{child['text']}\""
    return time_range + quoted_line
# ===== Pipeline 2: LLM Summaries (requires LLM server) =====
def generate_llm_parent_summary(scene: dict, max_scenes_processed: int) -> Optional[str]:
    """Summarize one scene with the chat-completions LLM.

    Args:
        scene: Scene dict with ``start_time``, ``end_time`` and ``children``.
            Only the first 15 children are sent, each truncated to 150 chars.
        max_scenes_processed: Accepted for interface compatibility; not used.

    Returns:
        The stripped summary text, or ``None`` when no LLM URL is configured
        or the request fails for any reason (the failure is printed).
    """
    if not LLM_URL:
        return None
    children = scene["children"]
    dialogue = "\n".join(
        f"[{c['start']:.0f}s] {c['speaker_name']}: {c['text'][:150]}"
        for c in children[:15]
    )
    prompt = (
        "You are a film analyst. Summarize this scene in one flowing paragraph (60-100 words). "
        "Include: who is present, what they discuss, tone/mood.\n\n"
        f"Scene: {scene['start_time']:.0f}s - {scene['end_time']:.0f}s\n"
        f"Dialogue:\n{dialogue}\n\nSummary:"
    )
    try:
        resp = requests.post(LLM_URL, json={
            "model": LLM_MODEL,
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 200, "temperature": 0.3,
        }, timeout=60)
        # Surface HTTP failures as themselves instead of as an opaque
        # KeyError on the missing "choices" field of an error body.
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        # Best-effort: a failed summary skips the scene, it must not abort the run.
        print(f" ⚠️ LLM parent summary failed: {e}")
        return None
def generate_llm_child_summary(child: dict, parent_summary: str) -> Optional[str]:
    """Return the child (sentence) summary used by the LLM pipeline.

    No LLM call is made here: the sentence is rendered with the same
    ``[<start>s-<end>s] <speaker>: "<text>"`` template as the story pipeline.
    ``parent_summary`` is not used.
    """
    start_s = f"{child['start']:.0f}"
    end_s = f"{child['end']:.0f}"
    speaker = child['speaker_name']
    text = child['text']
    return f"[{start_s}s-{end_s}s] {speaker}: \"{text}\""
# ===== Embedding (Ollama nomic-embed) =====
def embed_text(text: str, max_retries: int = 3) -> Optional[List[float]]:
    """Fetch an embedding vector for ``text`` from the embedding server.

    Args:
        text: Text to embed; sent as a one-element ``input`` list.
        max_retries: Maximum number of attempts. A value <= 0 makes no request.

    Returns:
        The first embedding in the response, or ``None`` when every attempt
        failed (exception, non-200 status, or a response without data). The
        reason for the last failure is printed.
    """
    last_error = None
    for attempt in range(max_retries):
        try:
            resp = requests.post(EMBEDDING_URL, json={
                "input": [text],
            }, timeout=30)
            if resp.status_code == 200:
                items = resp.json().get("data", [])
                if items:
                    return items[0]["embedding"]
                last_error = "response contained no embedding data"
            else:
                last_error = f"HTTP {resp.status_code}"
        except Exception as e:
            last_error = e
        # Back off before every retry, whatever the failure kind; previously a
        # non-200 answer was retried immediately.
        if attempt < max_retries - 1:
            time.sleep(1)
    if last_error is not None:
        # Report the final failure for every failure kind, not just exceptions.
        print(f" ⚠️ Embedding failed: {last_error}")
    return None
# ===== DB Store (chunks table with embedding + BM25) =====
def _upsert_chunk(cur, chunk_id, file_uuid, chunk_type, chunk_index,
                  start_time, end_time, content, text_content,
                  parent_chunk_id, embedding=None, relink_parent=False):
    """Insert one chunk row, or update it on (file_uuid, old_chunk_id) conflict.

    ``content`` is serialized to JSON. The ``embedding`` column is written (and
    updated on conflict) only when a non-empty embedding is supplied.
    ``parent_chunk_id`` is updated on conflict only when ``relink_parent``.
    """
    columns = ["chunk_id", "old_chunk_id", "file_uuid", "chunk_type", "chunk_index",
               "start_time", "end_time", "content", "text_content", "parent_chunk_id"]
    placeholders = ["%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s::jsonb", "%s", "%s"]
    params = [chunk_id, chunk_id, file_uuid, chunk_type, chunk_index,
              start_time, end_time, json.dumps(content), text_content, parent_chunk_id]
    updates = ["content = EXCLUDED.content", "text_content = EXCLUDED.text_content"]
    if relink_parent:
        updates.append("parent_chunk_id = EXCLUDED.parent_chunk_id")
    if embedding:
        columns.append("embedding")
        placeholders.append("%s::vector")
        params.append(embedding)
        updates.append("embedding = EXCLUDED.embedding")
    # Only the schema name and fixed column lists are interpolated; every
    # value goes through query parameters.
    cur.execute(
        f"""
        INSERT INTO {SCHEMA}.chunk ({', '.join(columns)})
        VALUES ({', '.join(placeholders)})
        ON CONFLICT (file_uuid, old_chunk_id) DO UPDATE
        SET {', '.join(updates)}
        """,
        params,
    )


def store_chunks(file_uuid: str, scenes: List[dict], mode: str, do_embed: bool, conn):
    """Store parent + child summaries into the chunk table.

    For each scene a ``<mode>_parent`` row is upserted, followed by one
    ``<mode>_child`` row per child. Scenes whose parent summary is empty are
    skipped together with their children. Chunk indexes continue after the
    file's current maximum ``chunk_index``.

    Args:
        file_uuid: File the chunks belong to.
        scenes: Scene dicts as produced by ``build_child_chunks``.
        mode: ``"story"`` selects the template summaries; any other value
            selects the LLM summaries.
        do_embed: When true, embed each summary and store the vector if the
            embedding call succeeded.
        conn: Open DB connection. It is committed on success and rolled back
            if anything raises; the caller keeps ownership and must close it.

    Returns:
        ``(parent_count, child_count)`` of rows written.
    """
    parent_type = f"{mode}_parent"
    child_type = f"{mode}_child"
    is_story = mode == "story"
    parent_count = 0
    child_count = 0
    cur = conn.cursor()
    try:
        # Get base chunk_index
        cur.execute(
            f"SELECT COALESCE(MAX(chunk_index), 0) FROM {SCHEMA}.chunk WHERE file_uuid = %s",
            (file_uuid,),
        )
        next_index = (cur.fetchone()[0] or 0) + 1

        for scene in scenes:
            if is_story:
                parent_text = generate_story_parent_summary(scene)
            else:
                parent_text = generate_llm_parent_summary(scene, parent_count)
            if not parent_text:
                continue
            parent_id = f"{mode}_parent_{file_uuid}_{scene['start_time']:.0f}_{scene['end_time']:.0f}"
            parent_embedding = embed_text(parent_text) if do_embed else None
            _upsert_chunk(
                cur, parent_id, file_uuid, parent_type, next_index,
                scene["start_time"], scene["end_time"],
                {"summary": parent_text, "mode": mode, "type": "parent",
                 "source_versions": CURRENT_VERSIONS},
                parent_text, None,
                embedding=parent_embedding,
            )
            next_index += 1
            parent_count += 1

            for child in scene["children"]:
                child_id = child["chunk_id"]
                if is_story:
                    child_text = generate_story_child_summary(child, parent_text)
                else:
                    child_text = generate_llm_child_summary(child, parent_text)
                child_embedding = embed_text(child_text) if do_embed else None
                _upsert_chunk(
                    cur, child_id, file_uuid, child_type, next_index,
                    child["start"], child["end"],
                    {"speaker": child["speaker_name"], "text": child["text"], "mode": mode,
                     "speaker_confidence": child.get("speaker_confidence", 0),
                     "source_versions": CURRENT_VERSIONS},
                    child_text, parent_id,
                    embedding=child_embedding,
                    # A sentence shared by two scenes is re-linked to the
                    # scene processed last.
                    relink_parent=True,
                )
                next_index += 1
                child_count += 1
        conn.commit()
    except Exception:
        # Leave the connection usable and never half-commit a file.
        conn.rollback()
        raise
    finally:
        cur.close()
    return parent_count, child_count
def main():
    """CLI entry point: build scene/sentence chunks and store or export them.

    Loads the ASR/ASRX/CUT outputs for ``--file-uuid``, groups sentences into
    scenes (capped by ``--max-scenes``), writes parent/child chunks to the DB
    unless ``--no-db`` is given, and always saves the scene structure to
    ``<OUTPUT_DIR>/<uuid>.story_<mode>.json``.
    """
    parser = argparse.ArgumentParser(description="Story Processor V2.0")
    parser.add_argument("--file-uuid", required=True)
    parser.add_argument("--mode", choices=["story", "llm"], default="story")
    parser.add_argument("--max-scenes", type=int, default=99999)
    parser.add_argument("--embed", action="store_true", help="Generate embeddings (Ollama)")
    parser.add_argument("--no-db", action="store_true", help="Skip DB storage")
    args = parser.parse_args()
    file_uuid = args.file_uuid
    print(f"[STORY] Mode: {args.mode}, Embed: {args.embed}")

    data = load_data(file_uuid)
    if not data["asr"]:
        print("[STORY] ❌ No ASR data")
        return

    scenes = build_child_chunks(data, file_uuid)[:args.max_scenes]
    total_children = sum(s["child_count"] for s in scenes)
    print(f"[STORY] {len(scenes)} scenes, {total_children} child chunks")

    if not args.no_db:
        conn = psycopg2.connect(DB_URL)
        try:
            pc, cc = store_chunks(file_uuid, scenes, args.mode, args.embed, conn)
            print(f"[STORY] DB: {pc} parent, {cc} child chunks ({args.mode})")
        finally:
            conn.close()

    # Save JSON output
    out_path = os.path.join(OUTPUT_DIR, f"{file_uuid}.story_{args.mode}.json")
    out_data = {"file_uuid": file_uuid, "mode": args.mode, "scenes": scenes}
    # ensure_ascii=False emits raw non-ASCII transcript text, so the file must
    # be opened as UTF-8 explicitly rather than in the locale's encoding.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(out_data, f, indent=2, ensure_ascii=False, default=str)
    print(f"[STORY] ✅ {out_path}")
if __name__ == "__main__":
main()

View File

@@ -1 +0,0 @@
../v1.1/scripts/parent_chunk_5w1h_v1.11.py

View File

@@ -1,320 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
Rebuild story chunk text_content and regenerates summaries using new ASRX speaker assignments.
Then updates Qdrant momentry_dev_stories and sentence_story/sentence_summary collections.
"""
import json, sys, time, urllib.request
from urllib.request import Request, urlopen
import psycopg2
# file_uuid of the single file this one-off script rebuilds (hard-coded).
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
# PostgreSQL connection string used for all dev.chunks reads and updates.
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
# Base URL of the Qdrant REST API.
QDRANT_URL = "http://localhost:6333"
# Chat-completions endpoint used by call_llm().
LLM_URL = "http://localhost:8082/v1/chat/completions"
# Embeddings endpoint used by call_embed().
EMBED_URL = "http://localhost:11436/v1/embeddings"
def call_llm(dialogue_text):
    """Ask the LLM for a 50-word summary of ``dialogue_text``.

    Returns:
        The stripped summary text, or ``""`` when the request or response
        parsing fails (the error is printed).
    """
    prompt = f"Dialogue:\n{dialogue_text}\n\n50-word summary:"
    body = json.dumps({"model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
                       "messages": [{"role": "user", "content": prompt}],
                       "temperature": 0.1, "max_tokens": 100}).encode()
    req = Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        # Close the HTTP response deterministically; this runs once per story.
        with urlopen(req, timeout=120) as resp:
            payload = json.loads(resp.read())
        return payload["choices"][0]["message"]["content"].strip()
    except Exception as e:
        # Best-effort: callers treat "" as "no summary".
        print(f" LLM error: {e}")
        return ""
def call_embed(text):
    """Embed ``text`` with the embedding server.

    Returns:
        The first embedding in the response, or a 768-element zero vector when
        the request or response parsing fails (the error is printed). Callers
        rely on always getting a vector back.
    """
    body = json.dumps({"input": text}).encode()
    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        # Close the HTTP response deterministically; this runs once per story.
        with urlopen(req, timeout=30) as resp:
            payload = json.loads(resp.read())
        return payload["data"][0]["embedding"]
    except Exception as e:
        print(f" Embed error: {e}")
        return [0.0] * 768
import re

# Matches a leading "[Speaker]" prefix and captures everything after it.
# Anchored at the start so a "]" inside the sentence itself is never mistaken
# for the prefix; DOTALL keeps multi-line sentences intact.
_SPEAKER_PREFIX_RE = re.compile(r'^\s*\[[^\]]*\]\s*(.*)', re.DOTALL)


def _raw_sentence_text(content, old_text):
    """Return a sentence's bare text, without any "[Speaker]" prefix.

    Prefers ``content["data"]["text"]``; otherwise strips a leading speaker
    prefix from ``old_text`` (or returns ``old_text`` unchanged if it has none).
    """
    raw = ""
    if content and isinstance(content, dict):
        raw = content.get("data", {}).get("text", "")
    if raw or not old_text:
        return raw
    m = _SPEAKER_PREFIX_RE.match(old_text)
    return m.group(1) if m else old_text


print("=== Step 1: Load sentence chunks with new speaker info ===")
conn = psycopg2.connect(DB_URL)
cur = conn.cursor()
cur.execute("""
    SELECT chunk_index, text_content, metadata->>'new_speaker_name',
           metadata->>'speaker_name', content
    FROM dev.chunks
    WHERE file_uuid = %s AND chunk_type = 'sentence'
    ORDER BY chunk_index
""", (UUID,))
sentence_rows = cur.fetchall()
print(f"Loaded {len(sentence_rows)} sentence chunks")

# Build lookup: chunk_index -> sentence info. "old_text" keeps the text as it
# was BEFORE Step 2 rewrites it, so Step 3 parses the original prefix format.
sentences = {}
for r in sentence_rows:
    idx, old_text, new_name, old_name, content = r
    sentences[idx] = {
        "old_text": old_text or "",
        "new_name": new_name or old_name or "Unknown",
        "old_name": old_name or "Unknown",
        "content": content or {},
    }

# Rebuild sentence text_content with new speaker names
print("\n=== Step 2: Rebuild sentence text_content ===")
updated_sentences = 0
for r in sentence_rows:
    idx, old_text, new_name, old_name, content = r
    new_name = new_name or old_name or "Unknown"
    # Extract the text part (remove old speaker prefix if exists)
    raw_text = _raw_sentence_text(content, old_text)
    new_text = f"[{new_name}] {raw_text}"
    cur.execute("""
        UPDATE dev.chunks
        SET text_content = %s, updated_at = NOW()
        WHERE file_uuid = %s AND chunk_type = 'sentence' AND chunk_index = %s
    """, (new_text, UUID, idx))
    updated_sentences += 1
conn.commit()
print(f"Updated {updated_sentences} sentence chunks text_content")

print("\n=== Step 3: Rebuild story chunk text_content ===")
cur.execute("""
    SELECT id, chunk_id, chunk_index, child_chunk_ids, start_time, end_time,
           text_content, summary_text
    FROM dev.chunks
    WHERE file_uuid = %s AND chunk_type = 'story'
    ORDER BY chunk_index
""", (UUID,))
story_rows = cur.fetchall()
print(f"Loaded {len(story_rows)} story chunks")

# Build child text per story chunk
story_dialogue_texts = []
for r in story_rows:
    db_id, cid, idx, child_ids, st, et, old_text, old_summary = r
    dialogue_parts = []
    for child_cid in (child_ids or []):
        # Child ids end in "_<sentence chunk_index>".
        child_idx = int(child_cid.split("_")[-1])
        if child_idx in sentences:
            s = sentences[child_idx]
            raw = _raw_sentence_text(s["content"], s["old_text"])
            if raw:
                dialogue_parts.append(f'({s["new_name"]}) {raw}')
    dialogue_text = " ".join(dialogue_parts)
    story_dialogue_texts.append((db_id, cid, idx, st, et, dialogue_text, old_summary))
print(f"Built {len(story_dialogue_texts)} story dialogue texts")

# Update DB with new text_content (dialogue only, not summary yet)
for item in story_dialogue_texts:
    db_id, cid, idx, st, et, dialogue_text, old_summary = item
    cur.execute("""
        UPDATE dev.chunks
        SET text_content = %s, updated_at = NOW()
        WHERE id = %s
    """, (dialogue_text, db_id))
conn.commit()
print("Updated story chunk dialogue texts")
# Step 4: summarize every story's dialogue with the LLM, embed the summary,
# persist summary_text, and keep everything in `summaries` for the Qdrant steps.
print(f"\n=== Step 4: Generate LLM summaries (all {len(story_dialogue_texts)} stories) ===")
summaries = []
for i, item in enumerate(story_dialogue_texts):
    db_id, cid, idx, st, et, dialogue_text, old_summary = item
    if len(dialogue_text) < 10:
        # Too little dialogue to be worth an LLM call.
        summary = "[no dialogue]"
        embedding = [0.0] * 768
    else:
        print(f" [{i+1}/{len(story_dialogue_texts)}] {cid}: {len(dialogue_text)} chars", end="")
        try:
            summary = call_llm(dialogue_text[:3000])
            print(f" -> {len(summary)} chars")
            time.sleep(0.3)
            embedding = call_embed(summary)
        except Exception as e:
            print(f" ERROR: {e}")
            summary = "[error]"
            embedding = [0.0] * 768
    # Update DB. The summary is LLM output, so it is passed as a bound
    # parameter rather than spliced into the SQL text with manual escaping.
    cur.execute("""
        UPDATE dev.chunks
        SET summary_text = %s, updated_at = NOW()
        WHERE id = %s
    """, (summary, db_id))
    summaries.append({
        "db_id": db_id,
        "chunk_id": cid,
        "chunk_index": idx,
        "start_time": st,
        "end_time": et,
        "dialogue": dialogue_text,
        "summary": summary,
        "embedding": embedding,
    })
conn.commit()
print(f"\nGenerated {len(summaries)} summaries")
print("\n=== Step 5: Rebuild Qdrant momentry_dev_stories ===")
# Delete existing
req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories", method="DELETE")
try:
    urlopen(req).close()
    time.sleep(0.3)
except OSError:
    # HTTPError/URLError (both OSError): typically the collection did not
    # exist yet. A real connectivity problem surfaces on the PUT below.
    pass
# Recreate
req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories",
              data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
              headers={"Content-Type": "application/json"}, method="PUT")
urlopen(req).close()
time.sleep(0.3)

# Point ids: dialogue points use chunk_index + 1; summary points use the same
# id shifted by an offset derived from the data (highest chunk_index + 1), so
# the two id ranges can never collide. For 228 stories indexed 0..227 this is
# exactly the previously hard-coded offset of 228.
summary_id_offset = max((s["chunk_index"] for s in summaries), default=-1) + 1
dialogue_points = []
summary_points = []
for s in summaries:
    idx = s["chunk_index"]
    dialogue_points.append({
        "id": idx + 1,
        "vector": [0.0] * 768,
        "payload": {
            "chunk_id": s["chunk_id"],
            "file_uuid": UUID,
            "start_time": s["start_time"],
            "end_time": s["end_time"],
            "type": "story_dialogue",
            "text": s["dialogue"][:500],
        }
    })
    summary_points.append({
        "id": idx + 1 + summary_id_offset,
        "vector": s["embedding"],
        "payload": {
            "chunk_id": s["chunk_id"],
            "file_uuid": UUID,
            "start_time": s["start_time"],
            "end_time": s["end_time"],
            "type": "story_summary",
            "summary": s["summary"],
        }
    })
all_story_points = dialogue_points + summary_points

batch_size = 100
uploaded_story_points = 0
for start in range(0, len(all_story_points), batch_size):
    batch = all_story_points[start:start+batch_size]
    req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories/points?wait=true",
                  data=json.dumps({"points": batch}).encode(),
                  headers={"Content-Type": "application/json"}, method="PUT")
    try:
        urlopen(req).close()
        uploaded_story_points += len(batch)
    except Exception as e:
        print(f" Batch {start}: {e}")
    if (start // batch_size) % 3 == 0:
        print(f" Uploaded {start + len(batch)}/{len(all_story_points)}")
# Report what actually reached Qdrant, not what was attempted.
print(f"Uploaded {uploaded_story_points}/{len(all_story_points)} points to momentry_dev_stories")
print("\n=== Step 6: Populate sentence_story and sentence_summary ===")
# These are the per-sentence template + summary collections
# sentence_story: one 768D point per sentence, template payloads
# sentence_summary: one 768D point per sentence, LLM summary payloads
for col_name in ["sentence_story", "sentence_summary"]:
    req = Request(f"{QDRANT_URL}/collections/{col_name}", method="DELETE")
    try:
        urlopen(req).close()
        time.sleep(0.2)
    except OSError:
        # HTTPError/URLError (both OSError): typically the collection did not
        # exist yet. A real connectivity problem surfaces on the PUT below.
        pass
    req = Request(f"{QDRANT_URL}/collections/{col_name}",
                  data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
                  headers={"Content-Type": "application/json"}, method="PUT")
    urlopen(req).close()
    time.sleep(0.2)

# Build points for sentence_story
story_sentence_points = []
for idx in sorted(sentences):
    s = sentences[idx]
    raw_text = ""
    if s["content"] and isinstance(s["content"], dict):
        raw_text = s["content"].get("data", {}).get("text", "")
    dialog_line = f'({s["new_name"]}) {raw_text}'
    story_sentence_points.append({
        "id": idx + 1,
        # Placeholder vector: these points carry payload only.
        "vector": [0.0] * 768,
        "payload": {
            "chunk_id": f"{UUID}_{idx}",
            "file_uuid": UUID,
            # Sentence timings are not loaded by this script.
            "start_time": 0,
            "end_time": 0,
            "text": dialog_line,
            "speaker_name": s["new_name"],
            "chunk_type": "sentence",
        }
    })

# Upload sentence_story (dialogue template)
batch_size = 200
total_sentence_points = len(story_sentence_points)
uploaded_sentence_points = 0
for start in range(0, total_sentence_points, batch_size):
    batch = story_sentence_points[start:start+batch_size]
    req = Request(f"{QDRANT_URL}/collections/sentence_story/points?wait=true",
                  data=json.dumps({"points": batch}).encode(),
                  headers={"Content-Type": "application/json"}, method="PUT")
    try:
        urlopen(req).close()
        uploaded_sentence_points += len(batch)
    except Exception as e:
        print(f" sentence_story batch {start}: {e}")
    if (start // batch_size) % 5 == 0:
        # Progress against the real total instead of a hard-coded 3417.
        print(f" Uploaded {start + len(batch)}/{total_sentence_points} sentence_story")
print(f"Uploaded {uploaded_sentence_points}/{total_sentence_points} sentence_story points")

# sentence_summary will be populated when we generate per-sentence summaries
# For now, mark as TODO
print("sentence_summary: SKIPPED (needs per-sentence LLM summaries)")
cur.close()
conn.close()
print("\n=== Done ===")

View File

@@ -1 +0,0 @@
../v1.1/scripts/rebuild_story_content_v1.11.py

View File

@@ -1,197 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
Regenerate parent chunk summaries using 5W1H multi-dimensional structure via gemma4.
5W1H Structure:
- Who: Main characters/people involved
- What: Key actions/events
- When: Temporal context (sequence in story)
- Where: Location/setting
- Why: Motivation/conflict driving the scene
- How: Emotional tone/manner of events
"""
import json
import requests
import psycopg2
import psycopg2.extras
# Keyword arguments passed to psycopg2.connect().
DB_CONFIG = {"host": "localhost", "user": "accusys", "dbname": "momentry"}
# uuid of the single file whose parent chunks are regenerated (hard-coded).
UUID = "384b0ff44aaaa1f1"
# OpenAI-compatible chat-completions endpoint used by call_gemma4().
LLAMA_URL = "http://127.0.0.1:8081/v1/chat/completions"
def get_parent_with_children():
    """Get all parent chunks of ``UUID`` with their child chunk texts.

    Returns:
        A list of dict-like rows ordered by ``scene_order``. Each row carries
        the parent's id, timing/frame fields, ``old_summary``, ``metadata`` and
        ``child_texts`` (child ``text_content`` values ordered by start time).
        Because of the LEFT JOIN, a parent without children yields a
        ``child_texts`` array holding a single NULL.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        cur.execute(
            """
            SELECT pc.id, pc.scene_order, pc.start_time, pc.end_time,
                   pc.start_frame, pc.end_frame, pc.fps, pc.summary_text as old_summary,
                   pc.metadata,
                   ARRAY_AGG(c.text_content ORDER BY c.start_time) as child_texts
            FROM parent_chunks pc
            LEFT JOIN chunks c ON c.parent_chunk_id = pc.id::varchar
            WHERE pc.uuid = %s
            GROUP BY pc.id, pc.scene_order, pc.start_time, pc.end_time,
                     pc.start_frame, pc.end_frame, pc.fps, pc.summary_text, pc.metadata
            ORDER BY pc.scene_order
            """,
            (UUID,),
        )
        parents = cur.fetchall()
        cur.close()
        return parents
    finally:
        # Release the connection even when the query raises.
        conn.close()
def call_gemma4(prompt, max_tokens=1500):
    """Call Gemma4 via llama-server OpenAI-compatible API.

    Args:
        prompt: User message content.
        max_tokens: Completion token limit sent to the server.

    Returns:
        The stripped completion text, or ``""`` when the server answers with a
        non-200 status or the request/parsing fails (the reason is printed).
    """
    payload = {
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.3,
        "min_p": 0.1,
    }
    try:
        resp = requests.post(LLAMA_URL, json=payload, timeout=180)
        if resp.status_code == 200:
            result = resp.json()
            content = (
                result.get("choices", [{}])[0]
                .get("message", {})
                .get("content", "")
                .strip()
            )
            return content
        # Previously a non-200 answer produced "" with no trace of why.
        print(f" ⚠️ llama-server returned HTTP {resp.status_code}")
    except Exception as e:
        print(f" ⚠️ llama-server error: {e}")
    return ""
def generate_5w1h_summary(parent, scene_num, total_scenes=17):
    """Generate 5W1H structured summary using gemma4.

    Args:
        parent: Parent-chunk row with ``child_texts``, ``start_time`` and
            ``end_time``.
        scene_num: Scene number shown to the model in the prompt.
        total_scenes: Scene total shown to the model in the prompt. Defaults
            to 17, the value that used to be hard-coded.

    Returns:
        The parsed JSON object from the model's reply, or ``None`` when the
        scene has no dialogue text, the model returns nothing, or the reply
        contains no parseable JSON object.
    """
    texts = [t for t in (parent["child_texts"] or []) if t]
    if not texts:
        return None
    # Use only first 3 and last 3 dialogue lines for context (much faster)
    if len(texts) > 6:
        sample_texts = texts[:3] + ["..."] + texts[-3:]
    else:
        sample_texts = texts
    combined = "\n".join(sample_texts)[:1500]
    duration = parent["end_time"] - parent["start_time"]
    prompt = f"""You are a film scene analyst. Analyze this scene and provide 5W1H analysis.
Scene {scene_num}/{total_scenes} | {duration:.0f}s | {len(texts)} dialogue lines
Key dialogue:
{combined}
Respond with ONLY this JSON:
{{"summary_5lines":"...","who":"...","what":"...","when":"...","where":"...","why":"...","how":"...","characters":[],"tone":[],"key_events":[]}}
IMPORTANT: "summary_5lines" must be EXACTLY 5 lines describing the scene. Each line should be a complete sentence separated by \\n."""
    response = call_gemma4(prompt, max_tokens=2000)
    if not response:
        return None
    # Simple JSON extraction: find first { and last }
    start = response.find("{")
    end = response.rfind("}") + 1
    if start < 0 or end <= start:
        return None
    try:
        return json.loads(response[start:end])
    except ValueError as exc:
        # json.JSONDecodeError is a ValueError; say why the reply was rejected
        # instead of dropping it silently.
        print(f" ⚠️ Could not parse model JSON: {exc}")
        return None
def update_parent_chunk(parent, analysis):
    """Update parent chunk with 5W1H structured data.

    Writes ``summary_5lines`` to ``summary_text`` and stores the full 5W1H
    structure under ``metadata["structured_summary"]`` for the row whose id is
    ``parent["id"]``.

    Args:
        parent: Parent-chunk row with ``id``, ``metadata`` and ``child_texts``.
            The row's metadata dict is not modified.
        analysis: Parsed 5W1H dict from the model; missing fields default to
            ``""`` (text fields) or ``[]`` (list fields).

    Returns:
        ``False`` when ``analysis`` is empty, ``True`` after the update is
        committed.
    """
    if not analysis:
        return False

    # Create structured summary text (5 lines)
    structured_text = f"{analysis.get('summary_5lines', '')}"

    # Update metadata with full 5W1H structure. Work on a copy so the caller's
    # row is not mutated as a side effect.
    metadata = dict(parent["metadata"]) if parent["metadata"] else {}
    metadata["auto_generated_by"] = "gemma4"
    metadata["chunk_count"] = len(parent["child_texts"] or [])
    structured = {
        key: analysis.get(key, "")
        for key in ("summary_5lines", "who", "what", "when", "where", "why", "how")
    }
    structured.update(
        {key: analysis.get(key, []) for key in ("characters", "tone", "key_events")}
    )
    metadata["structured_summary"] = structured

    conn = psycopg2.connect(**DB_CONFIG)
    try:
        cur = conn.cursor()
        cur.execute(
            """
            UPDATE parent_chunks
            SET summary_text = %s,
                metadata = %s::jsonb
            WHERE id = %s
            """,
            (structured_text, json.dumps(metadata, ensure_ascii=False), parent["id"]),
        )
        conn.commit()
        cur.close()
    finally:
        # Release the connection even when the update raises.
        conn.close()
    return True
def main():
    """Regenerate and store 5W1H summaries for every parent chunk of ``UUID``.

    For each parent chunk: print its timing, ask the model for a 5W1H
    analysis, print a short preview and persist it. Ends with a count of the
    chunks that were updated.
    """
    def _preview(value, limit):
        # Model output is not guaranteed to be a string (it may be a list,
        # number or null); coerce it so one odd reply cannot abort the batch.
        return str(value)[:limit]

    print(f"🎬 Regenerating 5W1H summaries for {UUID}")
    print(f" Using llama.cpp server at {LLAMA_URL}")
    print("=" * 70)
    parents = get_parent_with_children()
    print(f"📥 Found {len(parents)} parent chunks")
    success_count = 0
    for parent in parents:
        duration = parent["end_time"] - parent["start_time"]
        text_count = len(parent["child_texts"] or [])
        print(
            f"\n🎬 Scene {parent['scene_order']}: {parent['start_time']:.0f}s-{parent['end_time']:.0f}s ({duration:.0f}s, {text_count} chunks)"
        )
        if parent["old_summary"]:
            print(f" Old: {parent['old_summary'][:80]}...")
        analysis = generate_5w1h_summary(parent, parent["scene_order"])
        if analysis:
            print(f" ✅ Summary: {_preview(analysis.get('summary_5lines', 'N/A'), 100)}...")
            print(f" 👤 Who: {_preview(analysis.get('who', 'N/A'), 60)}")
            print(f" 📍 Where: {_preview(analysis.get('where', 'N/A'), 60)}")
            print(f" 💡 Why: {_preview(analysis.get('why', 'N/A'), 60)}")
            if update_parent_chunk(parent, analysis):
                success_count += 1
        else:
            print(" ❌ Failed to generate analysis")
    print(f"\n{'=' * 70}")
    print(
        f"✅ Updated {success_count}/{len(parents)} parent chunks with 5W1H summaries"
    )
if __name__ == "__main__":
main()

View File

@@ -1 +0,0 @@
../v1.1/scripts/regenerate_parent_5w1h_v1.11.py

View File

@@ -39,140 +39,8 @@ def get_conn():
def merge_traces_within_cuts(face_data: dict, cut_scenes: list) -> dict:
"""Merge traces within the same cut if they have similar embeddings (same person re-appeared)."""
frames = face_data.get("frames", {})
if not frames:
return face_data
# Map each frame to its scene/cut number
frame_to_scene = {}
for s in cut_scenes:
for f in range(s["start_frame"], s["end_frame"] + 1):
frame_to_scene[f] = s["scene_number"]
# Collect per-trace data: scene numbers, embeddings, face positions
trace_frames = defaultdict(list)
trace_embeddings = defaultdict(list)
trace_poses = {}
for fnum_str, frm_data in frames.items():
fnum = int(fnum_str)
for face in frm_data.get("faces", []):
tid = face.get("trace_id")
if tid is None:
continue
trace_frames[tid].append(fnum)
emb = face.get("embedding")
if emb is not None:
trace_embeddings[tid].append(emb)
if tid not in trace_poses:
trace_poses[tid] = (
face.get("x", 0),
face.get("y", 0),
face.get("width", 0),
face.get("height", 0),
)
if len(trace_embeddings) < 2:
return face_data
# Compute centroid per trace
trace_centroids = {}
for tid, embs in trace_embeddings.items():
centroid = np.mean(embs, axis=0)
norm = np.linalg.norm(centroid)
trace_centroids[tid] = centroid / norm if norm > 0 else centroid
# Determine which scene each trace belongs to (majority of frames)
trace_scene = {}
for tid, fns in trace_frames.items():
scene_votes = defaultdict(int)
for fn in fns:
scene = frame_to_scene.get(fn, -1)
scene_votes[scene] += 1
trace_scene[tid] = max(scene_votes, key=scene_votes.get) if scene_votes else -1
# Within each scene, merge traces with similar centroids
scene_traces = defaultdict(list)
for tid, scene in trace_scene.items():
if scene >= 0 and tid in trace_centroids:
scene_traces[scene].append(tid)
merged = 0
next_new_id = max(trace_frames.keys()) + 1 if trace_frames else 0
SIMILARITY_THRESHOLD = 0.75
for scene, tids in scene_traces.items():
if len(tids) < 2:
continue
used = set()
for i in range(len(tids)):
if tids[i] in used:
continue
keep_tid = tids[i]
for j in range(i + 1, len(tids)):
if tids[j] in used:
continue
sim = float(np.dot(trace_centroids[tids[i]], trace_centroids[tids[j]]))
if sim >= SIMILARITY_THRESHOLD:
# Merge tids[j] into keep_tid
for fnum_str, frm_data in frames.items():
for face in frm_data.get("faces", []):
if face.get("trace_id") == tids[j]:
face["trace_id"] = keep_tid
used.add(tids[j])
merged += 1
# If any merges happened, rebuild trace metadata
if merged > 0:
# Rebuild traces dict
new_traces = {}
new_trace_frames = defaultdict(list)
for fnum_str, frm_data in frames.items():
fnum = int(fnum_str)
for face in frm_data.get("faces", []):
tid = face.get("trace_id")
if tid is not None:
new_trace_frames[tid].append(
{
"frame": fnum,
"face_index": 0,
"bbox": {
"x": face.get("x", 0),
"y": face.get("y", 0),
"width": face.get("width", 0),
"height": face.get("height", 0),
},
"confidence": face.get("confidence", 0.0),
}
)
for tid, path in new_trace_frames.items():
if len(path) >= 1:
frames_sorted = sorted(set(p["frame"] for p in path))
new_traces[str(tid)] = {
"trace_id": tid,
"start_frame": frames_sorted[0],
"end_frame": frames_sorted[-1],
"duration_frames": frames_sorted[-1] - frames_sorted[0] + 1,
"duration_seconds": (frames_sorted[-1] - frames_sorted[0])
/ face_data.get("metadata", {}).get("fps", 25.0),
"total_appearances": len(path),
"path": path,
}
face_data["traces"] = new_traces
face_data["metadata"]["trace_stats"] = {
"total_traces": len(new_traces),
"active_traces": len(new_traces),
"long_traces": len(
[t for t in new_traces.values() if t["duration_frames"] >= 2]
),
}
print(
f"[TRACE] Post-merge: {merged} traces merged, {len(new_traces)} total traces"
)
"""Merge traces within the same cut - DISABLED (no embeddings)."""
# TODO: Reimplement with Qdrant _faces collection
return face_data
@@ -235,57 +103,12 @@ def run_face_tracker(
print(f"[TRACE] Processing {len(face_data.get('frames', {}))} frames")
# Load embeddings from DB for the face tracker
# Embeddings no longer loaded from DB - use IoU-only tracking
file_uuid = (
face_json_path.split("/")[-1]
.replace(".face.json", "")
.replace("_traced.json", "")
)
try:
conn = get_conn()
cur = conn.cursor()
cur.execute(
f"""
SELECT frame_number, x, y, width, height, embedding
FROM {SCHEMA}.face_detections
WHERE file_uuid = %s AND embedding IS NOT NULL
""",
(file_uuid,),
)
emb_rows = cur.fetchall()
conn.close()
# Build lookup: frame_number → list of (bbox, embedding)
emb_map = {}
for fn, x, y, w, h, emb in emb_rows:
emb_map.setdefault(fn, []).append(((x, y, w, h), emb))
print(f"[TRACE] Loaded {len(emb_rows)} embeddings from DB")
# Attach embeddings to face data
attached = 0
for fnum_str, frm_data in face_data.get("frames", {}).items():
fnum = int(fnum_str)
for face in frm_data.get("faces", []):
x, y, w, h = (
face.get("x", 0),
face.get("y", 0),
face.get("width", 0),
face.get("height", 0),
)
candidates = emb_map.get(fnum, [])
# Find matching embedding by bbox proximity
for (ex, ey, ew, eh), emb in candidates:
if (
abs(x - ex) < 10
and abs(y - ey) < 10
and abs(w - ew) < 10
and abs(h - eh) < 10
):
face["embedding"] = emb
attached += 1
break
print(f"[TRACE] Attached {attached} embeddings to faces")
except Exception as e:
print(f"[TRACE] WARNING: Could not load embeddings: {e}")
# Load cut boundaries from cut.json (same directory as face.json)
cut_boundaries = None
@@ -301,7 +124,7 @@ def run_face_tracker(
print(f"[TRACE] Loaded {len(cut_boundaries)} cut boundaries")
face_data = track_faces(
face_data, use_embedding=True, cut_boundaries=cut_boundaries
face_data, use_embedding=False, cut_boundaries=cut_boundaries
)
# Merge traces within same cut (same person re-appearing after occlusion/pose change)
@@ -309,7 +132,7 @@ def run_face_tracker(
face_data = merge_traces_within_cuts(face_data, cut_scenes)
metadata = face_data.get("metadata", {})
metadata["tracking_method"] = "iou_embedding"
metadata["tracking_method"] = "iou_only"
metadata["tracked_at"] = datetime.now().isoformat()
face_data["metadata"] = metadata
@@ -350,22 +173,19 @@ def store_traced_faces(file_uuid: str, traced_json_path: str, schema: str = SCHE
if face_id is None:
face_id = f"face_{trace_id}"
attributes = face.get("attributes")
embedding = face.get("embedding")
bbox = json.dumps({"x": x, "y": y, "width": w, "height": h})
embed_vec = embedding if embedding and len(embedding) > 0 else None
try:
cur.execute(
f"""
UPDATE {schema}.face_detections
SET trace_id = %s, embedding = %s, face_id = %s
SET trace_id = %s, face_id = %s
WHERE file_uuid = %s AND frame_number = %s
AND x = %s AND y = %s AND width = %s AND height = %s
""",
(
trace_id,
embed_vec,
face_id,
file_uuid,
frame_num,

View File

@@ -1,87 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
Story Embedding Pipeline:
1. Read story chunks → LLM summary (Gemma4)
2. Embed summary (EmbeddingGemma)
3. Store in chunks table + Qdrant
"""
import json, urllib.request, subprocess, sys, time, os
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
PSQL = ["/Users/accusys/pgsql/18.3/bin/psql", "-U", "accusys", "-d", "momentry", "-t", "-A"]
LLM_URL = "http://localhost:8082/v1/chat/completions"
EMBED_URL = "http://localhost:11436/v1/embeddings"
QDRANT_URL = "http://localhost:6333"
QDRANT_COL = "momentry_dev_stories"
def psql(sql):
    """Run one SQL statement through the psql CLI and return its trimmed stdout.

    The statement is appended to the module-level ``PSQL`` argument list as
    ``-c <sql>``. The exit status and stderr are not inspected here, so a
    failing statement just produces whatever psql printed on stdout.
    """
    command = [*PSQL, "-c", sql]
    completed = subprocess.run(command, capture_output=True, text=True, timeout=30)
    return completed.stdout.strip()
def call_llm(dialogue):
    """Request a roughly 50-word summary of *dialogue* from the chat endpoint.

    Posts a single user message to ``LLM_URL`` and returns the stripped
    ``content`` of the first choice in the JSON reply. Network errors raised by
    ``urllib`` and lookup errors on an unexpected reply shape propagate to the
    caller.
    """
    prompt = f"Dialogue: {dialogue}\n\n50-word summary:"
    body = json.dumps({"model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
                       "messages": [{"role": "user", "content": prompt}],
                       "temperature": 0.1, "max_tokens": 100}).encode()
    req = urllib.request.Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
    # Use the response as a context manager so the connection is released even
    # when reading or decoding the reply fails.
    with urllib.request.urlopen(req, timeout=120) as resp:
        reply = json.loads(resp.read())
    return reply["choices"][0]["message"]["content"].strip()
def call_embed(text):
    """Return the embedding vector the service at ``EMBED_URL`` produces for *text*.

    The reply is expected to carry the vector at ``data[0].embedding``; network
    errors and lookup errors on an unexpected reply shape propagate.
    """
    body = json.dumps({"input": text}).encode()
    req = urllib.request.Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(req, timeout=30) as resp:
        reply = json.loads(resp.read())
    return reply["data"][0]["embedding"]
# Step 0: Ensure Qdrant collection exists (768 dims)
subprocess.run(["curl", "-s", "-X", "PUT", f"{QDRANT_URL}/collections/{QDRANT_COL}",
                "-H", "Content-Type: application/json",
                "-d", '{"vectors":{"size":768,"distance":"Cosine"}}'], capture_output=True)

# Step 1: Get all story chunks that need summaries
lines = [l for l in psql(f"SELECT chunk_id, chunk_index, start_time, end_time, text_content FROM dev.chunks WHERE file_uuid='{UUID}' AND chunk_type='story' AND (summary_text IS NULL OR summary_text = '') ORDER BY chunk_index").split('\n') if l.strip() and '|' in l]
print(f"Chunks to process: {len(lines)}")

total = len(lines)
errors = 0
qdrant_failures = 0
for i, line in enumerate(lines):
    # psql -t -A output is pipe-separated; cap the split at 4 so that any '|'
    # inside the dialogue text stays part of the last field.
    parts = line.split('|', 4)
    cid, idx, st, et, dialogue = parts[0].strip(), int(parts[1]), float(parts[2]), float(parts[3]), parts[4] if len(parts) > 4 else ""

    if len(dialogue) < 10:
        # Too little text to summarise: store a placeholder and a zero vector.
        summary = "[no dialogue]"
        embedding = [0.0] * 768
    else:
        try:
            summary = call_llm(dialogue)
            time.sleep(0.3)
            embedding = call_embed(summary)
        except Exception as e:
            print(f"[{i+1}/{total}] Error: {cid} - {e}")
            errors += 1
            summary = "[error]"
            embedding = [0.0] * 768

    # Update DB
    s_esc = summary.replace("'", "''")
    psql(f"UPDATE dev.chunks SET summary_text='{s_esc}', updated_at=CURRENT_TIMESTAMP WHERE chunk_id='{cid}'")

    # Store in Qdrant
    point = json.dumps({"points": [{"id": idx + 1, "vector": embedding,
                                    "payload": {"chunk_id": cid, "file_uuid": UUID, "start_time": st, "end_time": et,
                                                "summary": summary, "type": "story_summary"}
                                    }]}).encode()
    req = urllib.request.Request(f"{QDRANT_URL}/collections/{QDRANT_COL}/points?wait=true",
                                 data=point, headers={"Content-Type": "application/json"}, method="PUT")
    try:
        # The upsert stays best-effort, but a failure is now reported instead of
        # being swallowed by a bare ``except`` (which also trapped Ctrl-C).
        with urllib.request.urlopen(req, timeout=10):
            pass
    except Exception as e:
        qdrant_failures += 1
        print(f"[{i+1}/{total}] WARNING: Qdrant upsert failed for {cid} - {e}")

    if (i+1) % 20 == 0:
        print(f"[{i+1}/{total}] {errors} errors so far")

print(f"\nDone. Processed: {total}, Errors: {errors}")
if qdrant_failures:
    print(f"Qdrant upsert failures: {qdrant_failures}")
print(f"Qdrant: {QDRANT_COL}")

View File

@@ -1 +0,0 @@
../v1.1/scripts/story_embed_v1.11.py

View File

@@ -1,230 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
Story Pipeline Full — Speaker + Story + Summary
Step 1: Update sentence chunks with speaker name
Step 2: Rebuild story chunks + re-embed
Step 3: LLM summary × 228 + embed
"""
import json, urllib.request, subprocess, sys, time, os
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
DIR = "/Users/accusys/momentry/output_dev"
PSQL = ["/Users/accusys/pgsql/18.3/bin/psql", "-U", "accusys", "-d", "momentry", "-t", "-A"]
LLM_URL = "http://localhost:8082/v1/chat/completions"
EMBED_URL = "http://localhost:11436/v1/embeddings"
QDRANT_URL = "http://localhost:6333/collections/momentry_dev_stories/points"
def psql(sql):
    """Execute *sql* with the psql CLI and hand back stdout without surrounding whitespace.

    ``PSQL`` supplies the executable and connection flags; the statement is
    passed with ``-c``. Neither the return code nor stderr is checked.
    """
    argv = [*PSQL, "-c", sql]
    outcome = subprocess.run(argv, capture_output=True, text=True, timeout=30)
    return outcome.stdout.strip()
def psql_file(path):
    """Feed the SQL script at *path* to psql and return the process exit code.

    When psql's stderr mentions ``ERROR`` the first 200 characters of it are
    printed; nothing is raised.
    """
    argv = [*PSQL, "-f", path]
    outcome = subprocess.run(argv, capture_output=True, text=True, timeout=60)
    complaints = outcome.stderr
    if complaints:
        if "ERROR" in complaints:
            print(f"SQL Error: {complaints[:200]}")
    return outcome.returncode
def embed_text(text):
    """Embed the first 1024 characters of *text* via the service at ``EMBED_URL``.

    Returns the vector found at ``data[0].embedding`` in the JSON reply.
    Network errors and lookup errors on an unexpected reply propagate.
    """
    body = json.dumps({"input": text[:1024]}).encode()
    req = urllib.request.Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    # Context manager guarantees the HTTP response is closed after reading.
    with urllib.request.urlopen(req, timeout=30) as resp:
        reply = json.loads(resp.read())
    return reply["data"][0]["embedding"]
def llm_summary(dialogue):
    """Ask the chat endpoint at ``LLM_URL`` for a concise ~50-word summary of *dialogue*.

    Returns the stripped ``content`` of the first choice. Network errors and
    lookup errors on an unexpected reply propagate to the caller.
    """
    body = json.dumps({
        "model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
        "messages": [{"role": "user", "content": f"Summarize concisely:\n{dialogue}\n\n50-word summary:"}],
        "temperature": 0.1, "max_tokens": 100,
    }).encode()
    req = urllib.request.Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
    # Close the response explicitly; the original left it to the garbage collector.
    with urllib.request.urlopen(req, timeout=120) as resp:
        reply = json.loads(resp.read())
    return reply["choices"][0]["message"]["content"].strip()
fps = 25.0
FILE_ID = 242

# ═══════════════════════════════════════════════════
# Step 0: Load ASR + ASRX + speaker map
# ═══════════════════════════════════════════════════
print("=" * 60)
print("Step 0: Loading data...")

# Read both transcripts through context managers so the file handles are
# closed right away; JSON is decoded as UTF-8 regardless of the locale.
with open(f"{DIR}/{UUID}.asr.json", encoding="utf-8") as asr_file:
    asr = json.load(asr_file)
segs = asr["segments"]
with open(f"{DIR}/{UUID}.asrx.json", encoding="utf-8") as asrx_file:
    asrx = json.load(asrx_file)
asrx_segs = asrx["segments"]

# Speaker map from identity_bindings: identity_value -> identity name,
# parsed from psql's pipe-separated output.
r = psql("SELECT ib.identity_value, i.name FROM dev.identity_bindings ib JOIN dev.identities i ON i.id=ib.identity_id WHERE ib.identity_type='speaker'")
speaker_map = {}
for line in r.strip().split('\n'):
    if line.strip() and '|' in line:
        p = line.split('|')
        speaker_map[p[0].strip()] = p[1].strip()
# Label used for segments with no bound speaker. Note this assignment also
# replaces any database binding for SPEAKER_0.
speaker_map["SPEAKER_0"] = "Speaker_0"
# ═══════════════════════════════════════════════════
# Step 1: Update sentence chunks with speaker
# ═══════════════════════════════════════════════════
print("\n" + "=" * 60)
print("Step 1: Updating sentence chunks with speaker...")

sql = ["BEGIN;"]
chunk_meta = {}  # idx → {speaker_id, speaker_name}
for idx, seg in enumerate(segs):
    st, et = seg["start"], seg["end"]
    text = seg["text"].strip()
    if not text:
        continue

    # Take the speaker of the first ASRX segment that fully contains this ASR
    # segment; segments with no containing ASRX segment keep SPEAKER_0.
    spk_id = "SPEAKER_0"
    for ax in asrx_segs:
        if ax.get("start_time", 0) <= st and ax.get("end_time", 0) >= et:
            spk_id = ax.get("speaker_id", "SPEAKER_0")
            break
    spk_name = speaker_map.get(spk_id, spk_id)

    new_text = f"[{spk_name}] {text}"
    meta = json.dumps({"speaker_id": spk_id, "speaker_name": spk_name})
    esc = new_text.replace("'", "''")
    # The JSON is embedded in a single-quoted SQL literal as well, so its
    # apostrophes (e.g. a speaker called O'Brien) must be doubled too.
    meta_esc = meta.replace("'", "''")
    sql.append(f"UPDATE dev.chunks SET text_content='{esc}', metadata='{meta_esc}'::jsonb WHERE file_uuid='{UUID}' AND chunk_id='{UUID}_{idx}';")
    chunk_meta[idx] = {"speaker_id": spk_id, "speaker_name": spk_name}

sql.append("COMMIT;")
with open("/tmp/s1_speaker.sql", "w") as f:
    f.write("\n".join(sql))
psql_file("/tmp/s1_speaker.sql")
print(f"  Updated {len(chunk_meta)} sentence chunks with speaker")
# ═══════════════════════════════════════════════════
# Step 2: Rebuild story chunks + re-embed
# ═══════════════════════════════════════════════════
print("\n" + "=" * 60)
print("Step 2: Rebuilding story chunks...")

# Delete old story chunks
psql(f"DELETE FROM dev.chunks WHERE file_uuid='{UUID}' AND chunk_type='story';")

# Recreate: every CHUNK_SIZE consecutive ASR segments form one story chunk.
CHUNK_SIZE = 15
sql2 = ["BEGIN;"]
story_meta = []
for i in range(0, len(segs), CHUNK_SIZE):
    group = segs[i:i+CHUNK_SIZE]
    st, et = group[0]["start"], group[-1]["end"]
    idx = i // CHUNK_SIZE
    chunk_id = f"{UUID}_story_{idx}"

    # Build speaker text from individual sentences; only segments recorded in
    # chunk_meta with non-empty text contribute.
    texts = []
    speakers_used = {}
    for j, seg in enumerate(group):
        seg_idx = i + j
        if seg_idx in chunk_meta:
            cm = chunk_meta[seg_idx]
            text = seg["text"].strip()
            if text:
                texts.append(f"[{cm['speaker_name']}] {text}")
                speakers_used[cm['speaker_name']] = speakers_used.get(cm['speaker_name'], 0) + 1

    dialogue = " ".join(texts)
    child_ids = ", ".join([f"'{UUID}_{j}'" for j in range(i, min(i+CHUNK_SIZE, len(segs)))])
    words = sum(len(t.split()) for t in texts)
    meta = json.dumps({"method": "fixed_15", "seg_count": len(group), "words": words, "speakers": speakers_used})
    esc = dialogue.replace("'", "''")
    # Speaker names are keys of the JSON metadata; double any apostrophes so
    # the single-quoted SQL literal stays well-formed.
    meta_esc = meta.replace("'", "''")

    sql2.append(f"""INSERT INTO dev.chunks (file_id,file_uuid,chunk_id,old_chunk_id,chunk_index,chunk_type,start_time,end_time,fps,start_frame,end_frame,text_content,content,metadata,frame_count,child_chunk_ids)
VALUES ({FILE_ID},'{UUID}','{chunk_id}','{chunk_id}',{idx},'story',{st},{et},{fps},{int(st*fps)},{int(et*fps)},'{esc}','{{"type":"story_parent"}}'::jsonb,'{meta_esc}'::jsonb,{int((et-st)*fps)},ARRAY[{child_ids}]);""")
    story_meta.append({"idx": idx, "st": st, "et": et, "dialogue": dialogue, "words": words, "speakers": speakers_used})

sql2.append("COMMIT;")
with open("/tmp/s2_story.sql", "w") as f:
    f.write("\n".join(sql2))
psql_file("/tmp/s2_story.sql")
print(f"  Created {len(story_meta)} story chunks")

# Embed + upsert to Qdrant
print("\n  Embedding story chunks...")
points_dialogue = []
for sm in story_meta:
    if len(sm["dialogue"]) < 10:
        continue
    vec = embed_text(sm["dialogue"])
    points_dialogue.append({"id": sm["idx"] + 1, "vector": vec, "payload": {
        "chunk_id": f"{UUID}_story_{sm['idx']}", "file_uuid": UUID,
        "start_time": sm["st"], "end_time": sm["et"], "type": "story_dialogue"
    }})

# Upsert in batches of 100 points per request.
for i in range(0, len(points_dialogue), 100):
    batch = points_dialogue[i:i+100]
    data = json.dumps({"points": batch, "wait": True}).encode()
    req = urllib.request.Request(f"{QDRANT_URL}?wait=true", data=data, headers={"Content-Type": "application/json"}, method="PUT")
    # Close each HTTP response instead of leaking one connection per batch.
    with urllib.request.urlopen(req, timeout=30):
        pass
print(f"  Qdrant: {len(points_dialogue)} dialogue vectors")
# ═══════════════════════════════════════════════════
# Step 3: LLM summaries + embed
# ═══════════════════════════════════════════════════
print("\n" + "=" * 60)
print("Step 3: LLM summaries...")

points_summary = []
summary_sql = ["BEGIN;"]
for i, sm in enumerate(story_meta):
    if len(sm["dialogue"]) < 10:
        continue

    try:
        summary = llm_summary(sm["dialogue"])
        time.sleep(0.3)
        vec = embed_text(summary)
        time.sleep(0.1)
    except Exception as e:
        # Keep going: the chunk gets a placeholder summary and a zero vector.
        print(f"  Error on story {sm['idx']}: {e}")
        summary = "[error]"
        vec = [0.0] * 768

    s_esc = summary.replace("'", "''")
    summary_sql.append(f"UPDATE dev.chunks SET summary_text='{s_esc}', updated_at=CURRENT_TIMESTAMP WHERE file_uuid='{UUID}' AND chunk_id='{UUID}_story_{sm['idx']}';")

    # Summary points use ids offset by 100000, apart from the dialogue points
    # which use idx + 1.
    points_summary.append({"id": 100000 + sm["idx"] + 1, "vector": vec, "payload": {
        "chunk_id": f"{UUID}_story_{sm['idx']}", "file_uuid": UUID,
        "start_time": sm["st"], "end_time": sm["et"],
        "summary": summary, "type": "story_summary"
    }})

    if (i + 1) % 50 == 0:
        print(f"  {i+1}/{len(story_meta)}")

# Update DB with summaries
summary_sql.append("COMMIT;")
with open("/tmp/s3_summary.sql", "w") as f:
    f.write("\n".join(summary_sql))
psql_file("/tmp/s3_summary.sql")

# Upsert summary vectors to Qdrant
for i in range(0, len(points_summary), 100):
    batch = points_summary[i:i+100]
    data = json.dumps({"points": batch, "wait": True}).encode()
    req = urllib.request.Request(f"{QDRANT_URL}?wait=true", data=data, headers={"Content-Type": "application/json"}, method="PUT")
    # Close each HTTP response instead of leaking one connection per batch.
    with urllib.request.urlopen(req, timeout=30):
        pass
print(f"  Qdrant: {len(points_summary)} summary vectors")
# ═══════════════════════════════════════════════════
# Step 4: Verify
# ═══════════════════════════════════════════════════
print("\n" + "=" * 60)
print("Done.")

# Three count queries over this file's chunks; they share everything up to
# the chunk_type value.
count_prefix = f"SELECT count(*) FROM dev.chunks WHERE file_uuid='{UUID}' AND chunk_type="
count_suffixes = (
    "'sentence' AND text_content LIKE '[%'",
    "'story'",
    "'story' AND summary_text IS NOT NULL",
)
r1, r2, r3 = [psql(count_prefix + suffix) for suffix in count_suffixes]

for label, count in (
    ("Sentence chunks with speaker", r1),
    ("Story chunks", r2),
    ("Story chunks with summary", r3),
):
    print(f"{label}: {count}")

View File

@@ -1 +0,0 @@
../v1.1/scripts/story_pipeline_full_v1.11.py

View File

@@ -1,325 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
Story Processor - Generate parent-child chunk hierarchy for RAG
Uses LOCAL video analysis (ASR, YOLO, OCR, Scene) to create parent chunks.
NO cloud API calls - fully offline processing
"""
import sys
import json
import os
import argparse
from typing import Dict, List, Any
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from redis_publisher import RedisPublisher
def extract_video_metadata(video_path: str) -> Dict[str, Any]:
    """Return ffprobe's JSON description (format + streams) of *video_path*.

    Any failure -- ffprobe missing, a non-zero exit status, unparsable
    output -- yields an empty dict instead of an exception.
    """
    import subprocess

    probe_cmd = ["ffprobe", "-v", "quiet", "-print_format", "json"]
    probe_cmd += ["-show_format", "-show_streams", video_path]
    try:
        probe = subprocess.run(probe_cmd, capture_output=True, text=True)
        if probe.returncode != 0:
            return {}
        return json.loads(probe.stdout)
    except Exception:
        # Metadata is optional; fall back to "nothing known".
        return {}
def generate_parent_child_chunks(
    asr_data: Dict,
    cut_data: Dict,
    yolo_data: Dict,
    ocr_data: Dict,
    scene_data: Dict,
    parent_chunk_size: int = 5,
) -> Dict:
    """
    Generate parent-child chunk hierarchy using LOCAL data only.
    No LLM/API calls - uses template-based narrative generation.

    Child chunks are created from every ASR segment and every CUT scene.
    Consecutive children of the same source are then grouped,
    ``parent_chunk_size`` at a time, under a "story" parent chunk whose text is
    a template narrative. ``ocr_data`` and ``scene_data`` are accepted for
    interface compatibility but are not used.

    Returns a dict with ``child_chunks``, ``parent_chunks`` and ``stats``.
    """
    child_chunks = []
    parent_chunks = []

    # Create child chunks from ASR
    for seg in asr_data.get("segments", []):
        child_chunks.append(
            {
                "chunk_id": f"asr_{seg.get('start', 0):.1f}_{seg.get('end', 0):.1f}",
                "chunk_type": "asr",
                "source": "asr",
                "start_time": seg.get("start", 0),
                "end_time": seg.get("end", 0),
                "text_content": seg.get("text", ""),
                "content": {
                    "text": seg.get("text", ""),
                    "confidence": seg.get("confidence", 0),
                },
                "child_chunk_ids": [],
                "parent_chunk_id": None,
            }
        )

    # Create child chunks from CUT scenes
    for scene in cut_data.get("scenes", []):
        child_chunks.append(
            {
                "chunk_id": f"cut_{scene.get('scene_number', 0)}",
                "chunk_type": "cut",
                "source": "cut",
                "start_time": scene.get("start_time", 0),
                "end_time": scene.get("end_time", 0),
                "text_content": f"Scene {scene.get('scene_number', 0)}",
                "content": {
                    "scene_number": scene.get("scene_number", 0),
                    "duration": scene.get("duration", 0),
                },
                "child_chunk_ids": [],
                "parent_chunk_id": None,
            }
        )

    asr_child_ids = [c["chunk_id"] for c in child_chunks if c["source"] == "asr"]
    cut_child_ids = [c["chunk_id"] for c in child_chunks if c["source"] == "cut"]

    yolo_frames = yolo_data.get("frames", [])

    # Index children by id once instead of rescanning the whole list for every
    # id. setdefault keeps the FIRST chunk for a repeated id, which is the one
    # a linear scan with an early break would have found.
    child_by_id = {}
    for child in child_chunks:
        child_by_id.setdefault(child["chunk_id"], child)

    def _objects_between(start_time, end_time):
        """Collect class names of objects in YOLO frames inside the time range."""
        # NOTE(review): only the first 50 YOLO frames are ever inspected, so
        # groups later in the video may get no objects -- confirm this cap is
        # intentional.
        found = []
        for frame in yolo_frames[:50]:
            ts = frame.get("timestamp", 0)
            if start_time <= ts <= end_time:
                for obj in frame.get("objects", []):
                    found.append(obj.get("class_name", "unknown"))
        return found

    # Group ASR segments into parent chunks
    for i in range(0, len(asr_child_ids), parent_chunk_size):
        batch = asr_child_ids[i : i + parent_chunk_size]
        if not batch:
            continue

        members = [child_by_id[child_id] for child_id in batch]
        # Empty transcripts are left out of the narrative text but still count
        # towards the group's time span.
        batch_texts = [m["text_content"] for m in members if m["text_content"]]
        batch_times = [(m["start_time"], m["end_time"]) for m in members]

        start_time = batch_times[0][0] if batch_times else 0
        end_time = batch_times[-1][1] if batch_times else 0

        # Find objects in this time range
        batch_objects = _objects_between(start_time, end_time)

        narrative = generate_narrative(batch_texts, batch_objects, start_time, end_time)

        parent_chunk = {
            "chunk_id": f"story_asr_{i // parent_chunk_size:04d}",
            "chunk_type": "story",
            "source": "story_asr",
            "start_time": start_time,
            "end_time": end_time,
            "text_content": narrative,
            "content": {
                "description": narrative,
                "child_count": len(batch),
                "speech_preview": " ".join(batch_texts[:3]) if batch_texts else None,
                "detected_objects": list(set(batch_objects))[:5],
            },
            "child_chunk_ids": batch,
            "parent_chunk_id": None,
        }
        parent_chunks.append(parent_chunk)

        # Link every child in the group back to its parent.
        for member in members:
            member["parent_chunk_id"] = parent_chunk["chunk_id"]

    # Group CUT scenes into parent chunks
    for i in range(0, len(cut_child_ids), parent_chunk_size):
        batch = cut_child_ids[i : i + parent_chunk_size]
        if not batch:
            continue

        members = [child_by_id[child_id] for child_id in batch]
        batch_times = [(m["start_time"], m["end_time"]) for m in members]

        start_time = batch_times[0][0] if batch_times else 0
        end_time = batch_times[-1][1] if batch_times else 0

        batch_objects = _objects_between(start_time, end_time)

        narrative = generate_scene_narrative(
            batch_objects, start_time, end_time, len(batch)
        )

        parent_chunk = {
            "chunk_id": f"story_cut_{i // parent_chunk_size:04d}",
            "chunk_type": "story",
            "source": "story_cut",
            "start_time": start_time,
            "end_time": end_time,
            "text_content": narrative,
            "content": {
                "description": narrative,
                "child_count": len(batch),
                "scenes": batch,
                "detected_objects": list(set(batch_objects))[:5],
            },
            "child_chunk_ids": batch,
            "parent_chunk_id": None,
        }
        parent_chunks.append(parent_chunk)

        for member in members:
            member["parent_chunk_id"] = parent_chunk["chunk_id"]

    return {
        "child_chunks": child_chunks,
        "parent_chunks": parent_chunks,
        "stats": {
            "total_child_chunks": len(child_chunks),
            "total_parent_chunks": len(parent_chunks),
            "asr_children": len(asr_child_ids),
            "cut_children": len(cut_child_ids),
        },
    }
def generate_narrative(
    texts: List[str], objects: List[str], start: float, end: float
) -> str:
    """Build a one-line template description of a segment from local data.

    Up to five text snippets are joined into a "Speech" section (cut to 150
    characters plus an ellipsis) and up to five distinct object names into a
    "Visuals" section. With neither texts nor objects a generic sentence with
    the time range is returned.
    """
    if not (texts or objects):
        return f"Video segment from {start:.1f}s to {end:.1f}s"

    sections = []
    if texts:
        speech = " ".join(texts[:5])
        speech = speech[:150] + "..." if len(speech) > 150 else speech
        sections.append(f"Speech: {speech}")
    if objects:
        distinct = list(set(objects))[:5]
        sections.append(f"Visuals: {', '.join(distinct)}")

    body = " | ".join(sections)
    return f"[{start:.0f}s-{end:.0f}s] {body}"
def generate_scene_narrative(
    objects: List[str], start: float, end: float, scene_count: int
) -> str:
    """Describe a group of scenes by time range, scene count and up to five distinct objects."""
    distinct = list(set(objects))[:5]
    span = f"[{start:.0f}s-{end:.0f}s]"
    if not distinct:
        return f"{span} {scene_count} video scenes."
    return f"{span} {scene_count} scenes. Visuals: {', '.join(distinct)}."
def run_story(
    video_path: str, output_path: str, uuid: str = "", parent_chunk_size: int = 5
):
    """Build the story chunk hierarchy for one video and write it to *output_path*.

    Sibling analysis files are looked up next to *output_path* as
    ``<name>.<kind>.json`` for kind in asr/cut/yolo/ocr/scene, where ``<name>``
    is the part of the output file name before its first dot. Missing files
    leave the corresponding empty default in place. When *uuid* is non-empty,
    start and completion are reported through ``RedisPublisher``.

    Returns the result dict that was written to disk.
    """
    publisher = RedisPublisher(uuid) if uuid else None
    if publisher:
        publisher.info("story", "STORY_START")

    base_path = os.path.dirname(output_path)
    uuid_name = os.path.basename(output_path).split(".")[0]

    asr_data = {"segments": []}
    cut_data = {"scenes": []}
    yolo_data = {"frames": []}
    ocr_data = {"frames": []}
    scene_data = {"scenes": []}

    for name, data_var in [
        ("asr", asr_data),
        ("cut", cut_data),
        ("yolo", yolo_data),
        ("ocr", ocr_data),
        ("scene", scene_data),
    ]:
        path = os.path.join(base_path, f"{uuid_name}.{name}.json")
        if os.path.exists(path):
            # JSON interchange files are UTF-8; do not depend on the locale.
            with open(path, encoding="utf-8") as f:
                data_var.update(json.load(f))

    result = generate_parent_child_chunks(
        asr_data, cut_data, yolo_data, ocr_data, scene_data, parent_chunk_size
    )

    result["video_metadata"] = extract_video_metadata(video_path)
    result["processing"] = {
        "method": "local_aggregation",
        "cloud_api_used": False,
        "parent_chunk_size": parent_chunk_size,
    }

    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly; with a non-UTF-8 locale default the dump
    # could fail with UnicodeEncodeError.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    if publisher:
        publisher.complete(
            "story",
            f"{result['stats']['total_parent_chunks']} parent, {result['stats']['total_child_chunks']} child chunks (LOCAL)",
        )

    return result
if __name__ == "__main__":
    # Command-line entry point: parse arguments, run the pipeline, print a summary.
    cli = argparse.ArgumentParser(
        description="Story Processor - Parent-Child Chunk Hierarchy (LOCAL ONLY)"
    )
    cli.add_argument("video_path", help="Path to video file")
    cli.add_argument("output_path", help="Output JSON path")
    cli.add_argument("--uuid", help="UUID for progress tracking", default="")
    cli.add_argument(
        "--parent-chunk-size",
        type=int,
        default=5,
        help="Number of child chunks per parent",
    )
    options = cli.parse_args()

    result = run_story(
        options.video_path,
        options.output_path,
        options.uuid,
        options.parent_chunk_size,
    )

    parent_total = result["stats"]["total_parent_chunks"]
    child_total = result["stats"]["total_child_chunks"]
    print(f"Story generated: {parent_total} parent, {child_total} child chunks (LOCAL)")

View File

@@ -1,848 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
Story Processor - AI-Driven Processor Contract Version 1.0
Compliant with AI-Driven Processor Contract v1.0
Effective Date: 2025-03-27
Features:
1. Standardized command-line interface
2. Redis progress reporting
3. Signal handling (SIGTERM, SIGINT)
4. Health check mode
5. Resource monitoring
6. Contract-compliant JSON output
7. Unified configuration
"""
import sys
import json
import os
import argparse
import signal
import time
import traceback
from datetime import datetime
from typing import Dict, Any, List
# Redis Publisher for progress reporting
try:
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from redis_publisher import RedisPublisher
REDIS_AVAILABLE = True
except ImportError:
REDIS_AVAILABLE = False
print(
"WARNING: RedisPublisher not available, progress reporting disabled",
file=sys.stderr,
)
# Contract version
CONTRACT_VERSION = "1.0"
PROCESSOR_NAME = (
"/Users/accusys/momentry_core_0.1/scripts/story_processor_contract_v1.py"
)
PROCESSOR_VERSION = "1.0.0"
MODEL_NAME = "gpt-4"
MODEL_VERSION = "latest"
# Unified configuration defaults
DEFAULT_TIMEOUT = 3600 # 1 hour for story generation
DEFAULT_PARENT_CHUNK_SIZE = 5
DEFAULT_MIN_CHILD_CHUNKS = 3
DEFAULT_MAX_CHILD_CHUNKS = 10
DEFAULT_SUMMARY_LENGTH = 150
DEFAULT_MODEL = "openai" # openai, local, or template
DEFAULT_MODEL_NAME = "gpt-4"
DEFAULT_TEMPERATURE = 0.7
DEFAULT_MAX_TOKENS = 500
# Signal handling with timeout support
class SignalHandler:
    """Record SIGTERM/SIGINT so the processing loop can shut down gracefully."""

    def __init__(self):
        # No signal seen yet; exit_code is filled in by handle_signal.
        self.should_exit = False
        self.exit_code = 0
        for watched in (signal.SIGTERM, signal.SIGINT):
            signal.signal(watched, self.handle_signal)

    def handle_signal(self, signum, frame):
        """Note that signal *signum* arrived and derive the exit code 128 + signum."""
        print(f"\n收到信号 {signum},正在优雅关闭...")
        self.should_exit = True
        self.exit_code = 128 + signum

    def should_stop(self):
        """Return True once a termination signal has been received."""
        return self.should_exit
# Timeout manager
class TimeoutManager:
    """Track how much of a fixed time budget (in seconds) is left since construction."""

    def __init__(self, timeout_seconds: int):
        self.timeout_seconds = timeout_seconds
        self.start_time = time.time()
        self.timer = None

    def check_timeout(self) -> bool:
        """Return True when more than ``timeout_seconds`` have elapsed."""
        return (time.time() - self.start_time) > self.timeout_seconds

    def get_remaining_time(self) -> float:
        """Return the seconds left in the budget, never below zero."""
        spent = time.time() - self.start_time
        return max(0, self.timeout_seconds - spent)

    def format_remaining_time(self) -> str:
        """Render the remaining time as zero-padded HH:MM:SS."""
        whole_seconds = int(self.get_remaining_time())
        hours, leftover = divmod(whole_seconds, 3600)
        minutes, seconds = divmod(leftover, 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
# Health check functions
def check_environment() -> Dict[str, Any]:
    """Report which dependencies are present, together with processor identity fields.

    The ``checks`` list always has three entries, in this order: ``openai``
    ("available" with its version, or "optional" when it cannot be imported),
    ``redis`` (driven by ``REDIS_AVAILABLE``) and ``python`` (the running
    interpreter version).
    """
    # Check 1: OpenAI API (optional)
    try:
        import openai
    except ImportError:
        openai_check = {"name": "openai", "status": "optional", "version": None}
    else:
        openai_check = {
            "name": "openai",
            "status": "available",
            "version": openai.__version__,
        }

    # Check 2: Redis (optional)
    redis_check = {
        "name": "redis",
        "status": "available" if REDIS_AVAILABLE else "optional",
        "version": None,
    }

    # Check 3: Python version
    py = sys.version_info
    python_check = {
        "name": "python",
        "status": "available",
        "version": f"{py.major}.{py.minor}.{py.micro}",
    }

    return {
        "timestamp": datetime.now().isoformat(),
        "processor_name": PROCESSOR_NAME,
        "processor_version": PROCESSOR_VERSION,
        "contract_version": CONTRACT_VERSION,
        "model_name": MODEL_NAME,
        "model_version": MODEL_VERSION,
        "checks": [openai_check, redis_check, python_check],
    }
def check_input_files(input_files: Dict[str, str]) -> Dict[str, Any]:
    """Check that each input path exists and holds JSON of the expected shape.

    *input_files* maps a file type to a path. For the types asr/cut/yolo/ocr
    the parsed document must be a dict containing the key
    segments/scenes/detections/texts respectively; any other type only needs
    to parse to a dict. Returns one status dict per file type.
    """
    # Top-level key each known file type must contain.
    required_key = {
        "asr": "segments",
        "cut": "scenes",
        "yolo": "detections",
        "ocr": "texts",
    }

    def absent(reason):
        return {"exists": False, "valid": False, "error": reason}

    def broken(reason):
        return {"exists": True, "valid": False, "error": reason}

    results = {}
    for file_type, file_path in input_files.items():
        if not file_path:
            results[file_type] = absent("No path provided")
            continue
        if not os.path.exists(file_path):
            results[file_type] = absent("File not found")
            continue

        try:
            with open(file_path, "r") as handle:
                data = json.load(handle)

            # Basic validation based on file type
            is_mapping = isinstance(data, dict)
            needed = required_key.get(file_type)
            valid = is_mapping and (needed is None or needed in data)

            results[file_type] = {
                "exists": True,
                "valid": valid,
                "size": os.path.getsize(file_path),
                "data_keys": list(data.keys()) if is_mapping else [],
            }
        except json.JSONDecodeError as e:
            results[file_type] = broken(f"Invalid JSON: {e}")
        except Exception as e:
            results[file_type] = broken(str(e))

    return results
def load_input_data(input_files: Dict[str, str]) -> Dict[str, Any]:
    """Load each input JSON file, mapping file type to the parsed document.

    A type whose path is empty, missing on disk, unreadable or not valid JSON
    maps to ``None``; the function itself never raises for a bad input file.
    """
    data = {}
    for file_type, file_path in input_files.items():
        if not file_path or not os.path.exists(file_path):
            data[file_type] = None
            continue

        try:
            # JSON interchange files are UTF-8; do not depend on the locale.
            with open(file_path, "r", encoding="utf-8") as f:
                data[file_type] = json.load(f)
        except Exception:
            # Stay best-effort, but no longer use a bare ``except`` that would
            # also swallow SystemExit and KeyboardInterrupt.
            data[file_type] = None

    return data
def generate_parent_child_chunks(
    asr_data: Dict,
    cut_data: Dict,
    yolo_data: Dict,
    ocr_data: Dict,
    parent_chunk_size: int = DEFAULT_PARENT_CHUNK_SIZE,
    min_child_chunks: int = DEFAULT_MIN_CHILD_CHUNKS,
    max_child_chunks: int = DEFAULT_MAX_CHILD_CHUNKS,
    summary_length: int = DEFAULT_SUMMARY_LENGTH,
    model: str = DEFAULT_MODEL,
    **kwargs,
) -> List[Dict[str, Any]]:
    """Group ASR, YOLO and OCR items into summarised parent chunks for RAG.

    When CUT scenes are available, each scene becomes a candidate parent that
    collects the ASR segments, YOLO detections and OCR texts whose timestamp
    falls inside the scene (at most ``max_child_chunks`` of each kind).
    Without scenes, ASR segments are grouped into fixed 30-second windows
    instead. Candidates with fewer than ``min_child_chunks`` children are
    dropped. Summaries are cut to ``summary_length`` characters when that is
    positive, and only the first ``parent_chunk_size`` children are kept in
    the output. ``model`` picks the summariser for the scene path ("openai",
    "local", anything else = template); the window path always uses the
    template summariser. Extra keyword arguments are forwarded to the
    openai/local summarisers.
    """
    asr_segments = asr_data.get("segments", []) if asr_data else []
    scenes = cut_data.get("scenes", []) if cut_data else []
    yolo_detections = yolo_data.get("detections", []) if yolo_data else []
    ocr_texts = ocr_data.get("texts", []) if ocr_data else []

    def asr_child(segment):
        """Child chunk for one ASR segment."""
        return {
            "type": "asr",
            "content": segment.get("text", ""),
            "start_time": segment.get("start", 0),
            "end_time": segment.get("end", 0),
            "confidence": segment.get("confidence", 0),
            "metadata": {"speaker": segment.get("speaker")},
        }

    def yolo_child(detection):
        """Child chunk for one YOLO detection."""
        return {
            "type": "yolo",
            "content": f"Detected {detection.get('class', 'object')} with confidence {detection.get('confidence', 0):.2f}",
            "timestamp": detection.get("timestamp", 0),
            "confidence": detection.get("confidence", 0),
            "metadata": {
                "class": detection.get("class"),
                "bbox": detection.get("bbox"),
            },
        }

    def ocr_child(text):
        """Child chunk for one OCR text."""
        return {
            "type": "ocr",
            "content": text.get("text", ""),
            "timestamp": text.get("timestamp", 0),
            "confidence": text.get("confidence", 0),
            "metadata": {
                "bbox": text.get("bbox"),
                "language": text.get("language"),
            },
        }

    def clip(summary):
        """Apply the summary length limit; a non-positive limit disables it."""
        return summary[:summary_length] if summary_length > 0 else summary

    parent_chunks = []

    if scenes:
        # Scene-driven grouping.
        for scene in scenes:
            scene_start = scene.get("start_time", 0)
            scene_end = scene.get("end_time", 0)
            scene_duration = scene.get("duration", 0)

            # Items whose (start) timestamp lies within the scene, bounds included.
            scene_asr_segments = [
                segment
                for segment in asr_segments
                if scene_start <= segment.get("start", 0) <= scene_end
            ]
            scene_yolo_detections = [
                detection
                for detection in yolo_detections
                if scene_start <= detection.get("timestamp", 0) <= scene_end
            ]
            scene_ocr_texts = [
                text
                for text in ocr_texts
                if scene_start <= text.get("timestamp", 0) <= scene_end
            ]

            # Children are ordered ASR first, then YOLO, then OCR.
            child_chunks = [
                asr_child(segment) for segment in scene_asr_segments[:max_child_chunks]
            ]
            child_chunks.extend(
                yolo_child(detection)
                for detection in scene_yolo_detections[:max_child_chunks]
            )
            child_chunks.extend(
                ocr_child(text) for text in scene_ocr_texts[:max_child_chunks]
            )

            # Skip if not enough child chunks
            if len(child_chunks) < min_child_chunks:
                continue

            # Generate parent summary
            if model == "openai":
                parent_summary = generate_openai_summary(child_chunks, scene, **kwargs)
            elif model == "local":
                parent_summary = generate_local_summary(child_chunks, scene, **kwargs)
            else:
                parent_summary = generate_template_summary(child_chunks, scene)

            parent_chunks.append(
                {
                    "parent_id": len(parent_chunks) + 1,
                    "scene_id": scene.get("scene_id", 0),
                    "start_time": scene_start,
                    "end_time": scene_end,
                    "duration": scene_duration,
                    "summary": clip(parent_summary),
                    "child_count": len(child_chunks),
                    "child_types": list(set(chunk["type"] for chunk in child_chunks)),
                    # Limit child chunks in output
                    "child_chunks": child_chunks[:parent_chunk_size],
                }
            )

    elif asr_segments:
        # No scenes: fall back to fixed time windows over the ASR segments.
        time_window = 30  # seconds
        last_end = asr_segments[-1].get("end", 0)
        current_window = 0

        while current_window * time_window < last_end:
            window_start = current_window * time_window
            window_end = (current_window + 1) * time_window

            # Segments starting inside [window_start, window_end)
            window_segments = [
                segment
                for segment in asr_segments
                if window_start <= segment.get("start", 0) < window_end
            ]

            if len(window_segments) >= min_child_chunks:
                child_chunks = [
                    asr_child(segment) for segment in window_segments[:max_child_chunks]
                ]

                parent_summary = generate_template_summary(
                    child_chunks,
                    {
                        "start_time": window_start,
                        "end_time": window_end,
                        "duration": time_window,
                    },
                )

                parent_chunks.append(
                    {
                        "parent_id": len(parent_chunks) + 1,
                        "time_window": current_window,
                        "start_time": window_start,
                        "end_time": window_end,
                        "duration": time_window,
                        "summary": clip(parent_summary),
                        "child_count": len(child_chunks),
                        "child_types": ["asr"],
                        "child_chunks": child_chunks[:parent_chunk_size],
                    }
                )

            current_window += 1

    return parent_chunks
def generate_openai_summary(child_chunks: List[Dict], scene: Dict, **kwargs) -> str:
    """Summarize a scene by sending its child chunks to the OpenAI chat API.

    Only the first 10 child chunks are considered, and only chunks whose
    "type" is "asr", "yolo" or "ocr" contribute a line to the prompt context.

    Args:
        child_chunks: Chunk dicts, read through their "type" and "content" keys.
        scene: Scene dict; its optional "duration" is quoted in the prompt.
        **kwargs: Optional "model_name", "max_tokens" and "temperature"
            overrides. The module-level DEFAULT_* values are used otherwise.

    Returns:
        The message content of the first completion choice. If the openai
        package cannot be imported, or any other Exception occurs, a
        descriptive message string is returned instead of raising.
    """
    try:
        import openai

        # (chunk type, label used in the prompt) pairs, checked in order.
        prompt_labels = (("asr", "Speech"), ("yolo", "Visual"), ("ocr", "Text"))
        lines = []
        for chunk in child_chunks[:10]:  # Limit context size
            for kind, label in prompt_labels:
                if chunk["type"] == kind:
                    lines.append(f"{label}: {chunk['content']}")
                    break
        context = "\n".join(lines)

        duration = scene.get("duration", 0)
        prompt = f"""Summarize this video scene ({duration:.1f} seconds) based on the following elements:
{context}
Provide a concise narrative summary that connects the speech, visual elements, and text into a coherent description."""

        messages = [
            {
                "role": "system",
                "content": "You are a video analysis assistant that creates coherent narrative summaries from multiple data sources.",
            },
            {"role": "user", "content": prompt},
        ]
        completion = openai.chat.completions.create(
            model=kwargs.get("model_name", DEFAULT_MODEL_NAME),
            messages=messages,
            max_tokens=kwargs.get("max_tokens", DEFAULT_MAX_TOKENS),
            temperature=kwargs.get("temperature", DEFAULT_TEMPERATURE),
        )
        return completion.choices[0].message.content
    except ImportError:
        return "OpenAI not available for summary generation"
    except Exception as e:
        return f"Summary generation error: {str(e)}"
def generate_local_summary(child_chunks: List[Dict], scene: Dict, **kwargs) -> str:
    """Return a placeholder summary for the not-yet-implemented local model.

    The text only reports the scene duration and how many child chunks are of
    type "asr", "yolo" and "ocr". ``kwargs`` is accepted but not used.

    Args:
        child_chunks: Chunk dicts; every chunk must have a "type" key.
        scene: Scene dict; its optional "duration" defaults to 0.

    Returns:
        A one-sentence description ending with a note that the local summary
        model is not implemented.
    """
    kinds = [chunk["type"] for chunk in child_chunks]
    speech_total = kinds.count("asr")
    visual_total = kinds.count("yolo")
    text_total = kinds.count("ocr")
    duration = scene.get("duration", 0)
    return (
        f"Scene ({duration:.1f}s) with {speech_total} speech segments, "
        f"{visual_total} visual detections, and {text_total} text elements. "
        "Local summary model not implemented."
    )
def generate_template_summary(child_chunks: List[Dict], scene: Dict) -> str:
    """Build a deterministic, template-based summary of a scene.

    The summary states the scene duration, then (when present) the number of
    speech segments with up to two 50-character samples, the number of
    detected objects with up to three class names, and the number of OCR
    text elements.

    Args:
        child_chunks: Chunk dicts. "type" selects the category ("asr", "yolo",
            "ocr"); "content" is sampled for speech; ``metadata["class"]``
            names a detected object. Missing keys are tolerated.
        scene: Scene dict; its optional "duration" defaults to 0.

    Returns:
        The summary sentences joined by single spaces.
    """
    asr_chunks = [chunk for chunk in child_chunks if chunk.get("type") == "asr"]
    yolo_chunks = [chunk for chunk in child_chunks if chunk.get("type") == "yolo"]
    ocr_count = sum(1 for chunk in child_chunks if chunk.get("type") == "ocr")

    # Up to two short speech samples; a missing or null content becomes "".
    asr_samples = [str(chunk.get("content") or "")[:50] for chunk in asr_chunks[:2]]

    # De-duplicate class names in first-seen order. A set would make the three
    # names reported below depend on hash randomization, so the same input
    # could produce different summaries on different runs.
    seen_classes = {}
    for chunk in yolo_chunks:
        metadata = chunk.get("metadata") or {}
        label = metadata.get("class")
        seen_classes[str(label) if label is not None else "object"] = None
    yolo_classes = list(seen_classes)

    summary_parts = [f"Scene duration: {scene.get('duration', 0):.1f} seconds."]
    if asr_chunks:
        summary_parts.append(f"Contains {len(asr_chunks)} speech segments.")
        if asr_samples:
            summary_parts.append(f"Sample speech: {'; '.join(asr_samples)}...")
    if yolo_chunks:
        summary_parts.append(
            f"Detected {len(yolo_chunks)} objects including: {', '.join(yolo_classes[:3])}."
        )
    if ocr_count > 0:
        summary_parts.append(f"Extracted {ocr_count} text elements from the video.")
    return " ".join(summary_parts)
# Main processing function
def process_story(
    asr_path: str,
    cut_path: str,
    yolo_path: str,
    ocr_path: str,
    output_path: str,
    uuid: str = "",
    parent_chunk_size: int = DEFAULT_PARENT_CHUNK_SIZE,
    min_child_chunks: int = DEFAULT_MIN_CHILD_CHUNKS,
    max_child_chunks: int = DEFAULT_MAX_CHILD_CHUNKS,
    summary_length: int = DEFAULT_SUMMARY_LENGTH,
    model: str = DEFAULT_MODEL,
    model_name: str = DEFAULT_MODEL_NAME,
    temperature: float = DEFAULT_TEMPERATURE,
    max_tokens: int = DEFAULT_MAX_TOKENS,
    timeout: int = DEFAULT_TIMEOUT,
) -> Dict[str, Any]:
    """Process video analysis data to create parent-child chunk hierarchy.

    Validates and loads the ASR/CUT/YOLO/OCR inputs, generates the parent
    chunks, computes chunk statistics, and writes the full result dict to
    ``output_path`` as UTF-8 JSON. Progress is published through a
    ``RedisPublisher`` when Redis is available and ``uuid`` is non-empty.

    Args:
        asr_path: Path to the ASR JSON file; a valid ASR file is mandatory.
        cut_path: Path to the CUT JSON file.
        yolo_path: Path to the YOLO JSON file.
        ocr_path: Path to the OCR JSON file.
        output_path: Where the result JSON is written.
        uuid: Identifier used for progress publishing; empty disables it.
        parent_chunk_size, min_child_chunks, max_child_chunks, summary_length,
        model, model_name, temperature, max_tokens: Forwarded unchanged to
            ``generate_parent_child_chunks`` and echoed under "parameters".
        timeout: Timeout in seconds handed to ``TimeoutManager``.

    Returns:
        The result dict. ``result["success"]`` is True only when the chunks
        were generated AND the result file was written; otherwise
        ``result["error"]`` describes the failure. Errors raised during the
        processing stage are recorded in the dict rather than propagated.
    """
    # Initialize
    signal_handler = SignalHandler()
    timeout_manager = TimeoutManager(timeout)
    publisher = None
    if REDIS_AVAILABLE and uuid:
        try:
            publisher = RedisPublisher(uuid)
        except Exception:
            # Progress reporting is best-effort, so run without it. This is
            # deliberately not a bare ``except`` so that KeyboardInterrupt
            # and SystemExit still propagate.
            publisher = None

    def publish(stage: str, message: str, data: Dict = None):
        # No-op when no publisher could be created.
        if publisher:
            publisher.info(PROCESSOR_NAME, stage, message, data)

    publish("STORY_START", "开始生成故事层次结构")
    result = {
        "processor_name": PROCESSOR_NAME,
        "processor_version": PROCESSOR_VERSION,
        "contract_version": CONTRACT_VERSION,
        "model_name": MODEL_NAME,
        "model_version": MODEL_VERSION,
        "input_files": {
            "asr": asr_path,
            "cut": cut_path,
            "yolo": yolo_path,
            "ocr": ocr_path,
        },
        "output_path": output_path,
        "uuid": uuid,
        "timestamp": datetime.now().isoformat(),
        "parameters": {
            "parent_chunk_size": parent_chunk_size,
            "min_child_chunks": min_child_chunks,
            "max_child_chunks": max_child_chunks,
            "summary_length": summary_length,
            "model": model,
            "model_name": model_name,
            "temperature": temperature,
            "max_tokens": max_tokens,
            "timeout": timeout,
        },
        "success": False,
        "error": None,
        "parent_chunks": [],
        "chunk_statistics": {},
        "processing_time": 0,
        "resource_usage": {},
    }
    start_time = time.time()
    try:
        # Check timeout
        if timeout_manager.check_timeout():
            raise TimeoutError(f"超时 ({timeout} 秒)")
        # Check if should exit
        if signal_handler.should_stop():
            raise KeyboardInterrupt("收到停止信号")

        # Check input files
        publish("STORY_CHECK_FILES", "检查输入文件")
        input_files = {
            "asr": asr_path,
            "cut": cut_path,
            "yolo": yolo_path,
            "ocr": ocr_path,
        }
        file_checks = check_input_files(input_files)
        result["file_checks"] = file_checks
        # ASR is the only mandatory input
        if not file_checks.get("asr", {}).get("valid", False):
            raise ValueError("缺少有效的 ASR 数据文件")
        publish("STORY_FILES_VALID", "输入文件检查通过")

        # Load input data
        publish("STORY_LOAD_DATA", "加载输入数据")
        input_data = load_input_data(input_files)
        publish("STORY_DATA_LOADED", "数据加载完成")

        # Generate parent-child chunks
        publish("STORY_GENERATE_CHUNKS", "生成父-子块层次结构")
        parent_chunks = generate_parent_child_chunks(
            asr_data=input_data.get("asr"),
            cut_data=input_data.get("cut"),
            yolo_data=input_data.get("yolo"),
            ocr_data=input_data.get("ocr"),
            parent_chunk_size=parent_chunk_size,
            min_child_chunks=min_child_chunks,
            max_child_chunks=max_child_chunks,
            summary_length=summary_length,
            model=model,
            model_name=model_name,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        result["parent_chunks"] = parent_chunks
        result["parent_chunk_count"] = len(parent_chunks)

        # Calculate statistics
        total_child_chunks = sum(chunk.get("child_count", 0) for chunk in parent_chunks)
        # Number of parent chunks in which each child type appears
        child_types = {}
        for chunk in parent_chunks:
            for child_type in chunk.get("child_types", []):
                child_types[child_type] = child_types.get(child_type, 0) + 1
        result["chunk_statistics"] = {
            "total_parent_chunks": len(parent_chunks),
            "total_child_chunks": total_child_chunks,
            "avg_children_per_parent": total_child_chunks / len(parent_chunks)
            if parent_chunks
            else 0,
            "child_type_distribution": child_types,
        }
        result["success"] = True
        publish("STORY_COMPLETE", f"完成: {len(parent_chunks)} 个父块")
    except TimeoutError as e:
        result["error"] = f"处理超时: {e}"
        publish("STORY_TIMEOUT", f"超时: {e}")
    except KeyboardInterrupt:
        result["error"] = "处理被用户中断"
        publish("STORY_INTERRUPTED", "处理被中断")
    except ImportError as e:
        result["error"] = f"依赖缺失: {e}"
        publish("STORY_MISSING_DEPS", f"缺少依赖: {e}")
    except Exception as e:
        result["error"] = f"处理错误: {str(e)}"
        publish("STORY_ERROR", f"错误: {str(e)}")
        traceback.print_exc()

    # Calculate processing time
    processing_time = time.time() - start_time
    result["processing_time"] = processing_time

    # Add resource usage (psutil is optional)
    try:
        import psutil

        process = psutil.Process()
        memory_info = process.memory_info()
        result["resource_usage"] = {
            "cpu_percent": process.cpu_percent(),
            "memory_mb": memory_info.rss / (1024 * 1024),
            "user_time": process.cpu_times().user,
            "system_time": process.cpu_times().system,
        }
    except ImportError:
        result["resource_usage"] = {"error": "psutil not available"}

    # Save result. The payload contains non-ASCII text and is dumped with
    # ensure_ascii=False, so the encoding must be explicit rather than left
    # to the platform default.
    try:
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        publish("STORY_SAVED", f"结果保存到: {output_path}")
    except Exception as e:
        # Without the output file the run is not usable, so do not leave
        # "success" set to True.
        result["success"] = False
        result["error"] = f"保存结果失败: {str(e)}"
        publish("STORY_SAVE_ERROR", f"保存失败: {str(e)}")
    return result
def main():
    """Command-line entry point for the story processor.

    Parses the arguments, then either runs the environment health check
    (``--health-check``) or processes the given inputs with ``process_story``
    and prints a short result summary.

    ``--asr`` and ``--output`` are mandatory for normal processing but are
    validated only after the health-check branch, so ``--health-check`` can be
    used on its own.

    Returns:
        Process exit status: 0 on success (or a passing health check), 1 on
        failure. Missing mandatory arguments exit through ``parser.error``
        (status 2), as argparse does for required options.
    """
    parser = argparse.ArgumentParser(
        description=f"{PROCESSOR_NAME.upper()} Processor v{PROCESSOR_VERSION} - Parent-Child Chunk Generation"
    )
    parser.add_argument(
        "--asr", help="Path to ASR JSON file (required unless --health-check)"
    )
    parser.add_argument("--cut", help="Path to CUT JSON file", default="")
    parser.add_argument("--yolo", help="Path to YOLO JSON file", default="")
    parser.add_argument("--ocr", help="Path to OCR JSON file", default="")
    parser.add_argument(
        "--output", help="Path to output JSON file (required unless --health-check)"
    )
    parser.add_argument("--uuid", help="UUID for progress tracking", default="")
    parser.add_argument(
        "--parent-chunk-size",
        help=f"Maximum child chunks per parent (default: {DEFAULT_PARENT_CHUNK_SIZE})",
        type=int,
        default=DEFAULT_PARENT_CHUNK_SIZE,
    )
    parser.add_argument(
        "--min-child-chunks",
        help=f"Minimum child chunks to create parent (default: {DEFAULT_MIN_CHILD_CHUNKS})",
        type=int,
        default=DEFAULT_MIN_CHILD_CHUNKS,
    )
    parser.add_argument(
        "--max-child-chunks",
        help=f"Maximum child chunks per parent (default: {DEFAULT_MAX_CHILD_CHUNKS})",
        type=int,
        default=DEFAULT_MAX_CHILD_CHUNKS,
    )
    parser.add_argument(
        "--summary-length",
        help=f"Maximum summary length in characters (default: {DEFAULT_SUMMARY_LENGTH})",
        type=int,
        default=DEFAULT_SUMMARY_LENGTH,
    )
    parser.add_argument(
        "--model",
        help=f"Summary model to use (default: {DEFAULT_MODEL})",
        default=DEFAULT_MODEL,
        choices=["openai", "local", "template"],
    )
    parser.add_argument(
        "--model-name",
        help=f"Model name for OpenAI (default: {DEFAULT_MODEL_NAME})",
        default=DEFAULT_MODEL_NAME,
    )
    parser.add_argument(
        "--temperature",
        help=f"Temperature for generation (default: {DEFAULT_TEMPERATURE})",
        type=float,
        default=DEFAULT_TEMPERATURE,
    )
    parser.add_argument(
        "--max-tokens",
        help=f"Maximum tokens per summary (default: {DEFAULT_MAX_TOKENS})",
        type=int,
        default=DEFAULT_MAX_TOKENS,
    )
    parser.add_argument(
        "--timeout",
        help=f"Timeout in seconds (default: {DEFAULT_TIMEOUT})",
        type=int,
        default=DEFAULT_TIMEOUT,
    )
    parser.add_argument(
        "--health-check",
        help="Run health check and exit",
        action="store_true",
    )
    args = parser.parse_args()

    # Health check mode: needs no input/output paths.
    if args.health_check:
        health = check_environment()
        print(json.dumps(health, indent=2, ensure_ascii=False))
        return (
            0
            if all(c["status"] in ["available", "optional"] for c in health["checks"])
            else 1
        )

    # Normal processing mode: enforce the mandatory paths here rather than
    # with required=True, which would also reject a bare --health-check.
    missing = [
        flag
        for flag, value in (("--asr", args.asr), ("--output", args.output))
        if value is None
    ]
    if missing:
        parser.error(f"the following arguments are required: {', '.join(missing)}")

    result = process_story(
        asr_path=args.asr,
        cut_path=args.cut,
        yolo_path=args.yolo,
        ocr_path=args.ocr,
        output_path=args.output,
        uuid=args.uuid,
        parent_chunk_size=args.parent_chunk_size,
        min_child_chunks=args.min_child_chunks,
        max_child_chunks=args.max_child_chunks,
        summary_length=args.summary_length,
        model=args.model,
        model_name=args.model_name,
        temperature=args.temperature,
        max_tokens=args.max_tokens,
        timeout=args.timeout,
    )

    # Print result summary
    if result.get("success", False):
        print(f"{PROCESSOR_NAME.upper()} 处理成功")
        print(f" 父块数: {result.get('parent_chunk_count', 0)}")
        stats = result.get("chunk_statistics", {})
        print(f" 子块总数: {stats.get('total_child_chunks', 0)}")
        print(f" 平均子块/父块: {stats.get('avg_children_per_parent', 0):.1f}")
        print(f" 处理时间: {result.get('processing_time', 0):.1f}")
        print(f" 输出文件: {args.output}")
        return 0
    else:
        print(f"{PROCESSOR_NAME.upper()} 处理失败")
        print(f" 错误: {result.get('error', '未知错误')}")
        return 1


if __name__ == "__main__":
    sys.exit(main())

View File

@@ -1 +0,0 @@
../v1.1/scripts/story_processor_contract_v1_v1.11.py

View File

@@ -1 +0,0 @@
../v1.1/scripts/story_processor_v1.11.py

View File

@@ -1,121 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
Test Parent Chunk Summary Generation (Gemma 4)
"""
import json
import ollama
import time
# Configuration
# Identifier of the processed video; it selects the ASR file below and is
# echoed in the start-up banner printed by main().
UUID = "384b0ff44aaaa1f1"
# ASR transcript read by load_sample(); the path is relative to the current
# working directory.
ASR_PATH = f"output/{UUID}/{UUID}.asr.json"
# Model name passed to ollama.chat() in run_test().
MODEL = "gemma4:latest"
# The Prompt Template
# Filled with str.format(context=...) in run_test(); "{context}" is the only
# placeholder, so the rest of the text must not contain literal braces.
PARENT_SUMMARY_PROMPT = """
You are an expert film analyst. Analyze the following movie dialogue segment (approx 60 seconds).
Your task is to generate a structured JSON summary containing:
1. **narrative_summary**: A one-sentence summary of the main event/plot point.
2. **entities**: Key information extracted:
- `who`: List of characters involved.
- `where`: Inferred location (e.g., "Apartment", "Train").
- `objects`: Key props mentioned (e.g., "Ticket", "Money").
3. **emotional_arc**: The emotional transition:
- `start_mood`: Mood at the beginning.
- `end_mood`: Mood at the end.
4. **plot_sequence**:
- `scene_type`: Type of scene (e.g., "Confrontation", "Romance", "Discovery").
- `key_action`: The main action taking place.
**IMPORTANT RULES:**
- Output **ONLY** valid JSON.
- Do NOT include "Thinking Process" or markdown formatting.
- If information is unknown, use "Unknown".
- Context: This is from the movie "Charade" (1963).
Dialogue:
{context}
"""
def load_sample(start_index, count=20):
    """Load a slice of dialogue to simulate a Parent Chunk.

    Reads the ASR JSON at ``ASR_PATH`` and joins the "text" of ``count``
    consecutive entries of its "segments" list, starting at ``start_index``.

    Args:
        start_index: Index of the first segment to include.
        count: Maximum number of segments to include.

    Returns:
        The segment texts joined by single spaces (possibly empty when the
        slice is out of range). On any failure the function does not raise;
        it returns a string of the form ``"Error: <details>"``.
    """
    try:
        # JSON is UTF-8 encoded; do not depend on the platform's default
        # text encoding when decoding the transcript.
        with open(ASR_PATH, "r", encoding="utf-8") as f:
            data = json.load(f)
        segments = data.get("segments", [])
        selected = segments[start_index : start_index + count]
        text = " ".join(s.get("text", "") for s in selected)
        print(f"📂 Loaded Sample {start_index}: {len(selected)} segments.")
        return text
    except Exception as e:
        # Best-effort helper for a demo script: report the problem as text.
        return f"Error: {e}"
def run_test(name, context_text):
    """Run one summary-generation test against the configured Ollama model.

    Formats ``PARENT_SUMMARY_PROMPT`` with ``context_text``, sends it to
    ``ollama.chat``, strips a json code fence or a "Thinking..." preamble from
    the reply, and tries to parse what remains as JSON. Progress and results
    are printed to stdout.

    Args:
        name: Label printed in the test banner.
        context_text: Dialogue text substituted into the prompt.

    Returns:
        True when the reply parsed as JSON; False when parsing failed or any
        exception occurred while calling the model.
    """
    print(f"\n🧪 Testing: {name}")
    print("-" * 50)
    print(f"📖 Input Preview: {context_text[:100]}...")
    prompt = PARENT_SUMMARY_PROMPT.format(context=context_text)
    try:
        started = time.time()
        reply = ollama.chat(
            model=MODEL, messages=[{"role": "user", "content": prompt}]
        )
        elapsed = time.time() - started
        body = reply["message"]["content"]

        # Clean up thinking tags if present
        if "```json" in body:
            # Keep only the text between the opening fence and the next fence.
            body = body.partition("```json")[2].partition("```")[0]
        elif "Thinking..." in body:
            # crude cleanup for demo
            body = body.split("...")[-1]

        # Attempt parse
        try:
            parsed = json.loads(body.strip())
        except json.JSONDecodeError:
            print(f"⚠️ JSON Parse Failed ({elapsed:.2f}s)")
            print(body[:500])
            return False

        print(f"✅ Success ({elapsed:.2f}s)")
        print(json.dumps(parsed, indent=2))
        return True
    except Exception as e:
        print(f"❌ API Error: {e}")
        return False
def main():
    """Run the three canned parent-chunk summary tests in order.

    Each case loads a dialogue sample starting at a fixed segment index and
    passes it to ``run_test``; a 2-second pause separates consecutive cases.
    """
    print(f"🚀 Starting Parent Chunk Summary Tests on '{UUID}'")
    # (test label, first segment index of the sample)
    cases = (
        # Early dialogue: entities & narrative focus
        ("Test 1: Early Plot (Entities & Narrative)", 10),
        # Middle conflict: emotional arc focus
        ("Test 2: Conflict (Emotional Arc)", 50),
        # Later section, picked to test robustness: plot sequence focus
        ("Test 3: Late Plot (Sequence)", 150),
    )
    for position, (label, first_segment) in enumerate(cases):
        if position:
            time.sleep(2)  # Cool down
        sample_text = load_sample(start_index=first_segment)
        run_test(label, sample_text)


if __name__ == "__main__":
    main()

View File

@@ -1 +0,0 @@
../v1.1/scripts/test_parent_chunk_generation_v1.11.py