feat: tmdb_agent now inserts identities and file_identities to DB

- tmdb_agent.py: INSERT identities with status='pending'
- tmdb_agent.py: INSERT file_identities (file_uuid → identity_id)
- identity.json: file_bindings includes file_uuid, movie_id, character
- backfill_file_identities.py: migrate existing TMDb identities
- Tested: 27 Charade cast identities linked to file
2026-06-26 13:39:08 +08:00
parent 6cbc11efda
commit 67caf09732
2 changed files with 234 additions and 2 deletions
@@ -0,0 +1,147 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Backfill file_identities table for existing TMDb identities.
+
+For each TMDb identity with tmdb_movie_id in metadata:
+1. Find matching file by movie name
+2. INSERT into file_identities (file_uuid, identity_id)
+
+Usage:
+    python3 scripts/backfill_file_identities.py --schema public
+    python3 scripts/backfill_file_identities.py --schema dev
+"""
+
+import argparse
+import os
+import psycopg2
+import psycopg2.extras
+import re
+from pathlib import Path
+
+
+# Audio channel layouts ("5.1", "7.1"). These must be removed while the dots
+# are still present: once '.' has been normalised to ' ' a literal "5.1"
+# can no longer be recognised.
+_CHANNEL_LAYOUT_RE = re.compile(r'\b[57]\.1\b')
+
+# Release / encoding tags that are dropped wherever they occur in the
+# normalised name. Compiled once at import time instead of on every call.
+_RELEASE_TAG_RE = re.compile(
+    r'\b(?:' + '|'.join((
+        r'\d{3,4}p', r'\d{3,4}x\d{3,4}', r'\d+fps', r'bluray', r'web[ -]?dl',
+        r'webrip', r'hdrip', r'dvdrip', r'dvd', r'brrip', r'hdtv', r'xvid',
+        r'x264', r'h264', r'x265', r'h265', r'hevc', r'aac', r'mp3', r'ac3',
+        r'dts', r'dual[ -]?audio', r'multi[ -]?sub',
+        r'proper', r'repack', r'extended', r'unrated', r'directors[ -]?cut',
+        r'theatrical', r'internal', r'limited', r'complete', r'full[ -]?movie',
+        r'english', r'french', r'spanish', r'german', r'chinese',
+        r'youtube', r'yify', r'ettv', r'rarbg', r'tgx', r'axxo', r'ctrlhd',
+    )) + r')\b',
+    flags=re.IGNORECASE,
+)
+
+
+def extract_movie_name(filename: str) -> str | None:
+    """Derive a human-readable movie name from a video file name.
+
+    Steps, in order:
+
+    1. Take the file stem (directory and final extension removed).
+    2. Remove audio channel layouts such as ``5.1`` while the dots are intact.
+    3. Replace ``.`` and ``_`` with spaces.
+    4. Cut the name at the first ``|``, ``(``, ``[``, ``{`` or ``│`` that is
+       not the very first character.
+    5. Remove known release / encoding tags (case-insensitive, whole words).
+    6. Collapse runs of whitespace.
+
+    Args:
+        filename: File name or path of the video file.
+
+    Returns:
+        The cleaned name, or ``None`` when fewer than three characters remain.
+
+    NOTE(review): tags are removed anywhere in the name, so titles that
+    contain a listed word (e.g. "The English Patient") lose that word.
+    """
+    name = Path(filename).stem
+    name = _CHANNEL_LAYOUT_RE.sub(' ', name)
+    cleaned = re.sub(r'[._]', ' ', name).strip()
+    for sep in ('|', '(', '[', '{', '\u2502'):
+        idx = cleaned.find(sep)
+        # idx == 0 is deliberately ignored: cutting there would leave nothing.
+        if idx > 0:
+            cleaned = cleaned[:idx].strip()
+    cleaned = _RELEASE_TAG_RE.sub('', cleaned)
+    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
+    return cleaned if len(cleaned) >= 3 else None
+
+
+def _qualify(schema: str, table: str) -> str:
+    """Return ``table`` prefixed with ``schema``; ``public`` stays unqualified."""
+    return table if schema == "public" else f"{schema}.{table}"
+
+
+def _parse_cast_order(raw) -> int | None:
+    """Convert a cast-order value to ``int``; ``None`` if missing or non-numeric."""
+    if raw is None or raw == "":
+        return None
+    try:
+        return int(raw)
+    except (TypeError, ValueError):
+        return None
+
+
+def _contains_whole(needle: str, haystack: str) -> bool:
+    """Return True if ``needle`` occurs in ``haystack`` delimited by non-word chars."""
+    # Lookarounds rather than \b so that names starting or ending with
+    # punctuation are still delimited correctly.
+    pattern = r"(?<!\w)" + re.escape(needle) + r"(?!\w)"
+    return re.search(pattern, haystack) is not None
+
+
+def _find_file_uuid(title: str, file_by_movie: dict):
+    """Return the file_uuid whose extracted movie name best matches ``title``.
+
+    An exact (case-insensitive) match wins. Otherwise a file qualifies when
+    its name contains the title, or the title contains its name, as whole
+    words; among those the name closest in length to the title is chosen
+    (ties broken alphabetically so the result does not depend on row order).
+    Returns ``None`` when nothing qualifies or the title is blank.
+    """
+    key = title.lower().strip()
+    if not key:
+        # An empty needle would be "contained" in every file name.
+        return None
+    exact = file_by_movie.get(key)
+    if exact:
+        return exact
+    candidates = [
+        (abs(len(name) - len(key)), name)
+        for name in file_by_movie
+        if _contains_whole(key, name) or _contains_whole(name, key)
+    ]
+    if not candidates:
+        return None
+    _, best_name = min(candidates)
+    return file_by_movie[best_name]
+
+
+def main():
+    """Link existing TMDb identities to video files via ``file_identities``.
+
+    Reads ``--schema`` and ``--db`` from the command line, loads every TMDb
+    identity that carries a ``tmdb_movie_id`` in its metadata, matches each
+    one to a row of the videos table by movie name, and inserts a
+    ``file_identities`` row for every pair that is not linked yet.
+
+    Each identity is processed inside its own SAVEPOINT: in PostgreSQL a
+    failed statement aborts the surrounding transaction, so without it one
+    bad row would make every later statement fail and nothing would be
+    committed. All successful inserts are committed together at the end.
+    """
+    parser = argparse.ArgumentParser(description="Backfill file_identities")
+    parser.add_argument("--schema", default="public", help="Database schema")
+    parser.add_argument("--db", default=os.getenv("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry"))
+    args = parser.parse_args()
+
+    schema = args.schema
+    # The schema name is spliced into SQL text, so only accept a plain
+    # unquoted identifier.
+    if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_$]*", schema):
+        parser.error(f"invalid schema name: {schema!r}")
+
+    identities_table = _qualify(schema, "identities")
+    file_identities_table = _qualify(schema, "file_identities")
+    videos_table = _qualify(schema, "videos")
+
+    matched = 0
+    inserted = 0
+
+    conn = psycopg2.connect(args.db)
+    try:
+        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
+            # Get TMDb identities with tmdb_movie_id
+            cur.execute(f"""
+                SELECT id, name, tmdb_id, metadata->>'tmdb_movie_id' as tmdb_movie_id,
+                       metadata->>'tmdb_movie_title' as tmdb_movie_title,
+                       metadata->>'tmdb_character' as tmdb_character,
+                       metadata->>'tmdb_cast_order' as tmdb_cast_order
+                FROM {identities_table}
+                WHERE source = 'tmdb' AND tmdb_id IS NOT NULL
+                  AND metadata->>'tmdb_movie_id' IS NOT NULL
+            """)
+            identities = cur.fetchall()
+
+            print(f"[Backfill] Found {len(identities)} TMDb identities with movie_id")
+
+            # Get all files
+            cur.execute(f"SELECT file_uuid, file_name FROM {videos_table}")
+            files = cur.fetchall()
+            print(f"[Backfill] Found {len(files)} files")
+
+            # Build file lookup by (lower-cased) movie name. If several
+            # files reduce to the same name, the last one read wins.
+            file_by_movie = {}
+            for f in files:
+                movie_name = extract_movie_name(f["file_name"])
+                if movie_name:
+                    file_by_movie[movie_name.lower()] = f["file_uuid"]
+
+            for identity in identities:
+                tmdb_movie_title = identity.get("tmdb_movie_title")
+                if not tmdb_movie_title:
+                    continue
+
+                file_uuid = _find_file_uuid(tmdb_movie_title, file_by_movie)
+                if not file_uuid:
+                    continue
+                matched += 1
+
+                cur.execute("SAVEPOINT backfill_row")
+                try:
+                    # Check if already exists
+                    cur.execute(f"""
+                        SELECT 1 FROM {file_identities_table}
+                        WHERE file_uuid = %s AND identity_id = %s
+                    """, (file_uuid, identity["id"]))
+                    if cur.fetchone() is None:
+                        cur.execute(f"""
+                            INSERT INTO {file_identities_table} (
+                                file_uuid, identity_id, confidence, metadata
+                            ) VALUES (%s, %s, %s, %s)
+                        """, (
+                            file_uuid,
+                            identity["id"],
+                            1.0,
+                            psycopg2.extras.Json({
+                                "source": "tmdb_backfill",
+                                "tmdb_movie_id": identity.get("tmdb_movie_id"),
+                                "tmdb_movie_title": tmdb_movie_title,
+                                "character": identity.get("tmdb_character"),
+                                "cast_order": _parse_cast_order(identity.get("tmdb_cast_order")),
+                            }),
+                        ))
+                        inserted += 1
+                except psycopg2.Error as e:
+                    # Undo only this row so the remaining rows can proceed.
+                    cur.execute("ROLLBACK TO SAVEPOINT backfill_row")
+                    print(f"  [WARN] Failed for {identity['name']}: {e}")
+                else:
+                    cur.execute("RELEASE SAVEPOINT backfill_row")
+
+        conn.commit()
+    finally:
+        # Closing without a commit discards any uncommitted work.
+        conn.close()
+
+    print(f"[Backfill] Matched: {matched}/{len(identities)}")
+    print(f"[Backfill] Inserted: {inserted} new file_identities")
+
+
+# Run the backfill only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()