# momentry_core/scripts/sync_to_mongodb.py

#!/opt/homebrew/bin/python3.11
"""
Sync chunks from PostgreSQL to MongoDB
"""

import os

import psycopg2
from pymongo import MongoClient


# Every setting below can be overridden through a MOMENTRY_* environment
# variable; the fallback values are the ones this script has always used, so
# running it with no environment configured behaves exactly as before.

# UUID of the video whose chunks are synced and searched.
VIDEO_UUID = os.environ.get("MOMENTRY_VIDEO_UUID", "39567a0eb16f39fd")

# Keyword arguments passed straight to psycopg2.connect().
# NOTE(review): the fallback credentials are kept only for backward
# compatibility; prefer supplying MOMENTRY_PG_USER / MOMENTRY_PG_PASSWORD from
# the environment so secrets do not have to live in source control.
POSTGRES_CONFIG = {
    "host": os.environ.get("MOMENTRY_PG_HOST", "localhost"),
    "port": int(os.environ.get("MOMENTRY_PG_PORT", "5432")),
    "user": os.environ.get("MOMENTRY_PG_USER", "accusys"),
    "password": os.environ.get("MOMENTRY_PG_PASSWORD", "Test3200"),
    "database": os.environ.get("MOMENTRY_PG_DATABASE", "momentry"),
}

# MongoDB target: connection URI, database name and collection name.
MONGO_URI = os.environ.get("MOMENTRY_MONGO_URI", "mongodb://localhost:27017")
MONGO_DB = os.environ.get("MOMENTRY_MONGO_DB", "momentry")
MONGO_COLLECTION = os.environ.get("MOMENTRY_MONGO_COLLECTION", "chunks")


# Columns copied from the PostgreSQL ``chunks`` table. The same tuple drives
# both the SELECT list and the MongoDB document keys, so the two cannot drift.
_CHUNK_COLUMNS = (
    "uuid",
    "chunk_id",
    "chunk_index",
    "chunk_type",
    "start_time",
    "end_time",
    "fps",
    "start_frame",
    "end_frame",
    "content",
    "metadata",
    "vector_id",
)

# Column names are fixed constants above (never user input); the video UUID is
# still passed as a bound parameter.
_CHUNK_QUERY = (
    f"SELECT {', '.join(_CHUNK_COLUMNS)} "
    "FROM chunks "
    "WHERE uuid = %s AND chunk_type = 'sentence' "
    "ORDER BY chunk_index"
)


def _fetch_sentence_chunks(video_uuid):
    """Return the 'sentence' chunk rows for *video_uuid*, ordered by chunk_index."""
    pg_conn = psycopg2.connect(**POSTGRES_CONFIG)
    try:
        with pg_conn.cursor() as pg_cur:
            pg_cur.execute(_CHUNK_QUERY, (video_uuid,))
            return pg_cur.fetchall()
    finally:
        # ``with connection`` in psycopg2 only ends the transaction, it does
        # not close the connection, so close it explicitly.
        pg_conn.close()


def sync_to_mongodb(video_uuid=None):
    """Copy one video's 'sentence' chunks from PostgreSQL into MongoDB.

    Reads the rows from the PostgreSQL ``chunks`` table, replaces the
    matching documents in the MongoDB collection (delete, then insert), and
    ensures the text index used for searching exists. Progress is reported
    with ``print``.

    Args:
        video_uuid: UUID of the video to sync. Defaults to the module-level
            ``VIDEO_UUID`` when omitted or ``None``.

    Returns:
        None.

    Raises:
        Whatever psycopg2 or pymongo raise on connection or query failure;
        both connections are closed before the exception propagates.
    """
    if video_uuid is None:
        video_uuid = VIDEO_UUID

    # The PostgreSQL connection is opened and closed inside the helper, so it
    # is not held open while MongoDB is being written.
    rows = _fetch_sentence_chunks(video_uuid)
    print(f"Found {len(rows)} chunks in PostgreSQL")

    # NOTE(review): values are passed through as psycopg2 returns them; if any
    # column is NUMERIC (Decimal) pymongo may refuse to encode it - confirm
    # against the table schema.
    documents = [dict(zip(_CHUNK_COLUMNS, row)) for row in rows]

    # MongoClient is a context manager: the client is closed even on error.
    with MongoClient(MONGO_URI) as mongo_client:
        mongo_collection = mongo_client[MONGO_DB][MONGO_COLLECTION]

        # Replace (not upsert): only when there is something to insert are the
        # existing documents for this video removed, so an empty PostgreSQL
        # result never wipes what is already in MongoDB.
        if documents:
            mongo_collection.delete_many(
                {"uuid": video_uuid, "chunk_type": "sentence"}
            )
            result = mongo_collection.insert_many(documents)
            print(f"Inserted {len(result.inserted_ids)} chunks into MongoDB")

        # Create text index for search
        mongo_collection.create_index([("content", "text"), ("chunk_type", 1)])
        print("Created text index")

    print("Done!")


def test_mongodb_text_search(queries=None, video_uuid=None, limit=10):
    """Time a few MongoDB ``$text`` searches over one video's sentence chunks.

    Each query is run against the configured collection, restricted to the
    given video and to ``chunk_type == "sentence"``, and its latency and hit
    count are printed.

    Args:
        queries: Iterable of search strings. Defaults to
            ``["Paris", "Audrey Hepburn", "Cary Grant"]``.
        video_uuid: UUID of the video to search. Defaults to the module-level
            ``VIDEO_UUID`` when omitted or ``None``.
        limit: Maximum number of documents fetched per query (default 10).

    Returns:
        dict mapping each query string to ``{"ms": <elapsed milliseconds,
        rounded to 2 decimals>, "rows": <number of documents returned>}``.
    """
    import time

    if queries is None:
        queries = ["Paris", "Audrey Hepburn", "Cary Grant"]
    if video_uuid is None:
        video_uuid = VIDEO_UUID

    results = {}

    # The context manager closes the client even if a query raises.
    with MongoClient(MONGO_URI) as mongo_client:
        mongo_collection = mongo_client[MONGO_DB][MONGO_COLLECTION]

        for query in queries:
            # perf_counter is monotonic and high-resolution, unlike time.time,
            # which can jump if the system clock is adjusted.
            start = time.perf_counter()
            cursor = mongo_collection.find(
                {
                    "uuid": video_uuid,
                    "chunk_type": "sentence",
                    "$text": {"$search": query},
                }
            ).limit(limit)

            # Materialise inside the timed region: the cursor is lazy, so the
            # query only actually runs when it is iterated.
            rows = list(cursor)
            elapsed = (time.perf_counter() - start) * 1000

            results[query] = {"ms": round(elapsed, 2), "rows": len(rows)}
            print(f"MongoDB text '{query}': {elapsed:.2f}ms, {len(rows)} rows")

    return results


if __name__ == "__main__":
    # Script entry point: run the sync first, then the timed text-search
    # queries against the same MongoDB collection.
    sync_to_mongodb()
    print("\nTesting MongoDB text search:")
    test_mongodb_text_search()