# momentry_core/scripts/test_multilingual.py

#!/opt/homebrew/bin/python3.11
"""
Multilingual Vector Search Test with nomic-embed-text-v2-moe
"""

import time
import requests
import psycopg2
import uuid


# Value bound to the `uuid = %s` filter of the SELECT in sync_to_qdrant().
VIDEO_UUID = "39567a0eb16f39fd"

# Keyword arguments unpacked into psycopg2.connect().
# NOTE(review): credentials are hard-coded in source — consider loading them
# from the environment before this script is shared more widely.
POSTGRES_CONFIG = {
    "host": "localhost",
    "port": 5432,
    "user": "accusys",
    "password": "Test3200",
    "database": "momentry",
}

# Model name sent in the "model" field of every embedding request.
MODEL = "nomic-embed-text-v2-moe"
# Qdrant collection that points are uploaded to and searched in.
QDRANT_COLLECTION = "chunks_v3"


def get_embedding(text, prefix=""):
    """Return the embedding for ``prefix + text`` from the local embeddings endpoint.

    Args:
        text: Text to embed.
        prefix: String prepended verbatim to ``text`` before it is sent as the
            prompt (callers in this file pass ``"search_document: "`` or
            ``"search_query: "``).

    Returns:
        The value of the ``"embedding"`` field of the JSON response.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.Timeout: If the endpoint does not respond within the timeout.
        KeyError: If a successful response carries no ``"embedding"`` field.
    """
    resp = requests.post(
        "http://localhost:11434/api/embeddings",
        json={"model": MODEL, "prompt": f"{prefix}{text}"},
        # Without a timeout, requests waits forever on an unresponsive server.
        timeout=120,
    )
    # Surface the HTTP status instead of an opaque KeyError on the line below.
    resp.raise_for_status()
    return resp.json()["embedding"]


def sync_to_qdrant():
    """Embed the sentence chunks of ``VIDEO_UUID`` and upload them to Qdrant.

    Reads every ``chunk_type = 'sentence'`` row for ``VIDEO_UUID`` from the
    ``chunks`` table (ordered by ``chunk_index``), embeds each non-empty text
    with the ``"search_document: "`` prefix, and PUTs the resulting points to
    the ``QDRANT_COLLECTION`` collection in batches of 100.

    Progress is printed to stdout. If a batch upload does not return HTTP 200,
    the error body (first 200 characters) is printed and the remaining batches
    are skipped; ``"Done!"`` is printed only when every batch succeeded.

    Raises:
        psycopg2.Error: If connecting to or querying Postgres fails.
        requests.RequestException: If an embedding or upload request fails at
            the transport level or times out.
    """
    conn = psycopg2.connect(**POSTGRES_CONFIG)
    try:
        with conn.cursor() as cur:
            cur.execute(
                """
                SELECT chunk_id, content->>'text' as text, start_time, end_time, uuid
                FROM chunks
                WHERE uuid = %s AND chunk_type = 'sentence'
                ORDER BY chunk_index
                """,
                (VIDEO_UUID,),
            )
            rows = cur.fetchall()
    finally:
        # All rows are in memory now; release the connection before the slow
        # embedding/upload phase, and also when the query itself fails.
        conn.close()

    print(f"Syncing {len(rows)} chunks to Qdrant with {MODEL}")

    points = []
    for chunk_id, text, start_time, end_time, vid in rows:
        if not text:
            # Nothing to embed for NULL/empty text.
            continue

        # Use search_document: prefix for chunks
        embedding = get_embedding(text, "search_document: ")

        # Name-based UUID: the same chunk_id always maps to the same point id.
        point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk_id))

        payload = {
            "uuid": vid,
            "chunk_id": chunk_id,
            "chunk_type": "sentence",
            "start_time": float(start_time),
            "end_time": float(end_time),
            # Only the first 200 characters are stored alongside the vector.
            "text": text[:200],
        }

        points.append({"id": point_id, "vector": embedding, "payload": payload})

    # Upload in batches
    batch_size = 100
    total_batches = (len(points) - 1) // batch_size + 1
    for batch_no, offset in enumerate(range(0, len(points), batch_size), start=1):
        batch = points[offset : offset + batch_size]
        resp = requests.put(
            f"http://localhost:6333/collections/{QDRANT_COLLECTION}/points",
            headers={
                "api-key": "Test3200Test3200Test3200",
                "Content-Type": "application/json",
            },
            json={"points": batch},
            timeout=60,
        )
        if resp.status_code != 200:
            print(f"Error: {resp.text[:200]}")
            break
        print(f"Uploaded batch {batch_no}/{total_batches}")
    else:
        # Reached only when the loop was not broken by an upload error.
        print("Done!")


def test_queries(queries, use_prefix=True):
    """Run each query against Qdrant and print the top three hits.

    For every query the text is embedded via ``get_embedding`` and sent to the
    search endpoint of the ``QDRANT_COLLECTION`` collection. The elapsed time
    printed covers the search request only, not the embedding call.

    Args:
        queries: Iterable of query strings.
        use_prefix: When true, ``"search_query: "`` is prepended to each query
            before embedding; otherwise the query is embedded as-is.

    Raises:
        requests.RequestException: If an embedding or search request fails at
            the transport level or times out.

    NOTE(review): the ``test_`` name prefix may cause pytest to try collecting
    this helper if the file is ever picked up as a test module — confirm.
    """
    prefix = "search_query: " if use_prefix else ""

    for query in queries:
        embedding = get_embedding(query, prefix)

        # perf_counter is monotonic, so the interval cannot be skewed by
        # wall-clock adjustments.
        start = time.perf_counter()
        resp = requests.post(
            f"http://localhost:6333/collections/{QDRANT_COLLECTION}/points/search",
            headers={
                "api-key": "Test3200Test3200Test3200",
                "Content-Type": "application/json",
            },
            json={"vector": embedding, "limit": 3, "with_payload": True},
            timeout=30,
        )
        elapsed = (time.perf_counter() - start) * 1000

        print(f"\nQuery: '{query}' ({elapsed:.1f}ms)")
        print("-" * 60)

        if resp.status_code != 200:
            # Report the failure instead of presenting it as "no results".
            print(f"  Error: {resp.text[:200]}")
            continue

        results = resp.json().get("result", [])
        for rank, hit in enumerate(results, start=1):
            score = hit.get("score", 0)
            # Tolerate a missing or null payload/text rather than crashing.
            payload = hit.get("payload") or {}
            text = (payload.get("text") or "")[:60]
            print(f"  {rank}. [{score:.3f}] {text}")


# English queries passed to test_queries() by the script entry point.
ENGLISH_QUERIES = [
    "a person talking",
    "someone speaking on camera",
    "outdoor scene",
    "indoor setting",
    "walking or moving",
    "dialogue or conversation",
    "looking at something",
    "happy or joyful",
    "serious or dramatic",
    "comedy or funny",
    "wearing a tie",
    "holding an object",
    "sitting on a chair",
    "city or urban",
    "building or room",
    "open space",
]

# Chinese (Traditional script) queries passed to test_queries() by the script
# entry point.
# NOTE(review): 15 entries here vs. 16 in ENGLISH_QUERIES — there appears to be
# no counterpart for "someone speaking on camera"; confirm whether intended.
CHINESE_QUERIES = [
    "有人在說話",
    "戶外場景",
    "室內場景",
    "走路或移動",
    "對話或交談",
    "看著某樣東西",
    "快樂或開心",
    "嚴肅或戲劇性",
    "喜劇或有趣",
    "戴著領帶",
    "拿著東西",
    "坐在椅子上",
    "城市或都市",
    "建築物或房間",
    "開放空間",
]


if __name__ == "__main__":
    import sys

    # Horizontal rule reused for every banner below.
    rule = "=" * 60

    # "sync" as the first CLI argument selects upload mode; anything else
    # (including no argument) runs the query tests.
    sync_requested = sys.argv[1:2] == ["sync"]

    if sync_requested:
        banner_lines = (
            f"Syncing vectors to {QDRANT_COLLECTION}",
            f"Model: {MODEL}",
            "Prefix for chunks: search_document:",
        )
    else:
        banner_lines = (
            f"Testing with {QDRANT_COLLECTION}",
            f"Model: {MODEL}",
            "Prefix for queries: search_query:",
        )

    print(rule)
    for banner_line in banner_lines:
        print(banner_line)
    print(rule)

    if sync_requested:
        sync_to_qdrant()
    else:
        # One titled section per language, in this fixed order.
        sections = (
            ("ENGLISH QUERIES", ENGLISH_QUERIES),
            ("CHINESE QUERIES", CHINESE_QUERIES),
        )
        for title, section_queries in sections:
            print("\n" + rule)
            print(title)
            print(rule)
            test_queries(section_queries)