# momentry_core/scripts/test_v2_with_text.py

#!/opt/homebrew/bin/python3.11
"""
Vector Search Test with nomic-embed-text:v1.5 using prefixes - with text content
"""

import time
import requests
import psycopg2


# Embedding model name sent with every embeddings request, and the Qdrant
# collection that the search requests are issued against.
MODEL = "nomic-embed-text:v1.5"
QDRANT_COLLECTION = "chunks_v2"

# Video identifier. NOTE(review): not referenced by any function visible in
# this script — confirm whether it is still needed.
VIDEO_UUID = "39567a0eb16f39fd"

# Keyword arguments unpacked into psycopg2.connect().
# NOTE(review): credentials are hard-coded here; acceptable only for a local
# test rig — confirm this file is never pointed at a shared database.
POSTGRES_CONFIG = {
    "host": "localhost",
    "port": 5432,
    "user": "accusys",
    "password": "Test3200",
    "database": "momentry",
}


def get_embedding(text, prefix="", timeout=60):
    """Return the embedding for ``prefix + text`` from the local embeddings endpoint.

    Args:
        text: Text to embed.
        prefix: String prepended verbatim to ``text`` before it is sent
            (this script passes ``"search_query: "`` for queries).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The value of the ``"embedding"`` field of the JSON response.

    Raises:
        requests.Timeout: If the server does not answer within ``timeout``.
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        KeyError: If the response body has no ``"embedding"`` field.
    """
    resp = requests.post(
        "http://localhost:11434/api/embeddings",
        json={"model": MODEL, "prompt": f"{prefix}{text}"},
        # Without a timeout, requests waits forever on an unresponsive server.
        timeout=timeout,
    )
    # Report HTTP failures as such instead of as a confusing KeyError below.
    resp.raise_for_status()
    return resp.json()["embedding"]


def get_text_from_chunk_id(chunk_id):
    """Get text from PostgreSQL using chunk_id.

    Opens a new connection with ``POSTGRES_CONFIG``, reads
    ``content->>'text'`` from the ``chunks`` row whose ``chunk_id`` matches,
    and closes the connection again.

    Args:
        chunk_id: Value matched against ``chunks.chunk_id``.

    Returns:
        The stored text, or ``""`` when no row matches or the text is NULL.
    """
    conn = psycopg2.connect(**POSTGRES_CONFIG)
    try:
        # The cursor context manager closes the cursor even if the query fails.
        with conn.cursor() as cur:
            cur.execute(
                "SELECT content->>'text' FROM chunks WHERE chunk_id = %s",
                (chunk_id,),
            )
            row = cur.fetchone()
    finally:
        # Always release the connection, including on query errors.
        conn.close()

    if row is None:
        return ""
    # A row whose JSON has no 'text' key yields SQL NULL -> None; callers
    # slice the result, so normalise that to an empty string.
    return row[0] or ""


def test_queries(queries, use_prefix=True):
    """Test queries against Qdrant.

    For each query: embed it, search ``QDRANT_COLLECTION`` for the top 3
    matches, and print the score plus the first 50 characters of each hit's
    text. The text is looked up in PostgreSQL when the hit's payload carries
    a ``chunk_id``; otherwise the payload's own ``text`` field is used.

    Args:
        queries: Iterable of query strings.
        use_prefix: When true, ``"search_query: "`` is prepended to each
            query before embedding.
    """
    prefix = "search_query: " if use_prefix else ""

    for query in queries:
        embedding = get_embedding(query, prefix)

        # perf_counter is monotonic, so the measured latency cannot be
        # distorted by wall-clock adjustments.
        start = time.perf_counter()
        resp = requests.post(
            f"http://localhost:6333/collections/{QDRANT_COLLECTION}/points/search",
            headers={"api-key": "Test3200Test3200Test3200"},
            # Qdrant omits payloads unless asked; without with_payload there
            # is neither a chunk_id nor a text to display.
            json={"vector": embedding, "limit": 3, "with_payload": True},
            timeout=30,
        )
        elapsed = (time.perf_counter() - start) * 1000

        print(f"\nQuery: '{query}' ({elapsed:.1f}ms)")
        print("-" * 60)

        if not resp.ok:
            # Make a failed search visible instead of printing an empty
            # result list, but keep going with the remaining queries.
            print(f"  ! search failed: HTTP {resp.status_code} {resp.text[:200]}")
            continue

        results = resp.json().get("result", [])
        for rank, hit in enumerate(results, start=1):
            score = hit.get("score", 0)
            # The payload key can be present with a null value.
            payload = hit.get("payload") or {}
            chunk_id = payload.get("chunk_id", "")
            if chunk_id:
                # Get text from PostgreSQL
                text = get_text_from_chunk_id(chunk_id)
            else:
                # Fall back to text stored in the Qdrant payload
                text = payload.get("text", "")
            # Either source may yield None; normalise before slicing.
            text = (text or "")[:50]
            print(f"  {rank}. [{score:.3f}] {text}...")


# English queries: free-text search phrases passed to test_queries() by the
# __main__ block; each one is embedded and searched independently.
ENGLISH_QUERIES = [
    "a person talking",
    "someone speaking on camera",
    "outdoor scene",
    "indoor setting",
    "walking or moving",
    "dialogue or conversation",
    "looking at something",
    "happy or joyful",
    "serious or dramatic",
    "comedy or funny",
    "wearing a tie",
    "holding an object",
    "sitting on a chair",
    "city or urban",
    "building or room",
    "open space",
]

# Chinese queries: Chinese-language search phrases passed to test_queries()
# by the __main__ block, run against the same collection as the English ones.
CHINESE_QUERIES = [
    "有人在說話",
    "戶外場景",
    "室內場景",
    "走路或移動",
    "對話或交談",
    "看著某樣東西",
    "快樂或開心",
    "嚴肅或戲劇性",
    "喜劇或有趣",
    "戴著領帶",
    "拿著東西",
    "坐在椅子上",
    "城市或都市",
    "建築物或房間",
    "開放空間",
]


if __name__ == "__main__":
    # Horizontal rule reused for every banner.
    rule = "=" * 70

    # Run header: which collection/model is exercised and which prefixes apply.
    print(rule)
    print(f"Testing with {QDRANT_COLLECTION}")
    print(f"Model: {MODEL}")
    print("Prefix for chunks: search_document:")
    print("Prefix for queries: search_query:")
    print(rule)

    # One banner-delimited section per query set, in the original order.
    sections = (
        ("ENGLISH QUERIES", ENGLISH_QUERIES),
        ("CHINESE QUERIES", CHINESE_QUERIES),
    )
    for section_title, section_queries in sections:
        print("\n" + rule)
        print(section_title)
        print(rule)
        test_queries(section_queries)