Files
momentry_core/scripts/comprehensive_search_test.py
accusys 383201cacd feat: Initial v0.9 release with API Key authentication
## v0.9.20260325_144654

### Features
- API Key Authentication System
- Job Worker System
- V2 Backup Versioning

### Bug Fixes
- get_processor_results_by_job column mapping

Co-authored-by: OpenCode
2026-03-25 14:53:41 +08:00

317 lines
9.2 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
Comprehensive search comparison: Text, Vector (PostgreSQL & Qdrant), Object, and MongoDB search
"""
import time
import requests
import psycopg2
from pymongo import MongoClient
VIDEO_UUID = "39567a0eb16f39fd"
POSTGRES_CONFIG = {
"host": "localhost",
"port": 5432,
"user": "accusys",
"password": "Test3200",
"database": "momentry",
}
MONGO_URI = "mongodb://localhost:27017"
MONGO_DB = "momentry"
MONGO_COLLECTION = "chunks"
TEST_QUERIES = [
("text", "Paris"),
("text", " Audrey Hepburn"),
("text", "Cary Grant"),
("vector", "Paris"),
("vector", " Audrey Hepburn"),
("vector", "Cary Grant"),
("object", "person"),
("object", "car"),
("object", "clock"),
("object", "tie"),
]
def test_text_search():
"""Test PostgreSQL text search"""
results = {}
conn = psycopg2.connect(**POSTGRES_CONFIG)
cur = conn.cursor()
for query in ["Paris", " Audrey Hepburn", "Cary Grant"]:
start = time.time()
cur.execute(
"SELECT chunk_id, content->>'text' FROM chunks WHERE chunk_type = 'sentence' AND content->>'text' ILIKE %s LIMIT 10",
(f"%{query}%",),
)
rows = cur.fetchall()
elapsed = (time.time() - start) * 1000
results[query] = {"ms": round(elapsed, 2), "rows": len(rows)}
print(f"PostgreSQL text '{query}': {elapsed:.2f}ms, {len(rows)} rows")
cur.close()
conn.close()
return results
def test_mongodb_text_search():
"""Test MongoDB text search"""
results = {}
mongo_client = MongoClient(MONGO_URI)
mongo_collection = mongo_client[MONGO_DB][MONGO_COLLECTION]
for query in ["Paris", "Audrey Hepburn", "Cary Grant"]:
start = time.time()
cursor = mongo_collection.find(
{"uuid": VIDEO_UUID, "chunk_type": "sentence", "$text": {"$search": query}}
).limit(10)
rows = list(cursor)
elapsed = (time.time() - start) * 1000
results[query] = {"ms": round(elapsed, 2), "rows": len(rows)}
print(f"MongoDB text '{query}': {elapsed:.2f}ms, {len(rows)} rows")
mongo_client.close()
return results
def test_qdrant_vector_search():
"""Test Qdrant vector search"""
results = {}
for query in ["Paris", " Audrey Hepburn", "Cary Grant"]:
# Get embedding from Ollama
embed_resp = requests.post(
"http://localhost:11434/api/embeddings",
json={"model": "nomic-embed-text", "prompt": query},
)
embedding = embed_resp.json()["embedding"]
# Search in Qdrant
start = time.time()
resp = requests.post(
"http://localhost:6333/collections/AccusysDB/points/search",
headers={"api-key": "Test3200Test3200Test3200"},
json={"vector": embedding, "limit": 10},
)
elapsed = (time.time() - start) * 1000
data = resp.json()
result_count = len(data.get("result", []))
results[query] = {"ms": round(elapsed, 2), "rows": result_count}
print(f"Qdrant vector '{query}': {elapsed:.2f}ms, {result_count} rows")
return results
def test_postgres_vector_search():
"""Test PostgreSQL vector search using pgvector"""
results = {}
conn = psycopg2.connect(**POSTGRES_CONFIG)
cur = conn.cursor()
for query in ["Paris", " Audrey Hepburn", "Cary Grant"]:
# Get embedding from Ollama
embed_resp = requests.post(
"http://localhost:11434/api/embeddings",
json={"model": "nomic-embed-text", "prompt": query},
)
embedding = embed_resp.json()["embedding"]
# Search in PostgreSQL using pgvector
start = time.time()
# Convert to vector string format
vector_str = "[" + ",".join(str(x) for x in embedding) + "]"
cur.execute(
"""
SELECT chunk_id, (embedding_vector <=> %s::vector) as distance
FROM chunk_vectors
WHERE embedding_vector IS NOT NULL
ORDER BY embedding_vector <=> %s::vector
LIMIT 10
""",
(vector_str, vector_str),
)
rows = cur.fetchall()
elapsed = (time.time() - start) * 1000
results[query] = {"ms": round(elapsed, 2), "rows": len(rows)}
print(f"PostgreSQL vector '{query}': {elapsed:.2f}ms, {len(rows)} rows")
cur.close()
conn.close()
return results
def test_object_search():
"""Test PostgreSQL object search"""
results = {}
conn = psycopg2.connect(**POSTGRES_CONFIG)
cur = conn.cursor()
for obj in ["person", "car", "clock", "tie"]:
start = time.time()
cur.execute(
"""
SELECT chunk_id FROM chunks
WHERE uuid = %s AND chunk_type = 'sentence'
AND metadata IS NOT NULL AND metadata->'yolo'->'objects' ? %s
LIMIT 10
""",
(VIDEO_UUID, obj),
)
rows = cur.fetchall()
elapsed = (time.time() - start) * 1000
results[obj] = {"ms": round(elapsed, 2), "rows": len(rows)}
print(f"PostgreSQL object '{obj}': {elapsed:.2f}ms, {len(rows)} rows")
cur.close()
conn.close()
return results
def main():
print("=" * 70)
print("SEARCH PERFORMANCE COMPARISON")
print("=" * 70)
# Get chunk count
conn = psycopg2.connect(**POSTGRES_CONFIG)
cur = conn.cursor()
cur.execute(
"SELECT COUNT(*) FROM chunks WHERE uuid = %s AND chunk_type = 'sentence'",
(VIDEO_UUID,),
)
chunk_count = cur.fetchone()[0]
print(f"\nTotal sentence chunks: {chunk_count}")
print(f"Video UUID: {VIDEO_UUID}")
cur.close()
conn.close()
print("\n" + "=" * 70)
print("A. TEXT SEARCH (PostgreSQL ILIKE)")
print("=" * 70)
text_results = test_text_search()
print("\n" + "=" * 70)
print("A2. TEXT SEARCH (MongoDB Text)")
print("=" * 70)
mongodb_results = test_mongodb_text_search()
print("\n" + "=" * 70)
print("B1. VECTOR SEARCH (Qdrant HNSW)")
print("=" * 70)
qdrant_results = test_qdrant_vector_search()
print("\n" + "=" * 70)
print("B2. VECTOR SEARCH (PostgreSQL pgvector HNSW)")
print("=" * 70)
pgvector_results = test_postgres_vector_search()
print("\n" + "=" * 70)
print("C. OBJECT SEARCH (PostgreSQL JSON)")
print("=" * 70)
object_results = test_object_search()
print("\n" + "=" * 70)
print("SUMMARY")
print("=" * 70)
print(f"\n{'Method':<28} | {'Query':<20} | {'Time (ms)':<12} | {'Results'}")
print("-" * 75)
for query, data in text_results.items():
print(
f"{'PostgreSQL ILIKE':<28} | {query:<20} | {data['ms']:<12.1f} | {data['rows']}"
)
for query, data in mongodb_results.items():
print(
f"{'MongoDB Text':<28} | {query:<20} | {data['ms']:<12.1f} | {data['rows']}"
)
for query, data in qdrant_results.items():
print(
f"{'Qdrant HNSW':<28} | {query:<20} | {data['ms']:<12.1f} | {data['rows']}"
)
for query, data in pgvector_results.items():
print(
f"{'PostgreSQL pgvector':<28} | {query:<20} | {data['ms']:<12.1f} | {data['rows']}"
)
for query, data in object_results.items():
print(
f"{'PostgreSQL JSON':<28} | {query:<20} | {data['ms']:<12.1f} | {data['rows']}"
)
# Calculate averages
text_avg = sum(d["ms"] for d in text_results.values()) / len(text_results)
mongodb_avg = sum(d["ms"] for d in mongodb_results.values()) / len(mongodb_results)
qdrant_avg = sum(d["ms"] for d in qdrant_results.values()) / len(qdrant_results)
pgvector_avg = sum(d["ms"] for d in pgvector_results.values()) / len(
pgvector_results
)
object_avg = sum(d["ms"] for d in object_results.values()) / len(object_results)
print("\n" + "=" * 70)
print("AVERAGE RESPONSE TIME")
print("=" * 70)
print(f" PostgreSQL ILIKE (Text): {text_avg:.2f}ms")
print(f" MongoDB Text: {mongodb_avg:.2f}ms")
print(f" PostgreSQL pgvector (Vector): {pgvector_avg:.2f}ms")
print(f" Qdrant HNSW (Vector): {qdrant_avg:.2f}ms")
print(f" PostgreSQL JSON (Object): {object_avg:.2f}ms")
print("\n" + "=" * 70)
print("ANALYSIS")
print("=" * 70)
print(
"""
1. TEXT SEARCH (PostgreSQL ILIKE):
- Fast: ~0.7ms average
- Exact substring matching
- Case-insensitive
- Good for keyword searches
2. VECTOR SEARCH - PostgreSQL pgvector (HNSW):
- Speed: ~{:.1f}ms average
- Built into PostgreSQL
- No additional infrastructure needed
- Good for single-database architecture
3. VECTOR SEARCH - Qdrant (HNSW):
- Speed: ~{:.1f}ms average
- Dedicated vector database
- Better for large-scale deployments
- Supports more advanced vector operations
4. OBJECT SEARCH (PostgreSQL JSON):
- Very fast: ~{:.1f}ms average
- Uses JSON containment operator
- Works with YOLO metadata
- Best for visual object queries
RECOMMENDATION:
- For simple keyword searches: PostgreSQL ILIKE
- For semantic search with single DB: PostgreSQL pgvector
- For scalability: Qdrant
- For visual content: PostgreSQL JSON with YOLO metadata
""".format(pgvector_avg, qdrant_avg, object_avg)
)
if __name__ == "__main__":
main()