Files
momentry_core/test_search_comparison.rs
Warren b54c2def30 feat: add migrations, test scripts, and utility tools
- Add database migrations (006-028) for face recognition, identity, file_uuid
- Add test scripts for ASR, face, search, processing
- Add portal frontend (Tauri)
- Add config, benchmark, and monitoring utilities
- Add model checkpoints and pretrained model references
2026-04-30 15:11:53 +08:00

228 lines
7.1 KiB
Rust

use anyhow::{Context, Result};
use momentry_core::core::db::{Database, PostgresDb, QdrantDb};
use momentry_core::Embedder;
use std::env;
use std::time::Instant;
/// Summary row for one query, comparing BM25 keyword search against Qdrant
/// vector search.
#[derive(Debug)]
struct SearchComparison {
    /// The query text that was searched for.
    query: String,
    /// Number of hits returned by the BM25 search.
    bm25_results: usize,
    /// Number of hits returned by the Qdrant vector search.
    qdrant_results: usize,
    /// Score of the first BM25 hit, or `0.0` when there were no hits.
    bm25_top_score: f32,
    /// Score of the first Qdrant hit, or `0.0` when there were no hits.
    qdrant_top_score: f32,
    /// Wall-clock duration of the BM25 search, in milliseconds.
    bm25_time_ms: u128,
    /// Wall-clock duration of the Qdrant search (including the time spent
    /// embedding the query), in milliseconds.
    qdrant_time_ms: u128,
    /// Number of BM25 hits whose `uuid`/`chunk_id` pair also appears in the
    /// Qdrant hits.
    overlap_count: usize,
}
/// Prints a detailed report of the BM25 and Qdrant hits for a single query.
///
/// Up to `limit` entries of each result list are shown, followed by the number
/// of `uuid`/`chunk_id` pairs that occur in both lists. The overlap is computed
/// over the complete slices, not only over the entries that were displayed.
fn print_results(
    query: &str,
    bm25_results: &[momentry_core::core::db::postgres_db::Bm25Result],
    qdrant_results: &[momentry_core::core::db::SearchResult],
    limit: usize,
) {
    let bm25_total = bm25_results.len();
    let qdrant_total = qdrant_results.len();

    println!("\n=== 查詢: '{}' ===", query);

    // BM25 (keyword) hits: a 60-character text preview plus score and timing.
    println!(
        "BM25 結果 (共 {} 筆,顯示前 {} 筆):",
        bm25_total,
        bm25_total.min(limit)
    );
    for (rank, hit) in (1usize..).zip(bm25_results.iter().take(limit)) {
        let preview: String = hit.text.chars().take(60).collect();
        println!(
            " {}. {} (uuid: {}, chunk_id: {})",
            rank, preview, hit.uuid, hit.chunk_id
        );
        println!(
            " 分數: {:.4}, 時間: {:.1}-{:.1}s, 類型: {}",
            hit.bm25_score, hit.start_time, hit.end_time, hit.chunk_type
        );
    }

    // Qdrant (vector) hits: identifiers and similarity score only.
    println!(
        "\nQdrant 向量搜尋結果 (共 {} 筆,顯示前 {} 筆):",
        qdrant_total,
        qdrant_total.min(limit)
    );
    for (rank, hit) in (1usize..).zip(qdrant_results.iter().take(limit)) {
        println!(" {}. uuid: {}, chunk_id: {}", rank, hit.uuid, hit.chunk_id);
        println!(" 分數: {:.4}", hit.score);
    }

    // Overlap: BM25 keys (kept in BM25 order) that are also present among the
    // Qdrant keys. The Qdrant keys go into a set so each lookup is a hash probe.
    let qdrant_keys: std::collections::HashSet<String> = qdrant_results
        .iter()
        .map(|hit| format!("{}-{}", hit.uuid, hit.chunk_id))
        .collect();
    let bm25_keys: Vec<String> = bm25_results
        .iter()
        .map(|hit| format!("{}-{}", hit.uuid, hit.chunk_id))
        .collect();
    let shared: Vec<&String> = bm25_keys
        .iter()
        .filter(|key| qdrant_keys.contains(*key))
        .collect();

    println!(
        "\n結果重疊: {}/{} (BM25 與 Qdrant 共同返回)",
        shared.len(),
        bm25_total.max(qdrant_total)
    );
    if shared.is_empty() {
        return;
    }
    println!("重疊的 chunk IDs: {:?}", shared);
}
/// Runs a fixed list of queries through both BM25 (PostgreSQL) and Qdrant
/// vector search, then prints per-query details, a summary table and
/// aggregate statistics.
///
/// # Errors
///
/// Returns an error if PostgreSQL initialisation, query embedding, or either
/// search call fails.
#[tokio::main]
async fn main() -> Result<()> {
    // Configure the environment for this run.
    // NOTE(review): these calls overwrite any values already exported by the
    // caller and keep an API key in source control. `std::env::set_var` is also
    // documented as unsafe to use while other threads may touch the
    // environment, and `#[tokio::main]` has presumably started worker threads
    // by this point. Consider reading these from the real environment instead.
    env::set_var("RUST_LOG", "info");
    env::set_var("QDRANT_URL", "http://localhost:6333");
    env::set_var("QDRANT_API_KEY", "Test3200Test3200Test3200");
    env::set_var("QDRANT_COLLECTION", "momentry_rule1");
    println!("=== BM25 與 Qdrant 搜尋比較測試 ===\n");

    // Initialise components.
    println!("初始化元件...");
    let embedder = Embedder::new("nomic-embed-text-v2-moe:latest".to_string());
    let pg = PostgresDb::init()
        .await
        .context("Failed to initialize PostgreSQL database")?;
    let qdrant = QdrantDb::new();

    // Test queries: (query text, optional uuid restricting the search).
    let test_queries = vec![
        // English queries
        ("telephone", Some("384b0ff44aaaa1f1")), // "Charade" movie
        ("money", Some("384b0ff44aaaa1f1")),
        ("gold", Some("384b0ff44aaaa1f1")),
        // Chinese queries
        ("工作", Some("9760d0820f0cf9a7")), // ExaSAN video
        ("加快速度", Some("9760d0820f0cf9a7")),
        ("聲音", Some("9760d0820f0cf9a7")),
        // Global queries (no uuid restriction)
        ("computer", None),
        ("technology", None),
    ];
    let limit = 10;
    let mut comparisons = Vec::new();

    for (query_str, uuid_opt) in test_queries {
        let query = query_str.to_string();
        let uuid = uuid_opt.map(|s| s.to_string());
        println!(
            "\n🔍 測試查詢: '{}' {}",
            query,
            uuid_opt
                .map(|u| format!("(uuid: {})", u))
                .unwrap_or_default()
        );

        // BM25 search.
        let bm25_start = Instant::now();
        let bm25_results = pg.search_bm25(&query, uuid_opt, limit).await?;
        let bm25_time = bm25_start.elapsed();

        // Qdrant vector search; the timer deliberately includes the embedding
        // call, which is why the report labels it as such.
        let qdrant_start = Instant::now();
        let query_vector = embedder.embed_query(&query).await?;
        let qdrant_results = if let Some(ref uuid) = uuid {
            qdrant.search_in_uuid(&query_vector, uuid, limit).await?
        } else {
            qdrant.search(&query_vector, limit).await?
        };
        let qdrant_time = qdrant_start.elapsed();

        // Overlap: number of BM25 hits whose "uuid-chunk_id" key also appears
        // among the Qdrant hits. A set avoids rescanning the Qdrant list for
        // every BM25 hit.
        let qdrant_ids: std::collections::HashSet<String> = qdrant_results
            .iter()
            .map(|r| format!("{}-{}", r.uuid, r.chunk_id))
            .collect();
        let overlap_count = bm25_results
            .iter()
            .filter(|r| qdrant_ids.contains(&format!("{}-{}", r.uuid, r.chunk_id)))
            .count();

        // Detailed per-query output.
        print_results(&query, &bm25_results, &qdrant_results, 5);

        // Timing comparison.
        println!("\n⏱️ 效能比較:");
        println!(" BM25 搜尋時間: {}ms", bm25_time.as_millis());
        println!(
            " Qdrant 搜尋時間: {}ms (含向量嵌入時間)",
            qdrant_time.as_millis()
        );

        // Record the summary row last so `query` can be moved instead of cloned.
        comparisons.push(SearchComparison {
            query,
            bm25_results: bm25_results.len(),
            qdrant_results: qdrant_results.len(),
            bm25_top_score: bm25_results.first().map(|r| r.bm25_score).unwrap_or(0.0),
            qdrant_top_score: qdrant_results.first().map(|r| r.score).unwrap_or(0.0),
            bm25_time_ms: bm25_time.as_millis(),
            qdrant_time_ms: qdrant_time.as_millis(),
            overlap_count,
        });
    }

    // Summary table.
    println!("\n📊 搜尋比較總結");
    println!(
        "{:<15} {:<6} {:<6} {:<8} {:<8} {:<6} {:<6} {:<6}",
        "查詢", "BM25數", "QD數", "BM25分", "QD分", "BM25ms", "QDms", "重疊"
    );
    println!("{}", "-".repeat(80));
    for comp in &comparisons {
        // Truncate by characters, not bytes: slicing a `String` at a fixed
        // byte offset panics when that offset falls inside a multi-byte UTF-8
        // character, and the `{:<15}` padding counts characters as well.
        let label: String = comp.query.chars().take(15).collect();
        println!(
            "{:<15} {:<6} {:<6} {:<8.4} {:<8.4} {:<6} {:<6} {:<6}/{}",
            label,
            comp.bm25_results,
            comp.qdrant_results,
            comp.bm25_top_score,
            comp.qdrant_top_score,
            comp.bm25_time_ms,
            comp.qdrant_time_ms,
            comp.overlap_count,
            comp.bm25_results.max(comp.qdrant_results)
        );
    }

    // Aggregate statistics.
    let total_queries = comparisons.len();
    let bm25_faster = comparisons
        .iter()
        .filter(|c| c.bm25_time_ms < c.qdrant_time_ms)
        .count();
    // Per-query overlap ratio uses the larger result count as denominator
    // (clamped to 1 so a query with no hits contributes 0 rather than NaN).
    let avg_overlap = comparisons
        .iter()
        .map(|c| c.overlap_count as f32 / c.bm25_results.max(c.qdrant_results).max(1) as f32)
        .sum::<f32>()
        / total_queries as f32
        * 100.0;
    println!("\n📈 統計分析:");
    println!(" • 總測試查詢數: {}", total_queries);
    println!(
        " • BM25 較快的查詢: {}/{} ({:.1}%)",
        bm25_faster,
        total_queries,
        bm25_faster as f32 / total_queries as f32 * 100.0
    );
    println!(" • 平均結果重疊率: {:.1}%", avg_overlap);
    Ok(())
}