feat: score-based search, LLM re-ranking endpoint, video title search, pipeline module
Core search changes:
- Replace RRF with score-based merge (max of semantic/keyword/identity)
- Add video title ILIKE search for brand/name queries (score 0.9)
- Add /api/v1/search/llm-smart endpoint with Gemma 4 re-ranking
- Fix LLM JSON parsing (markdown fences, empty responses)

Infrastructure:
- Rebuild Qdrant collection (clear 347K contaminated points)
- Add dotenv loading to main.rs for config parity
- Implement store_pre_chunk in postgres_db.rs

Pipeline module (WordPress):
- store-asrx, rule1, vectorize, phase1, complete endpoints
- CLI commands for pipeline operations

Docs:
- SEARCH_SCORE_IMPROVEMENT.md (score-based merge proposal)
This commit is contained in:
+94
-28
@@ -21,7 +21,7 @@ pub struct SmartSearchRequest {
|
||||
pub limit: Option<usize>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct SearchResult {
|
||||
pub id: i32,
|
||||
pub file_uuid: Option<String>,
|
||||
@@ -47,12 +47,12 @@ pub struct SmartSearchResponse {
|
||||
pub strategy: String,
|
||||
}
|
||||
|
||||
/// Internal merged result with RRF scoring
|
||||
/// Internal merged result with score-based merge
|
||||
#[derive(Debug)]
|
||||
struct MergedResult {
|
||||
file_uuid: String,
|
||||
chunk_id: String,
|
||||
rrf_score: f64,
|
||||
score: f64,
|
||||
semantic_score: Option<f64>,
|
||||
keyword_score: Option<f64>,
|
||||
identity_score: Option<f64>,
|
||||
@@ -140,8 +140,10 @@ pub async fn smart_search(
|
||||
},
|
||||
)?;
|
||||
|
||||
const KEYWORD_FIXED_SCORE: f64 = 0.5;
|
||||
const IDENTITY_FIXED_SCORE: f64 = 0.85;
|
||||
|
||||
let fetch_limit = limit * 3;
|
||||
let rrf_k = 60.0;
|
||||
|
||||
// 2. Semantic search via Qdrant
|
||||
let semantic_results: Vec<(String, String, f64)> = if let Some(file_uuid) = &req.file_uuid {
|
||||
@@ -176,6 +178,46 @@ pub async fn smart_search(
|
||||
}
|
||||
};
|
||||
|
||||
// 3b. Video title search: if query matches a video title, get its chunks
|
||||
const TITLE_MATCH_SCORE: f64 = 0.9;
|
||||
let title_results: Vec<(String, String, f64)> = {
|
||||
let clean_query = req.query.replace('\'', "''");
|
||||
let v_table = crate::core::db::schema::table_name("videos");
|
||||
let c_table = crate::core::db::schema::table_name("chunk");
|
||||
let video_rows: Vec<(String,)> = sqlx::query_as(&format!(
|
||||
"SELECT file_uuid::text FROM {} WHERE file_name ILIKE $1 LIMIT 5",
|
||||
v_table
|
||||
))
|
||||
.bind(format!("%{}%", clean_query))
|
||||
.fetch_all(db.pool())
|
||||
.await
|
||||
.unwrap_or_default();
|
||||
|
||||
let mut chunks = Vec::new();
|
||||
for (fu,) in video_rows.iter() {
|
||||
if let Some(ref f) = req.file_uuid {
|
||||
if fu != f {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
let rows: Vec<(String, String)> = sqlx::query_as(&format!(
|
||||
"SELECT chunk_id, file_uuid::text FROM {} \
|
||||
WHERE file_uuid = $1 AND embedding IS NOT NULL \
|
||||
AND chunk_type = 'sentence' \
|
||||
LIMIT 20",
|
||||
c_table
|
||||
))
|
||||
.bind(fu)
|
||||
.fetch_all(db.pool())
|
||||
.await
|
||||
.unwrap_or_default();
|
||||
for (cid, file_uuid) in rows {
|
||||
chunks.push((file_uuid, cid, TITLE_MATCH_SCORE));
|
||||
}
|
||||
}
|
||||
chunks
|
||||
};
|
||||
|
||||
// 4. Identity search: if query matches a person name, get their chunks
|
||||
let identity_results: Vec<(String, String, f64)> = {
|
||||
let id_table = crate::core::db::schema::table_name("identities");
|
||||
@@ -211,24 +253,23 @@ pub async fn smart_search(
|
||||
id_chunks
|
||||
};
|
||||
|
||||
// 5. RRF merge: combine results from all sources
|
||||
// 5. Score-based merge: combine results from all sources
|
||||
let mut merged: HashMap<(String, String), MergedResult> = HashMap::new();
|
||||
|
||||
// Add semantic results
|
||||
for (rank, (file_uuid, chunk_id, score)) in semantic_results.iter().enumerate() {
|
||||
// Add semantic results (use Qdrant cosine score directly)
|
||||
for (file_uuid, chunk_id, score) in semantic_results.iter() {
|
||||
let key = (file_uuid.clone(), chunk_id.clone());
|
||||
let rrf_contribution = 1.0 / (rrf_k + rank as f64 + 1.0);
|
||||
merged
|
||||
.entry(key)
|
||||
.and_modify(|e| {
|
||||
e.rrf_score += rrf_contribution;
|
||||
e.score = e.score.max(*score);
|
||||
e.semantic_score = Some(*score);
|
||||
e.source = format!("{}_{}", e.source.strip_prefix("semantic+").unwrap_or(&e.source), "semantic");
|
||||
})
|
||||
.or_insert(MergedResult {
|
||||
file_uuid: file_uuid.clone(),
|
||||
chunk_id: chunk_id.clone(),
|
||||
rrf_score: rrf_contribution,
|
||||
score: *score,
|
||||
semantic_score: Some(*score),
|
||||
keyword_score: None,
|
||||
identity_score: None,
|
||||
@@ -236,54 +277,76 @@ pub async fn smart_search(
|
||||
});
|
||||
}
|
||||
|
||||
// Add keyword results
|
||||
for (rank, (file_uuid, chunk_id, score)) in keyword_results.iter().enumerate() {
|
||||
// Add keyword results (fixed score 0.5)
|
||||
let keyword_fixed = KEYWORD_FIXED_SCORE;
|
||||
for (file_uuid, chunk_id, _) in keyword_results.iter() {
|
||||
let key = (file_uuid.clone(), chunk_id.clone());
|
||||
let rrf_contribution = 1.0 / (rrf_k + rank as f64 + 1.0);
|
||||
merged
|
||||
.entry(key)
|
||||
.and_modify(|e| {
|
||||
e.rrf_score += rrf_contribution;
|
||||
e.keyword_score = Some(*score);
|
||||
e.score = e.score.max(keyword_fixed);
|
||||
e.keyword_score = Some(keyword_fixed);
|
||||
e.source = format!("{}_keyword", e.source);
|
||||
})
|
||||
.or_insert(MergedResult {
|
||||
file_uuid: file_uuid.clone(),
|
||||
chunk_id: chunk_id.clone(),
|
||||
rrf_score: rrf_contribution,
|
||||
score: keyword_fixed,
|
||||
semantic_score: None,
|
||||
keyword_score: Some(*score),
|
||||
keyword_score: Some(keyword_fixed),
|
||||
identity_score: None,
|
||||
source: "keyword".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
// Add identity results (only if we found matching identities)
|
||||
let has_identity_match = !identity_results.is_empty();
|
||||
for (rank, (file_uuid, chunk_id, score)) in identity_results.iter().enumerate() {
|
||||
// Add title match results (high score 0.9) — query matched video title
|
||||
let has_title_match = !title_results.is_empty();
|
||||
let title_fixed = TITLE_MATCH_SCORE;
|
||||
for (file_uuid, chunk_id, _) in title_results.iter() {
|
||||
let key = (file_uuid.clone(), chunk_id.clone());
|
||||
let rrf_contribution = 1.0 / (rrf_k + rank as f64 + 1.0);
|
||||
merged
|
||||
.entry(key)
|
||||
.and_modify(|e| {
|
||||
e.rrf_score += rrf_contribution;
|
||||
e.identity_score = Some(*score);
|
||||
e.score = e.score.max(title_fixed);
|
||||
e.source = format!("{}_title", e.source);
|
||||
})
|
||||
.or_insert(MergedResult {
|
||||
file_uuid: file_uuid.clone(),
|
||||
chunk_id: chunk_id.clone(),
|
||||
score: title_fixed,
|
||||
semantic_score: None,
|
||||
keyword_score: None,
|
||||
identity_score: None,
|
||||
source: "title".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
// Add identity results (fixed score 0.85)
|
||||
let has_identity_match = !identity_results.is_empty();
|
||||
let identity_fixed = IDENTITY_FIXED_SCORE;
|
||||
for (file_uuid, chunk_id, _) in identity_results.iter() {
|
||||
let key = (file_uuid.clone(), chunk_id.clone());
|
||||
merged
|
||||
.entry(key)
|
||||
.and_modify(|e| {
|
||||
e.score = e.score.max(identity_fixed);
|
||||
e.identity_score = Some(identity_fixed);
|
||||
e.source = format!("{}_identity", e.source);
|
||||
})
|
||||
.or_insert(MergedResult {
|
||||
file_uuid: file_uuid.clone(),
|
||||
chunk_id: chunk_id.clone(),
|
||||
rrf_score: rrf_contribution,
|
||||
score: identity_fixed,
|
||||
semantic_score: None,
|
||||
keyword_score: None,
|
||||
identity_score: Some(*score),
|
||||
identity_score: Some(identity_fixed),
|
||||
source: "identity".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
// Sort by RRF score descending
|
||||
// Sort by score descending (score-based merge)
|
||||
let mut ranked: Vec<&MergedResult> = merged.values().collect();
|
||||
ranked.sort_by(|a, b| b.rrf_score.partial_cmp(&a.rrf_score).unwrap_or(std::cmp::Ordering::Equal));
|
||||
ranked.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap_or(std::cmp::Ordering::Equal));
|
||||
|
||||
// 6. Enrich top results from PG and build final response
|
||||
let mut final_results = Vec::new();
|
||||
@@ -307,7 +370,7 @@ pub async fn smart_search(
|
||||
raw_text: None,
|
||||
summary: Some(pg.summary),
|
||||
metadata: pg.metadata.clone(),
|
||||
similarity: Some(mr.rrf_score),
|
||||
similarity: Some(mr.score),
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -320,6 +383,9 @@ pub async fn smart_search(
|
||||
if has_identity_match {
|
||||
strategies.push("identity");
|
||||
}
|
||||
if has_title_match {
|
||||
strategies.push("title");
|
||||
}
|
||||
|
||||
Ok(Json(SmartSearchResponse {
|
||||
query: req.query,
|
||||
|
||||
Reference in New Issue
Block a user