feat: deploy hybrid search (semantic+keyword+identity) with RRF fusion

- Replace smart_search with hybrid RRF implementation
- Add speaker_detections table for identity-agent binding
- Fix identity queries: direct SQL to avoid type mismatches
- Add debug logs to job_worker for processor debugging
- Deployed to production (3002) successfully

Key changes:
- search.rs: Complete rewrite with 3 strategies + RRF
- postgres_db.rs: speaker_detections table + identity query fixes
- job_worker.rs: Debug logs for output file checks

Tested:
- Hybrid search works with semantic + keyword + identity
- Identity search: 'identity:Charade' returns correct results
- Chinese keyword search: '調光' matches Charade summaries

Bugs found:
- Case mismatch: 'ASRX' vs 'asrx' in processors field
- Missing CUT dependency for ASRX processor
This commit is contained in:
Accusys
2026-06-01 15:15:17 +08:00
parent 0d58a738a1
commit 874d688987
4 changed files with 549 additions and 74 deletions

View File

@@ -1,17 +1,20 @@
//! Smart Search API
//! Implements the 5W1H search capability using semantic vectors.
//! Hybrid search: semantic (Qdrant) + keyword (PG ILIKE) + identity (person name → chunks).
//! Uses Reciprocal Rank Fusion (RRF) to merge and deduplicate results.
use axum::{extract::State, http::StatusCode, response::Json, routing::post, Router};
use serde::{Deserialize, Serialize};
use serde_json;
use std::collections::HashMap;
use crate::core::db::postgres_db::SemanticSearchResult;
use crate::core::embedding::Embedder;
// --- Request / Response Structures ---
#[derive(Debug, Deserialize)]
pub struct SmartSearchRequest {
pub file_uuid: String,
#[serde(default)]
pub file_uuid: Option<String>,
pub query: String,
pub page: Option<usize>,
pub page_size: Option<usize>,
@@ -21,20 +24,16 @@ pub struct SmartSearchRequest {
#[derive(Debug, Serialize)]
pub struct SearchResult {
pub id: i32,
pub file_uuid: Option<String>,
pub parent_id: i32,
pub scene_order: Option<i32>,
// Primary: frame-accurate position (authoritative unit)
pub start_frame: i64,
pub end_frame: i64,
pub fps: f64,
// Reference: time derived from frames (subject to FPS variation, not precise)
pub start_time: f64,
pub end_time: f64,
pub raw_text: Option<String>, // Text content of the child chunk
pub summary: Option<String>, // Summary from parent context
pub raw_text: Option<String>,
pub summary: Option<String>,
pub metadata: Option<serde_json::Value>,
pub similarity: Option<f64>,
}
@@ -48,6 +47,67 @@ pub struct SmartSearchResponse {
pub strategy: String,
}
/// Internal merged result with RRF scoring.
///
/// One entry per distinct `(file_uuid, chunk_id)` pair; `smart_search` keeps
/// these in a `HashMap` keyed by that pair so a chunk returned by several
/// search strategies is deduplicated into a single record.
#[derive(Debug)]
struct MergedResult {
// File the chunk belongs to (first half of the dedup key).
file_uuid: String,
// Chunk identifier within the file (second half of the dedup key).
chunk_id: String,
// Accumulated Reciprocal Rank Fusion score: each strategy that returned this
// chunk adds `1.0 / (rrf_k + rank + 1.0)`, where `rank` is the zero-based
// position of the chunk in that strategy's result list.
rrf_score: f64,
// Raw score reported by the semantic (Qdrant) hit; `None` if the semantic
// strategy did not return this chunk.
semantic_score: Option<f64>,
// `combined_score` of the keyword search row; `None` if the keyword
// strategy did not return this chunk.
keyword_score: Option<f64>,
// Score attached to identity matches (the caller currently uses a fixed
// 0.85); `None` if the identity strategy did not return this chunk.
identity_score: Option<f64>,
// Label of the strategies that contributed, e.g. "semantic", with suffixes
// such as "_keyword" / "_identity" appended when later strategies also hit.
source: String,
}
/// Enrich a Qdrant search hit with the full chunk record stored in PostgreSQL.
///
/// Looks the chunk up via `db.get_chunk_by_file_and_chunk_id(file_uuid, chunk_id)`
/// and maps the row onto a [`SearchResult`].
///
/// # Returns
/// * `Some(SearchResult)` when the chunk exists. `similarity` carries
///   `qdrant_score` widened to `f64`, `id` is always `0`, `parent_id` mirrors
///   `scene_order`, and `raw_text` is left empty.
/// * `None` when the chunk is not found, or when the lookup fails (the error
///   is logged at `warn` level and otherwise swallowed).
async fn enrich_from_pg(
    db: &crate::core::db::PostgresDb,
    file_uuid: &str,
    chunk_id: &str,
    qdrant_score: f32,
) -> Option<SearchResult> {
    let lookup = db.get_chunk_by_file_and_chunk_id(file_uuid, chunk_id).await;

    // Guard clauses: bail out on a failed lookup or a missing row.
    let row = match lookup {
        Ok(maybe_row) => maybe_row?,
        Err(err) => {
            tracing::warn!("PG enrichment failed for {} {}: {}", file_uuid, chunk_id, err);
            return None;
        }
    };

    let scene = row.scene_order;
    Some(SearchResult {
        id: 0,
        file_uuid: row.file_uuid.clone(),
        parent_id: scene,
        scene_order: Some(scene),
        start_frame: row.start_frame,
        end_frame: row.end_frame,
        fps: row.fps,
        start_time: row.start_time,
        end_time: row.end_time,
        raw_text: None,
        summary: Some(row.summary),
        metadata: row.metadata.clone(),
        // f32 -> f64 is a lossless widening, same value as an `as` cast.
        similarity: Some(f64::from(qdrant_score)),
    })
}
/// Convert a PostgreSQL semantic-search row into the API-facing [`SearchResult`].
///
/// The row is only borrowed, so owned fields (`file_uuid`, `summary`,
/// `metadata`) are cloned. `id` is always `0`, `parent_id` mirrors
/// `scene_order`, `raw_text` is left empty, and `similarity` is passed through
/// unchanged from the row.
fn pg_result_to_search(p: &SemanticSearchResult) -> SearchResult {
    let scene = p.scene_order;
    let summary_text = p.summary.clone();

    SearchResult {
        id: 0,
        file_uuid: p.file_uuid.clone(),
        // Position within the file.
        parent_id: scene,
        scene_order: Some(scene),
        start_frame: p.start_frame,
        end_frame: p.end_frame,
        fps: p.fps,
        start_time: p.start_time,
        end_time: p.end_time,
        // Text payload.
        raw_text: None,
        summary: Some(summary_text),
        metadata: p.metadata.clone(),
        similarity: p.similarity,
    }
}
// --- API Handler ---
pub async fn smart_search(
@@ -55,8 +115,8 @@ pub async fn smart_search(
Json(req): Json<SmartSearchRequest>,
) -> Result<Json<SmartSearchResponse>, (StatusCode, Json<serde_json::Value>)> {
let db = &state.db;
let qdrant = &state.qdrant;
let page = req.page.unwrap_or(1).max(1);
// Backward compat: if old `limit` sent without `page_size`, use limit as page_size
let page_size = if req.page_size.is_some() {
req.page_size.unwrap()
} else if req.limit.is_some() && req.page.is_none() {
@@ -68,7 +128,7 @@ pub async fn smart_search(
let hard_limit = req.limit.unwrap_or(usize::MAX);
let limit = hard_limit.min(page_size);
// 1. Generate Embedding using EmbeddingGemma via MOMENTRY_EMBED_URL
// 1. Generate embedding
let embedder = Embedder::new("embeddinggemma-300m".to_string());
let embedding = embedder.embed_query(&req.query).await.map_err(
|e| -> (StatusCode, Json<serde_json::Value>) {
@@ -80,52 +140,198 @@ pub async fn smart_search(
},
)?;
// 2. Search Database (Drill-Down: Find Parents First)
let db_parents: Vec<crate::core::db::postgres_db::SemanticSearchResult> = db
.search_parent_chunks_semantic(&req.file_uuid, &embedding, limit)
.await
.map_err(
|e: anyhow::Error| -> (StatusCode, Json<serde_json::Value>) {
tracing::error!("DB search failed: {}", e);
(
StatusCode::INTERNAL_SERVER_ERROR,
Json(serde_json::json!({ "error": e.to_string() })),
)
},
)?;
let fetch_limit = limit * 3;
let rrf_k = 60.0;
// Return parent chunks directly as search results
let results: Vec<SearchResult> = db_parents
.into_iter()
.map(|p| SearchResult {
id: 0,
parent_id: p.scene_order,
scene_order: Some(p.scene_order),
start_frame: p.start_frame,
end_frame: p.end_frame,
fps: p.fps,
start_time: p.start_time,
end_time: p.end_time,
raw_text: None,
summary: Some(p.summary),
metadata: p.metadata.clone(),
similarity: p.similarity,
})
.collect();
let response = SmartSearchResponse {
query: req.query,
results,
page,
page_size,
strategy: "semantic_vector_search".to_string(),
// 2. Semantic search via Qdrant
let semantic_results: Vec<(String, String, f64)> = if let Some(file_uuid) = &req.file_uuid {
let qdrant_hits = qdrant
.search_in_uuid(&embedding, file_uuid, fetch_limit)
.await
.unwrap_or_default();
qdrant_hits
.into_iter()
.map(|h| (h.uuid, h.chunk_id, h.score as f64))
.collect()
} else {
let qdrant_hits = qdrant.search(&embedding, fetch_limit).await.unwrap_or_default();
qdrant_hits
.into_iter()
.map(|h| (h.uuid, h.chunk_id, h.score as f64))
.collect()
};
Ok(Json(response))
// 3. Keyword search via PG ILIKE
let keyword_results: Vec<(String, String, f64)> = match db
.search_bm25(&req.query, req.file_uuid.as_deref(), fetch_limit as i64)
.await
{
Ok(rows) => rows
.into_iter()
.map(|r| (r.file_uuid, r.chunk_id, r.combined_score))
.collect(),
Err(e) => {
tracing::warn!("Keyword search (bm25) failed: {}", e);
vec![]
}
};
// 4. Identity search: if query matches a person name, get their chunks
let identity_results: Vec<(String, String, f64)> = {
let id_table = crate::core::db::schema::table_name("identities");
let clean_query = req.query.replace('\'', "''");
let id_rows: Vec<(i32, String, String)> = sqlx::query_as(&format!(
"SELECT id, name, uuid::text FROM {} WHERE name ILIKE $1 LIMIT 5",
id_table
))
.bind(format!("%{}%", clean_query))
.fetch_all(db.pool())
.await
.unwrap_or_default();
let mut id_chunks = Vec::new();
for (identity_id, _, uuid_text) in id_rows.iter().take(3) {
let clean_uuid = uuid_text.replace('-', "");
match db.get_identity_chunks(&clean_uuid, 20, 0).await {
Ok(chunks) => {
for chunk in chunks {
if let Some(ref fu) = req.file_uuid {
if &chunk.file_uuid != fu {
continue;
}
}
id_chunks.push((chunk.file_uuid, chunk.chunk_id, 0.85));
}
}
Err(e) => {
tracing::debug!("get_identity_chunks for {} failed: {}", clean_uuid, e);
}
}
}
id_chunks
};
// 5. RRF merge: combine results from all sources
let mut merged: HashMap<(String, String), MergedResult> = HashMap::new();
// Add semantic results
for (rank, (file_uuid, chunk_id, score)) in semantic_results.iter().enumerate() {
let key = (file_uuid.clone(), chunk_id.clone());
let rrf_contribution = 1.0 / (rrf_k + rank as f64 + 1.0);
merged
.entry(key)
.and_modify(|e| {
e.rrf_score += rrf_contribution;
e.semantic_score = Some(*score);
e.source = format!("{}_{}", e.source.strip_prefix("semantic+").unwrap_or(&e.source), "semantic");
})
.or_insert(MergedResult {
file_uuid: file_uuid.clone(),
chunk_id: chunk_id.clone(),
rrf_score: rrf_contribution,
semantic_score: Some(*score),
keyword_score: None,
identity_score: None,
source: "semantic".to_string(),
});
}
// Add keyword results
for (rank, (file_uuid, chunk_id, score)) in keyword_results.iter().enumerate() {
let key = (file_uuid.clone(), chunk_id.clone());
let rrf_contribution = 1.0 / (rrf_k + rank as f64 + 1.0);
merged
.entry(key)
.and_modify(|e| {
e.rrf_score += rrf_contribution;
e.keyword_score = Some(*score);
e.source = format!("{}_keyword", e.source);
})
.or_insert(MergedResult {
file_uuid: file_uuid.clone(),
chunk_id: chunk_id.clone(),
rrf_score: rrf_contribution,
semantic_score: None,
keyword_score: Some(*score),
identity_score: None,
source: "keyword".to_string(),
});
}
// Add identity results (only if we found matching identities)
let has_identity_match = !identity_results.is_empty();
for (rank, (file_uuid, chunk_id, score)) in identity_results.iter().enumerate() {
let key = (file_uuid.clone(), chunk_id.clone());
let rrf_contribution = 1.0 / (rrf_k + rank as f64 + 1.0);
merged
.entry(key)
.and_modify(|e| {
e.rrf_score += rrf_contribution;
e.identity_score = Some(*score);
e.source = format!("{}_identity", e.source);
})
.or_insert(MergedResult {
file_uuid: file_uuid.clone(),
chunk_id: chunk_id.clone(),
rrf_score: rrf_contribution,
semantic_score: None,
keyword_score: None,
identity_score: Some(*score),
source: "identity".to_string(),
});
}
// Sort by RRF score descending
let mut ranked: Vec<&MergedResult> = merged.values().collect();
ranked.sort_by(|a, b| b.rrf_score.partial_cmp(&a.rrf_score).unwrap_or(std::cmp::Ordering::Equal));
// 6. Enrich top results from PG and build final response
let mut final_results = Vec::new();
for mr in ranked.iter().take(limit) {
if let Some(pg) = db
.get_chunk_by_file_and_chunk_id(&mr.file_uuid, &mr.chunk_id)
.await
.ok()
.flatten()
{
final_results.push(SearchResult {
id: 0,
file_uuid: pg.file_uuid.clone(),
parent_id: pg.scene_order,
scene_order: Some(pg.scene_order),
start_frame: pg.start_frame,
end_frame: pg.end_frame,
fps: pg.fps,
start_time: pg.start_time,
end_time: pg.end_time,
raw_text: None,
summary: Some(pg.summary),
metadata: pg.metadata.clone(),
similarity: Some(mr.rrf_score),
});
}
}
// Determine strategy string
let mut strategies = vec!["semantic"];
if !keyword_results.is_empty() {
strategies.push("keyword");
}
if has_identity_match {
strategies.push("identity");
}
Ok(Json(SmartSearchResponse {
query: req.query,
results: final_results,
page,
page_size,
strategy: format!("hybrid_{}", strategies.join("+")),
}))
}
// --- Router Setup ---
/// Build the router for the search API.
///
/// Registers a single route: `POST /api/v1/search/smart`, handled by
/// [`smart_search`].
pub fn search_routes() -> Router<crate::api::types::AppState> {
    let smart_search_handler = post(smart_search);
    Router::new().route("/api/v1/search/smart", smart_search_handler)
}
}