//! Smart Search API //! Hybrid search: semantic (Qdrant) + keyword (PG ILIKE) + identity (person name → chunks). //! Uses Reciprocal Rank Fusion (RRF) to merge and deduplicate results. use axum::{extract::State, http::StatusCode, response::Json, routing::post, Router}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use crate::core::db::postgres_db::SemanticSearchResult; use crate::core::embedding::Embedder; // --- Request / Response Structures --- #[derive(Debug, Deserialize)] pub struct SmartSearchRequest { #[serde(default)] pub file_uuid: Option, pub query: String, pub page: Option, pub page_size: Option, pub limit: Option, } #[derive(Debug, Clone, Serialize)] pub struct SearchResult { pub id: i32, pub file_uuid: Option, pub parent_id: i32, pub scene_order: Option, pub start_frame: i64, pub end_frame: i64, pub fps: f64, pub start_time: f64, pub end_time: f64, pub raw_text: Option, pub summary: Option, pub metadata: Option, pub similarity: Option, pub file_name: Option, pub serve_url: Option, pub thumbnail_url: Option, } #[derive(Debug, Serialize)] pub struct SmartSearchResponse { pub query: String, pub results: Vec, pub page: usize, pub page_size: usize, pub strategy: String, } /// Internal merged result with score-based merge #[derive(Debug)] struct MergedResult { file_uuid: String, chunk_id: String, score: f64, semantic_score: Option, keyword_score: Option, identity_score: Option, source: String, } /// Enrich a Qdrant search result with full data from PostgreSQL async fn enrich_from_pg( db: &crate::core::db::PostgresDb, file_uuid: &str, chunk_id: &str, qdrant_score: f32, ) -> Option { match db.get_chunk_by_file_and_chunk_id(file_uuid, chunk_id).await { Ok(Some(p)) => Some(SearchResult { id: 0, file_uuid: p.file_uuid.clone(), parent_id: p.scene_order, scene_order: Some(p.scene_order), start_frame: p.start_frame, end_frame: p.end_frame, fps: p.fps, start_time: p.start_time, end_time: p.end_time, raw_text: None, summary: Some(p.summary), metadata: p.metadata.clone(), similarity: Some(qdrant_score as f64), file_name: None, serve_url: None, thumbnail_url: None, }), Ok(None) => None, Err(e) => { tracing::warn!("PG enrichment failed for {} {}: {}", file_uuid, chunk_id, e); None } } } fn pg_result_to_search(p: &SemanticSearchResult) -> SearchResult { SearchResult { id: 0, file_uuid: p.file_uuid.clone(), parent_id: p.scene_order, scene_order: Some(p.scene_order), start_frame: p.start_frame, end_frame: p.end_frame, fps: p.fps, start_time: p.start_time, end_time: p.end_time, raw_text: None, summary: Some(p.summary.clone()), metadata: p.metadata.clone(), similarity: p.similarity, file_name: None, serve_url: None, thumbnail_url: None, } } // --- API Handler --- pub async fn smart_search( State(state): State, Json(req): Json, ) -> Result, (StatusCode, Json)> { let db = &state.db; let qdrant = &state.qdrant; let page = req.page.unwrap_or(1).max(1); let page_size = if req.page_size.is_some() { req.page_size.unwrap() } else if req.limit.is_some() && req.page.is_none() { req.limit.unwrap() } else { 5 } .max(1); let hard_limit = req.limit.unwrap_or(usize::MAX); let limit = hard_limit.min(page_size); // 1. Generate embedding let embedder = Embedder::new("embeddinggemma-300m".to_string()); let embedding = embedder.embed_query(&req.query).await.map_err( |e| -> (StatusCode, Json) { tracing::error!("Embedding failed: {}", e); ( StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({ "error": e.to_string() })), ) }, )?; const KEYWORD_FIXED_SCORE: f64 = 0.5; const IDENTITY_FIXED_SCORE: f64 = 0.85; let fetch_limit = limit * 3; // 2. Semantic search via Qdrant let semantic_results: Vec<(String, String, f64)> = if let Some(file_uuid) = &req.file_uuid { let qdrant_hits = qdrant .search_in_uuid(&embedding, file_uuid, fetch_limit) .await .unwrap_or_default(); tracing::info!( "Smart search: Qdrant search_in_uuid for {} returned {} hits", file_uuid, qdrant_hits.len() ); qdrant_hits .into_iter() .map(|h| (h.uuid, h.chunk_id, h.score as f64)) .collect() } else { let qdrant_hits = qdrant .search(&embedding, fetch_limit) .await .unwrap_or_default(); tracing::info!( "Smart search: Qdrant search (no uuid filter) returned {} hits", qdrant_hits.len() ); qdrant_hits .into_iter() .map(|h| (h.uuid, h.chunk_id, h.score as f64)) .collect() }; // 3. Keyword search via PG ILIKE let keyword_results: Vec<(String, String, f64)> = match db .search_bm25(&req.query, req.file_uuid.as_deref(), fetch_limit as i64) .await { Ok(rows) => rows .into_iter() .map(|r| (r.file_uuid, r.chunk_id, r.combined_score)) .collect(), Err(e) => { tracing::warn!("Keyword search (bm25) failed: {}", e); vec![] } }; // 3b. Video title search: if query matches a video title, get its chunks const TITLE_MATCH_SCORE: f64 = 0.9; let title_results: Vec<(String, String, f64)> = { let clean_query = req.query.replace('\'', "''"); let v_table = crate::core::db::schema::table_name("videos"); let c_table = crate::core::db::schema::table_name("chunk"); let video_rows: Vec<(String,)> = sqlx::query_as(&format!( "SELECT file_uuid::text FROM {} WHERE file_name ILIKE $1 LIMIT 5", v_table )) .bind(format!("%{}%", clean_query)) .fetch_all(db.pool()) .await .unwrap_or_default(); let mut chunks = Vec::new(); for (fu,) in video_rows.iter() { if let Some(ref f) = req.file_uuid { if fu != f { continue; } } let rows: Vec<(String, String)> = sqlx::query_as(&format!( "SELECT chunk_id, file_uuid::text FROM {} \ WHERE file_uuid = $1 AND embedding IS NOT NULL \ AND chunk_type = 'sentence' \ LIMIT 20", c_table )) .bind(fu) .fetch_all(db.pool()) .await .unwrap_or_default(); for (cid, file_uuid) in rows { chunks.push((file_uuid, cid, TITLE_MATCH_SCORE)); } } chunks }; // 4. Identity search: if query matches a person name, get their chunks let identity_results: Vec<(String, String, f64)> = { let id_table = crate::core::db::schema::table_name("identities"); let clean_query = req.query.replace('\'', "''"); let id_rows: Vec<(i32, String, String)> = sqlx::query_as(&format!( "SELECT id, name, uuid::text FROM {} WHERE name ILIKE $1 LIMIT 5", id_table )) .bind(format!("%{}%", clean_query)) .fetch_all(db.pool()) .await .unwrap_or_default(); let mut id_chunks = Vec::new(); for (identity_id, _, uuid_text) in id_rows.iter().take(3) { let clean_uuid = uuid_text.replace('-', ""); match db.get_identity_chunks(&clean_uuid, 20, 0).await { Ok(chunks) => { for chunk in chunks { if let Some(ref fu) = req.file_uuid { if &chunk.file_uuid != fu { continue; } } id_chunks.push((chunk.file_uuid, chunk.chunk_id, 0.85)); } } Err(e) => { tracing::debug!("get_identity_chunks for {} failed: {}", clean_uuid, e); } } } id_chunks }; // 5. Score-based merge: combine results from all sources let mut merged: HashMap<(String, String), MergedResult> = HashMap::new(); // Add semantic results (use Qdrant cosine score directly) for (file_uuid, chunk_id, score) in semantic_results.iter() { let key = (file_uuid.clone(), chunk_id.clone()); merged .entry(key) .and_modify(|e| { e.score = e.score.max(*score); e.semantic_score = Some(*score); e.source = format!( "{}_{}", e.source.strip_prefix("semantic+").unwrap_or(&e.source), "semantic" ); }) .or_insert(MergedResult { file_uuid: file_uuid.clone(), chunk_id: chunk_id.clone(), score: *score, semantic_score: Some(*score), keyword_score: None, identity_score: None, source: "semantic".to_string(), }); } // Add keyword results (fixed score 0.5) let keyword_fixed = KEYWORD_FIXED_SCORE; for (file_uuid, chunk_id, _) in keyword_results.iter() { let key = (file_uuid.clone(), chunk_id.clone()); merged .entry(key) .and_modify(|e| { e.score = e.score.max(keyword_fixed); e.keyword_score = Some(keyword_fixed); e.source = format!("{}_keyword", e.source); }) .or_insert(MergedResult { file_uuid: file_uuid.clone(), chunk_id: chunk_id.clone(), score: keyword_fixed, semantic_score: None, keyword_score: Some(keyword_fixed), identity_score: None, source: "keyword".to_string(), }); } // Add title match results (high score 0.9) — query matched video title let has_title_match = !title_results.is_empty(); let title_fixed = TITLE_MATCH_SCORE; for (file_uuid, chunk_id, _) in title_results.iter() { let key = (file_uuid.clone(), chunk_id.clone()); merged .entry(key) .and_modify(|e| { e.score = e.score.max(title_fixed); e.source = format!("{}_title", e.source); }) .or_insert(MergedResult { file_uuid: file_uuid.clone(), chunk_id: chunk_id.clone(), score: title_fixed, semantic_score: None, keyword_score: None, identity_score: None, source: "title".to_string(), }); } // Add identity results (fixed score 0.85) let has_identity_match = !identity_results.is_empty(); let identity_fixed = IDENTITY_FIXED_SCORE; for (file_uuid, chunk_id, _) in identity_results.iter() { let key = (file_uuid.clone(), chunk_id.clone()); merged .entry(key) .and_modify(|e| { e.score = e.score.max(identity_fixed); e.identity_score = Some(identity_fixed); e.source = format!("{}_identity", e.source); }) .or_insert(MergedResult { file_uuid: file_uuid.clone(), chunk_id: chunk_id.clone(), score: identity_fixed, semantic_score: None, keyword_score: None, identity_score: Some(identity_fixed), source: "identity".to_string(), }); } // Sort by score descending (score-based merge) let mut ranked: Vec<&MergedResult> = merged.values().collect(); ranked.sort_by(|a, b| { b.score .partial_cmp(&a.score) .unwrap_or(std::cmp::Ordering::Equal) }); // 6. Enrich top results from PG and build final response let query_lower = req.query.to_lowercase(); let mut final_results = Vec::new(); for mr in ranked.iter().take(limit * 3) { // 取更多結果以便過濾 if let Some(pg) = db .get_chunk_by_file_and_chunk_id(&mr.file_uuid, &mr.chunk_id) .await .ok() .flatten() { // 關鍵字過濾: CJK 用子字串匹配,英文用單詞邊界匹配 let summary_lower = pg.summary.to_lowercase(); let query_words: Vec = query_lower .split_whitespace() .map(|s| s.to_string()) .collect(); let text_match = !pg.summary.is_empty() && { let has_cjk = |s: &str| -> bool { s.chars().any(|c| { ('\u{4E00}'..='\u{9FFF}').contains(&c) || ('\u{3040}'..='\u{309F}').contains(&c) || ('\u{30A0}'..='\u{30FF}').contains(&c) || ('\u{AC00}'..='\u{D7AF}').contains(&c) }) }; if has_cjk(&query_lower) || has_cjk(&summary_lower) { query_words.iter().all(|w| summary_lower.contains(w)) } else { let bordered = format!(" {} ", summary_lower); query_words .iter() .all(|w| bordered.contains(&format!(" {} ", w))) } }; if !text_match && mr.semantic_score.is_none() { continue; } final_results.push(SearchResult { id: 0, file_uuid: pg.file_uuid.clone(), parent_id: pg.scene_order, scene_order: Some(pg.scene_order), start_frame: pg.start_frame, end_frame: pg.end_frame, fps: pg.fps, start_time: pg.start_time, end_time: pg.end_time, raw_text: None, summary: Some(pg.summary), metadata: pg.metadata.clone(), similarity: Some(mr.score), file_name: None, serve_url: None, thumbnail_url: pg.file_uuid.as_ref().map(|fu| { format!( "/wp-json/momentry/v1/media?type=chunk_thumbnail&file_uuid={}&chunk_id={}", fu, mr.chunk_id ) }), }); } } // Trim to requested limit final_results.truncate(limit); // 7. Enrich results with file_name and serve_url from videos table if !final_results.is_empty() { let v_table = crate::core::db::schema::table_name("videos"); let file_uuids: Vec = final_results .iter() .filter_map(|r| r.file_uuid.clone()) .collect(); let file_rows: Vec<(String, String, String)> = sqlx::query_as(&format!( "SELECT file_uuid::text, file_name, file_path FROM {} WHERE file_uuid = ANY($1)", v_table )) .bind(&file_uuids) .fetch_all(db.pool()) .await .unwrap_or_default(); let file_map: std::collections::HashMap = file_rows .into_iter() .map(|(uuid, name, path)| (uuid, (name, path))) .collect(); let storage_root = crate::core::config::STORAGE_ROOT.as_str(); let serve_base = crate::core::config::SERVE_BASE_URL.as_str(); for r in &mut final_results { if let Some(ref uuid) = r.file_uuid { if let Some((name, path)) = file_map.get(uuid) { r.file_name = Some(name.clone()); if let Some(relative) = path.strip_prefix(storage_root) { r.serve_url = Some(format!("{}{}", serve_base, relative)); } } } } } // Determine strategy string let mut strategies = vec!["semantic"]; if !keyword_results.is_empty() { strategies.push("keyword"); } if has_identity_match { strategies.push("identity"); } if has_title_match { strategies.push("title"); } Ok(Json(SmartSearchResponse { query: req.query, results: final_results, page, page_size, strategy: format!("hybrid_{}", strategies.join("+")), })) } // --- Router Setup --- pub fn search_routes() -> Router { Router::new().route("/api/v1/search/smart", post(smart_search)) }