Review notes (free-form text before the first "diff --git" header is ignored by git):
- smart_search: `display_text` was computed but never used; it now feeds the returned
  `summary`, so an empty summary really falls back to `text_content`.
- smart_search: every hit carrying a keyword score skips the summary text filter
  (search_bm25 already matched it), not only keyword-only hits.
- search_bm25: LIKE metacharacters in the query are escaped with backslashes so the
  ILIKE filter and the literal POSITION() ranking agree.
- Line breaks, indentation and generic parameters were reconstructed from a flattened
  copy of the patch; the post-image blob ids on the "index" lines predate these fixes.

diff --git a/src/api/search.rs b/src/api/search.rs
index 1b82ddc..5dd9896 100644
--- a/src/api/search.rs
+++ b/src/api/search.rs
@@ -34,6 +34,8 @@ pub struct SearchResult {
     pub end_time: f64,
     pub raw_text: Option<String>,
     pub summary: Option<String>,
+    /// Chunk `text_content` copied from the Postgres row, when present.
+    pub text_content: Option<String>,
     pub metadata: Option<serde_json::Value>,
     pub similarity: Option<f64>,
     pub file_name: Option<String>,
@@ -82,6 +84,7 @@ async fn enrich_from_pg(
             end_time: p.end_time,
             raw_text: None,
             summary: Some(p.summary),
+            text_content: p.text_content.clone(),
             metadata: p.metadata.clone(),
             similarity: Some(qdrant_score as f64),
             file_name: None,
@@ -109,6 +112,7 @@ fn pg_result_to_search(p: &SemanticSearchResult) -> SearchResult {
         end_time: p.end_time,
         raw_text: None,
         summary: Some(p.summary.clone()),
+        text_content: p.text_content.clone(),
         metadata: p.metadata.clone(),
         similarity: p.similarity,
         file_name: None,
@@ -381,43 +385,59 @@ pub async fn smart_search(
     let mut final_results = Vec::new();
     for mr in ranked.iter().take(limit * 3) {
         // 取更多結果以便過濾
-        if let Some(pg) = db
-            .get_chunk_by_file_and_chunk_id(&mr.file_uuid, &mr.chunk_id)
-            .await
-            .ok()
-            .flatten()
-        {
-            // 關鍵字過濾: CJK 用子字串匹配,英文用單詞邊界匹配
-            let summary_lower = pg.summary.to_lowercase();
-            let query_words: Vec<String> = query_lower
-                .split_whitespace()
-                .map(|s| s.to_string())
-                .collect();
+        // A keyword score means search_bm25 already matched the query against text_content.
+        let has_keyword_hit = mr.keyword_score.is_some();
+        let is_keyword_only = has_keyword_hit && mr.semantic_score.is_none();
+        // Keyword-only hits use the lookup without the embedding requirement.
+        let pg_opt = if is_keyword_only {
+            db.get_chunk_by_id_no_embedding(&mr.file_uuid, &mr.chunk_id).await
+        } else {
+            db.get_chunk_by_file_and_chunk_id(&mr.file_uuid, &mr.chunk_id).await
+        };
+        if let Some(pg) = pg_opt.ok().flatten() {
+            // Only hits without a keyword score are re-checked against the summary text.
+            // NOTE(review): semantic-only hits used to be kept even without a literal
+            // match; confirm that dropping them here is intended.
+            if !has_keyword_hit {
+                // Keyword filter: substring match for CJK, word-boundary match otherwise.
+                let summary_lower = pg.summary.to_lowercase();
+                let query_words: Vec<String> = query_lower
+                    .split_whitespace()
+                    .map(|s| s.to_string())
+                    .collect();
 
-            let text_match = !pg.summary.is_empty() && {
-                let has_cjk = |s: &str| -> bool {
-                    s.chars().any(|c| {
-                        ('\u{4E00}'..='\u{9FFF}').contains(&c)
-                            || ('\u{3040}'..='\u{309F}').contains(&c)
-                            || ('\u{30A0}'..='\u{30FF}').contains(&c)
-                            || ('\u{AC00}'..='\u{D7AF}').contains(&c)
-                    })
+                let text_match = !pg.summary.is_empty() && {
+                    let has_cjk = |s: &str| -> bool {
+                        s.chars().any(|c| {
+                            ('\u{4E00}'..='\u{9FFF}').contains(&c)
+                                || ('\u{3040}'..='\u{309F}').contains(&c)
+                                || ('\u{30A0}'..='\u{30FF}').contains(&c)
+                                || ('\u{AC00}'..='\u{D7AF}').contains(&c)
+                        })
+                    };
+
+                    if has_cjk(&query_lower) || has_cjk(&summary_lower) {
+                        query_words.iter().all(|w| summary_lower.contains(w))
+                    } else {
+                        let bordered = format!(" {} ", summary_lower);
+                        query_words
+                            .iter()
+                            .all(|w| bordered.contains(&format!(" {} ", w)))
+                    }
                 };
 
-                if has_cjk(&query_lower) || has_cjk(&summary_lower) {
-                    query_words.iter().all(|w| summary_lower.contains(w))
-                } else {
-                    let bordered = format!(" {} ", summary_lower);
-                    query_words
-                        .iter()
-                        .all(|w| bordered.contains(&format!(" {} ", w)))
+                if !text_match {
+                    continue;
                 }
-            };
-
-            if !text_match && mr.semantic_score.is_none() {
-                continue;
             }
 
+            // Fall back to text_content when the summary is empty.
+            let display_text = if pg.summary.is_empty() {
+                pg.text_content.clone().unwrap_or_default()
+            } else {
+                pg.summary.clone()
+            };
+
             final_results.push(SearchResult {
                 id: 0,
                 file_uuid: pg.file_uuid.clone(),
@@ -430,6 +450,7 @@ pub async fn smart_search(
                 end_time: pg.end_time,
                 raw_text: None,
-                summary: Some(pg.summary),
+                summary: Some(display_text),
+                text_content: pg.text_content.clone(),
                 metadata: pg.metadata.clone(),
                 similarity: Some(mr.score),
                 file_name: None,
diff --git a/src/core/db/postgres_db.rs b/src/core/db/postgres_db.rs
index 8c72783..b3129d7 100644
--- a/src/core/db/postgres_db.rs
+++ b/src/core/db/postgres_db.rs
@@ -832,6 +832,8 @@ pub struct SemanticSearchResult {
     pub start_time: f64,
     pub end_time: f64,
     pub summary: String,
+    /// Value of the `text_content` column; queries that build this struct must select it.
+    pub text_content: Option<String>,
     pub metadata: Option<serde_json::Value>,
     pub similarity: Option<f64>,
 }
@@ -2552,6 +2554,7 @@ impl PostgresDb {
                  (start_time * fps)::bigint as start_frame, (end_time * fps)::bigint as end_frame, \
                  fps, start_time, end_time, \
                  COALESCE(summary_text, text_content, '') as summary, \
+                 text_content, \
                  metadata, \
                  1.0::float8 as similarity \
                  FROM {} \
@@ -2568,6 +2571,46 @@ impl PostgresDb {
         Ok(results)
     }
 
+    /// Looks up a single chunk row by `file_uuid` and `chunk_id`.
+    ///
+    /// Intended for keyword-search hits: the `WHERE` clause filters on the two ids only,
+    /// with no embedding condition.
+    ///
+    /// Returns `Ok(None)` when no row matches.
+    ///
+    /// # Errors
+    ///
+    /// Propagates any error returned by sqlx while running the query.
+    pub async fn get_chunk_by_id_no_embedding(
+        &self,
+        file_uuid: &str,
+        chunk_id: &str,
+    ) -> Result<Option<SemanticSearchResult>> {
+        let chunk_table = schema::table_name("chunk");
+        let row = sqlx::query_as::<_, SemanticSearchResult>(
+            &format!(
+                "SELECT \
+                 id, file_uuid, id as scene_order, \
+                 (start_time * fps)::bigint as start_frame, (end_time * fps)::bigint as end_frame, \
+                 fps, start_time, end_time, \
+                 COALESCE(summary_text, text_content, '') as summary, \
+                 text_content, \
+                 metadata, \
+                 1.0::float8 as similarity \
+                 FROM {} \
+                 WHERE file_uuid = $1 AND chunk_id = $2 \
+                 LIMIT 1",
+                chunk_table
+            ),
+        )
+        .bind(file_uuid)
+        .bind(chunk_id)
+        .fetch_optional(&self.pool)
+        .await?;
+
+        Ok(row)
+    }
+
     /// Get children for a list of parent IDs
     pub async fn get_children_for_parents(
         &self,
@@ -3339,22 +3382,46 @@ impl PostgresDb {
-        let like = format!("%{}%", query.replace('%', "%%"));
+        // Escape LIKE metacharacters (backslash is the default escape) so the query matches literally.
+        let like = format!("%{}%", query.replace('\\', "\\\\").replace('%', "\\%").replace('_', "\\_"));
         use sqlx::Row;
 
-        // Use PostgreSQL full-text search with ts_rank for ranking, fallback to ILIKE for recall
-        let sql = format!(
-            "SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, \
-             CASE \
-             WHEN to_tsvector('english', text_content) @@ plainto_tsquery('english', $1) \
-             THEN ts_rank(to_tsvector('english', text_content), plainto_tsquery('english', $1))::float8 \
-             ELSE 0.1::float8 \
-             END as score \
-             FROM {} \
-             WHERE text_content ILIKE $2 AND text_content != '' \
-             {}\
-             ORDER BY score DESC \
-             LIMIT $3",
-            table,
-            if file_uuid.is_some() { "AND file_uuid = $4 " } else { "" }
-        );
+        // Detect CJK ideographs, hiragana, katakana or hangul syllables in the query.
+        let has_cjk = query.chars().any(|c| {
+            ('\u{4E00}'..='\u{9FFF}').contains(&c)
+                || ('\u{3040}'..='\u{309F}').contains(&c)
+                || ('\u{30A0}'..='\u{30FF}').contains(&c)
+                || ('\u{AC00}'..='\u{D7AF}').contains(&c)
+        });
+
+        let sql = if has_cjk {
+            // CJK query: rank ILIKE matches by how early the query appears in text_content.
+            format!(
+                "SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, \
+                 (1.0 - (POSITION(LOWER($1) IN LOWER(text_content))::float8 / NULLIF(LENGTH(text_content), 0)::float8))::float8 as score \
+                 FROM {} \
+                 WHERE text_content ILIKE $2 AND text_content != '' \
+                 {}\
+                 ORDER BY score DESC \
+                 LIMIT $3",
+                table,
+                if file_uuid.is_some() { "AND file_uuid = $4 " } else { "" }
+            )
+        } else {
+            // Non-CJK query: rank with ts_rank when the full-text query matches, else a flat 0.1.
+            format!(
+                "SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, \
+                 CASE \
+                 WHEN to_tsvector('english', text_content) @@ plainto_tsquery('english', $1) \
+                 THEN ts_rank(to_tsvector('english', text_content), plainto_tsquery('english', $1))::float8 \
+                 ELSE 0.1::float8 \
+                 END as score \
+                 FROM {} \
+                 WHERE text_content ILIKE $2 AND text_content != '' \
+                 {}\
+                 ORDER BY score DESC \
+                 LIMIT $3",
+                table,
+                if file_uuid.is_some() { "AND file_uuid = $4 " } else { "" }
+            )
+        };
 
         let rows = if let Some(u) = file_uuid {
             sqlx::query(&sql)