fix: keyword search - add text_content field and CJK support

- Added text_content field to SearchResult and SemanticSearchResult
- Added get_chunk_by_id_no_embedding for keyword results without embedding requirement
- Fixed search_bm25 to use position-based ranking for CJK/Korean content
- Fixed sqlx column mapping with explicit alias
- Skip text_match filter for keyword-only results
- Use text_content as fallback when summary is empty
2026-07-02 21:16:38 +08:00
parent 5a9d4325d8
commit 78364afc51
2 changed files with 119 additions and 47 deletions
@@ -34,6 +34,7 @@ pub struct SearchResult {
    pub end_time: f64,
    pub raw_text: Option<String>,
    pub summary: Option<String>,
+    pub text_content: Option<String>,
    pub metadata: Option<serde_json::Value>,
    pub similarity: Option<f64>,
    pub file_name: Option<String>,
@@ -82,6 +83,7 @@ async fn enrich_from_pg(
            end_time: p.end_time,
            raw_text: None,
            summary: Some(p.summary),
+            text_content: p.text_content.clone(),
            metadata: p.metadata.clone(),
            similarity: Some(qdrant_score as f64),
            file_name: None,
@@ -109,6 +111,7 @@ fn pg_result_to_search(p: &SemanticSearchResult) -> SearchResult {
        end_time: p.end_time,
        raw_text: None,
        summary: Some(p.summary.clone()),
+        text_content: p.text_content.clone(),
        metadata: p.metadata.clone(),
        similarity: p.similarity,
        file_name: None,
@@ -381,43 +384,55 @@ pub async fn smart_search(
    let mut final_results = Vec::new();
    for mr in ranked.iter().take(limit * 3) {
        // 取更多結果以便過濾
-        if let Some(pg) = db
-            .get_chunk_by_file_and_chunk_id(&mr.file_uuid, &mr.chunk_id)
-            .await
-            .ok()
-            .flatten()
-        {
-            // 關鍵字過濾: CJK 用子字串匹配，英文用單詞邊界匹配
-            let summary_lower = pg.summary.to_lowercase();
-            let query_words: Vec<String> = query_lower
-                .split_whitespace()
-                .map(|s| s.to_string())
-                .collect();
+        // Use no_embedding version for keyword results, regular for semantic
+        let pg_opt = if mr.keyword_score.is_some() && mr.semantic_score.is_none() {
+            db.get_chunk_by_id_no_embedding(&mr.file_uuid, &mr.chunk_id).await
+        } else {
+            db.get_chunk_by_file_and_chunk_id(&mr.file_uuid, &mr.chunk_id).await
+        };
+        if let Some(pg) = pg_opt.ok().flatten() {
+            // 關鍵字結果跳過 text_match 過濾（search_bm25 已經匹配過）
+            let is_keyword_only = mr.keyword_score.is_some() && mr.semantic_score.is_none();
+            if !is_keyword_only {
+                // 關鍵字過濾: CJK 用子字串匹配，英文用單詞邊界匹配
+                let summary_lower = pg.summary.to_lowercase();
+                let query_words: Vec<String> = query_lower
+                    .split_whitespace()
+                    .map(|s| s.to_string())
+                    .collect();

-            let text_match = !pg.summary.is_empty() && {
-                let has_cjk = |s: &str| -> bool {
-                    s.chars().any(|c| {
-                        ('\u{4E00}'..='\u{9FFF}').contains(&c)
-                            || ('\u{3040}'..='\u{309F}').contains(&c)
-                            || ('\u{30A0}'..='\u{30FF}').contains(&c)
-                            || ('\u{AC00}'..='\u{D7AF}').contains(&c)
-                    })
+                let text_match = !pg.summary.is_empty() && {
+                    let has_cjk = |s: &str| -> bool {
+                        s.chars().any(|c| {
+                            ('\u{4E00}'..='\u{9FFF}').contains(&c)
+                                || ('\u{3040}'..='\u{309F}').contains(&c)
+                                || ('\u{30A0}'..='\u{30FF}').contains(&c)
+                                || ('\u{AC00}'..='\u{D7AF}').contains(&c)
+                        })
+                    };
+
+                    if has_cjk(&query_lower) || has_cjk(&summary_lower) {
+                        query_words.iter().all(|w| summary_lower.contains(w))
+                    } else {
+                        let bordered = format!(" {} ", summary_lower);
+                        query_words
+                            .iter()
+                            .all(|w| bordered.contains(&format!(" {} ", w)))
+                    }
                };

-                if has_cjk(&query_lower) || has_cjk(&summary_lower) {
-                    query_words.iter().all(|w| summary_lower.contains(w))
-                } else {
-                    let bordered = format!(" {} ", summary_lower);
-                    query_words
-                        .iter()
-                        .all(|w| bordered.contains(&format!(" {} ", w)))
+                if !text_match {
+                    continue;
                }
-            };
-
-            if !text_match && mr.semantic_score.is_none() {
-                continue;
            }

+            // 使用 text_content 如果 summary 為空
+            let display_text = if pg.summary.is_empty() {
+                pg.text_content.clone().unwrap_or_default()
+            } else {
+                pg.summary.clone()
+            };
+
            final_results.push(SearchResult {
                id: 0,
                file_uuid: pg.file_uuid.clone(),
@@ -430,6 +445,7 @@ pub async fn smart_search(
                end_time: pg.end_time,
                raw_text: None,
                summary: Some(pg.summary),
+                text_content: pg.text_content.clone(),
                metadata: pg.metadata.clone(),
                similarity: Some(mr.score),
                file_name: None,
@@ -832,6 +832,7 @@ pub struct SemanticSearchResult {
    pub start_time: f64,
    pub end_time: f64,
    pub summary: String,
+    pub text_content: Option<String>,
    pub metadata: Option<serde_json::Value>,
    pub similarity: Option<f64>,
 }
@@ -2552,6 +2553,7 @@ impl PostgresDb {
                     (start_time * fps)::bigint as start_frame, (end_time * fps)::bigint as end_frame, \
                     fps, start_time, end_time, \
                     COALESCE(summary_text, text_content, '') as summary, \
+                     text_content as text_content, \
                     metadata, \
                     1.0::float8 as similarity \
                 FROM {} \
@@ -2568,6 +2570,37 @@ impl PostgresDb {
        Ok(results)
    }

+    /// Get chunk by file_uuid and chunk_id WITHOUT embedding requirement (for keyword search)
+    pub async fn get_chunk_by_id_no_embedding(
+        &self,
+        file_uuid: &str,
+        chunk_id: &str,
+    ) -> Result<Option<SemanticSearchResult>> {
+        let chunk_table = schema::table_name("chunk");
+        let results = sqlx::query_as::<_, SemanticSearchResult>(
+            &format!(
+                "SELECT \
+                     id, file_uuid, id as scene_order, \
+                     (start_time * fps)::bigint as start_frame, (end_time * fps)::bigint as end_frame, \
+                     fps, start_time, end_time, \
+                     COALESCE(summary_text, text_content, '') as summary, \
+                     text_content as text_content, \
+                     metadata, \
+                     1.0::float8 as similarity \
+                 FROM {} \
+                 WHERE file_uuid = $1 AND chunk_id = $2 \
+                 LIMIT 1",
+                chunk_table
+            ),
+        )
+        .bind(file_uuid)
+        .bind(chunk_id)
+        .fetch_optional(&self.pool)
+        .await?;
+
+        Ok(results)
+    }
+
    /// Get children for a list of parent IDs
    pub async fn get_children_for_parents(
        &self,
@@ -3339,22 +3372,45 @@ impl PostgresDb {
        let like = format!("%{}%", query.replace('%', "%%"));
        use sqlx::Row;

-        // Use PostgreSQL full-text search with ts_rank for ranking, fallback to ILIKE for recall
-        let sql = format!(
-            "SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, \
-             CASE \
-                  WHEN to_tsvector('english', text_content) @@ plainto_tsquery('english', $1) \
-                  THEN ts_rank(to_tsvector('english', text_content), plainto_tsquery('english', $1))::float8 \
-                  ELSE 0.1::float8 \
-              END as score \
-             FROM {} \
-             WHERE text_content ILIKE $2 AND text_content != '' \
-             {}\
-             ORDER BY score DESC \
-             LIMIT $3",
-            table,
-            if file_uuid.is_some() { "AND file_uuid = $4 " } else { "" }
-        );
+        // Check if query contains CJK characters
+        let has_cjk = query.chars().any(|c| {
+            ('\u{4E00}'..='\u{9FFF}').contains(&c)
+                || ('\u{3040}'..='\u{309F}').contains(&c)
+                || ('\u{30A0}'..='\u{30FF}').contains(&c)
+                || ('\u{AC00}'..='\u{D7AF}').contains(&c)
+        });
+
+        let sql = if has_cjk {
+            // CJK/Korean: use ILIKE position-based ranking
+            format!(
+                "SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, \
+                 (1.0 - (POSITION(LOWER($1) IN LOWER(text_content))::float8 / NULLIF(LENGTH(text_content), 0)::float8))::float8 as score \
+                 FROM {} \
+                 WHERE text_content ILIKE $2 AND text_content != '' \
+                 {}\
+                 ORDER BY score DESC \
+                 LIMIT $3",
+                table,
+                if file_uuid.is_some() { "AND file_uuid = $4 " } else { "" }
+            )
+        } else {
+            // English: use PostgreSQL full-text search
+            format!(
+                "SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, \
+                 CASE \
+                      WHEN to_tsvector('english', text_content) @@ plainto_tsquery('english', $1) \
+                      THEN ts_rank(to_tsvector('english', text_content), plainto_tsquery('english', $1))::float8 \
+                      ELSE 0.1::float8 \
+                  END as score \
+                 FROM {} \
+                 WHERE text_content ILIKE $2 AND text_content != '' \
+                 {}\
+                 ORDER BY score DESC \
+                 LIMIT $3",
+                table,
+                if file_uuid.is_some() { "AND file_uuid = $4 " } else { "" }
+            )
+        };

        let rows = if let Some(u) = file_uuid {
            sqlx::query(&sql)