fix: keyword search - add text_content field and CJK support
- Added text_content field to SearchResult and SemanticSearchResult
- Added get_chunk_by_id_no_embedding for keyword results without embedding requirement
- Fixed search_bm25 to use position-based ranking for CJK/Korean content
- Fixed sqlx column mapping with explicit alias
- Skipped text_match filter for keyword-only results
- Used text_content as fallback when summary is empty
This commit is contained in:
+47
-31
@@ -34,6 +34,7 @@ pub struct SearchResult {
|
||||
pub end_time: f64,
|
||||
pub raw_text: Option<String>,
|
||||
pub summary: Option<String>,
|
||||
pub text_content: Option<String>,
|
||||
pub metadata: Option<serde_json::Value>,
|
||||
pub similarity: Option<f64>,
|
||||
pub file_name: Option<String>,
|
||||
@@ -82,6 +83,7 @@ async fn enrich_from_pg(
|
||||
end_time: p.end_time,
|
||||
raw_text: None,
|
||||
summary: Some(p.summary),
|
||||
text_content: p.text_content.clone(),
|
||||
metadata: p.metadata.clone(),
|
||||
similarity: Some(qdrant_score as f64),
|
||||
file_name: None,
|
||||
@@ -109,6 +111,7 @@ fn pg_result_to_search(p: &SemanticSearchResult) -> SearchResult {
|
||||
end_time: p.end_time,
|
||||
raw_text: None,
|
||||
summary: Some(p.summary.clone()),
|
||||
text_content: p.text_content.clone(),
|
||||
metadata: p.metadata.clone(),
|
||||
similarity: p.similarity,
|
||||
file_name: None,
|
||||
@@ -381,43 +384,55 @@ pub async fn smart_search(
|
||||
let mut final_results = Vec::new();
|
||||
for mr in ranked.iter().take(limit * 3) {
|
||||
// 取更多結果以便過濾
|
||||
if let Some(pg) = db
|
||||
.get_chunk_by_file_and_chunk_id(&mr.file_uuid, &mr.chunk_id)
|
||||
.await
|
||||
.ok()
|
||||
.flatten()
|
||||
{
|
||||
// 關鍵字過濾: CJK 用子字串匹配,英文用單詞邊界匹配
|
||||
let summary_lower = pg.summary.to_lowercase();
|
||||
let query_words: Vec<String> = query_lower
|
||||
.split_whitespace()
|
||||
.map(|s| s.to_string())
|
||||
.collect();
|
||||
// Use no_embedding version for keyword results, regular for semantic
|
||||
let pg_opt = if mr.keyword_score.is_some() && mr.semantic_score.is_none() {
|
||||
db.get_chunk_by_id_no_embedding(&mr.file_uuid, &mr.chunk_id).await
|
||||
} else {
|
||||
db.get_chunk_by_file_and_chunk_id(&mr.file_uuid, &mr.chunk_id).await
|
||||
};
|
||||
if let Some(pg) = pg_opt.ok().flatten() {
|
||||
// 關鍵字結果跳過 text_match 過濾(search_bm25 已經匹配過)
|
||||
let is_keyword_only = mr.keyword_score.is_some() && mr.semantic_score.is_none();
|
||||
if !is_keyword_only {
|
||||
// 關鍵字過濾: CJK 用子字串匹配,英文用單詞邊界匹配
|
||||
let summary_lower = pg.summary.to_lowercase();
|
||||
let query_words: Vec<String> = query_lower
|
||||
.split_whitespace()
|
||||
.map(|s| s.to_string())
|
||||
.collect();
|
||||
|
||||
let text_match = !pg.summary.is_empty() && {
|
||||
let has_cjk = |s: &str| -> bool {
|
||||
s.chars().any(|c| {
|
||||
('\u{4E00}'..='\u{9FFF}').contains(&c)
|
||||
|| ('\u{3040}'..='\u{309F}').contains(&c)
|
||||
|| ('\u{30A0}'..='\u{30FF}').contains(&c)
|
||||
|| ('\u{AC00}'..='\u{D7AF}').contains(&c)
|
||||
})
|
||||
let text_match = !pg.summary.is_empty() && {
|
||||
let has_cjk = |s: &str| -> bool {
|
||||
s.chars().any(|c| {
|
||||
('\u{4E00}'..='\u{9FFF}').contains(&c)
|
||||
|| ('\u{3040}'..='\u{309F}').contains(&c)
|
||||
|| ('\u{30A0}'..='\u{30FF}').contains(&c)
|
||||
|| ('\u{AC00}'..='\u{D7AF}').contains(&c)
|
||||
})
|
||||
};
|
||||
|
||||
if has_cjk(&query_lower) || has_cjk(&summary_lower) {
|
||||
query_words.iter().all(|w| summary_lower.contains(w))
|
||||
} else {
|
||||
let bordered = format!(" {} ", summary_lower);
|
||||
query_words
|
||||
.iter()
|
||||
.all(|w| bordered.contains(&format!(" {} ", w)))
|
||||
}
|
||||
};
|
||||
|
||||
if has_cjk(&query_lower) || has_cjk(&summary_lower) {
|
||||
query_words.iter().all(|w| summary_lower.contains(w))
|
||||
} else {
|
||||
let bordered = format!(" {} ", summary_lower);
|
||||
query_words
|
||||
.iter()
|
||||
.all(|w| bordered.contains(&format!(" {} ", w)))
|
||||
if !text_match {
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
if !text_match && mr.semantic_score.is_none() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// 使用 text_content 如果 summary 為空
|
||||
let display_text = if pg.summary.is_empty() {
|
||||
pg.text_content.clone().unwrap_or_default()
|
||||
} else {
|
||||
pg.summary.clone()
|
||||
};
|
||||
|
||||
final_results.push(SearchResult {
|
||||
id: 0,
|
||||
file_uuid: pg.file_uuid.clone(),
|
||||
@@ -430,6 +445,7 @@ pub async fn smart_search(
|
||||
end_time: pg.end_time,
|
||||
raw_text: None,
|
||||
summary: Some(pg.summary),
|
||||
text_content: pg.text_content.clone(),
|
||||
metadata: pg.metadata.clone(),
|
||||
similarity: Some(mr.score),
|
||||
file_name: None,
|
||||
|
||||
+72
-16
@@ -832,6 +832,7 @@ pub struct SemanticSearchResult {
|
||||
pub start_time: f64,
|
||||
pub end_time: f64,
|
||||
pub summary: String,
|
||||
pub text_content: Option<String>,
|
||||
pub metadata: Option<serde_json::Value>,
|
||||
pub similarity: Option<f64>,
|
||||
}
|
||||
@@ -2552,6 +2553,7 @@ impl PostgresDb {
|
||||
(start_time * fps)::bigint as start_frame, (end_time * fps)::bigint as end_frame, \
|
||||
fps, start_time, end_time, \
|
||||
COALESCE(summary_text, text_content, '') as summary, \
|
||||
text_content as text_content, \
|
||||
metadata, \
|
||||
1.0::float8 as similarity \
|
||||
FROM {} \
|
||||
@@ -2568,6 +2570,37 @@ impl PostgresDb {
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
/// Fetches a single chunk row by `file_uuid` and `chunk_id`, with no embedding
/// condition in the query (intended for keyword-search results).
///
/// The `WHERE` clause filters only on `file_uuid` and `chunk_id`. The row is
/// mapped onto `SemanticSearchResult`:
/// - `summary` is `COALESCE(summary_text, text_content, '')`;
/// - `text_content` is selected under an explicit alias of the same name;
/// - `scene_order` reuses the `id` column;
/// - `start_frame` / `end_frame` are `start_time * fps` / `end_time * fps`
///   cast to `bigint`;
/// - `similarity` is the constant `1.0`.
///
/// Returns `Ok(None)` when no row matches; query errors are propagated with `?`.
|
||||
pub async fn get_chunk_by_id_no_embedding(
|
||||
&self,
|
||||
file_uuid: &str,
|
||||
chunk_id: &str,
|
||||
) -> Result<Option<SemanticSearchResult>> {
|
||||
// Table name is resolved through the schema helper and interpolated into the SQL text.
let chunk_table = schema::table_name("chunk");
|
||||
// Only the table name is formatted into the query; the lookup values are bound as $1 / $2.
let results = sqlx::query_as::<_, SemanticSearchResult>(
|
||||
&format!(
|
||||
"SELECT \
|
||||
id, file_uuid, id as scene_order, \
|
||||
(start_time * fps)::bigint as start_frame, (end_time * fps)::bigint as end_frame, \
|
||||
fps, start_time, end_time, \
|
||||
COALESCE(summary_text, text_content, '') as summary, \
|
||||
text_content as text_content, \
|
||||
metadata, \
|
||||
1.0::float8 as similarity \
|
||||
FROM {} \
|
||||
WHERE file_uuid = $1 AND chunk_id = $2 \
|
||||
LIMIT 1",
|
||||
chunk_table
|
||||
),
|
||||
)
|
||||
// $1 = file_uuid, $2 = chunk_id (bind order matches the placeholders above).
.bind(file_uuid)
|
||||
.bind(chunk_id)
|
||||
.fetch_optional(&self.pool)
|
||||
.await?;
|
||||
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
/// Get children for a list of parent IDs
|
||||
pub async fn get_children_for_parents(
|
||||
&self,
|
||||
@@ -3339,22 +3372,45 @@ impl PostgresDb {
|
||||
let like = format!("%{}%", query.replace('%', "%%"));
|
||||
use sqlx::Row;
|
||||
|
||||
// Use PostgreSQL full-text search with ts_rank for ranking, fallback to ILIKE for recall
|
||||
let sql = format!(
|
||||
"SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, \
|
||||
CASE \
|
||||
WHEN to_tsvector('english', text_content) @@ plainto_tsquery('english', $1) \
|
||||
THEN ts_rank(to_tsvector('english', text_content), plainto_tsquery('english', $1))::float8 \
|
||||
ELSE 0.1::float8 \
|
||||
END as score \
|
||||
FROM {} \
|
||||
WHERE text_content ILIKE $2 AND text_content != '' \
|
||||
{}\
|
||||
ORDER BY score DESC \
|
||||
LIMIT $3",
|
||||
table,
|
||||
if file_uuid.is_some() { "AND file_uuid = $4 " } else { "" }
|
||||
);
|
||||
// Check if query contains CJK characters
|
||||
let has_cjk = query.chars().any(|c| {
|
||||
('\u{4E00}'..='\u{9FFF}').contains(&c)
|
||||
|| ('\u{3040}'..='\u{309F}').contains(&c)
|
||||
|| ('\u{30A0}'..='\u{30FF}').contains(&c)
|
||||
|| ('\u{AC00}'..='\u{D7AF}').contains(&c)
|
||||
});
|
||||
|
||||
let sql = if has_cjk {
|
||||
// CJK/Korean: use ILIKE position-based ranking
|
||||
format!(
|
||||
"SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, \
|
||||
(1.0 - (POSITION(LOWER($1) IN LOWER(text_content))::float8 / NULLIF(LENGTH(text_content), 0)::float8))::float8 as score \
|
||||
FROM {} \
|
||||
WHERE text_content ILIKE $2 AND text_content != '' \
|
||||
{}\
|
||||
ORDER BY score DESC \
|
||||
LIMIT $3",
|
||||
table,
|
||||
if file_uuid.is_some() { "AND file_uuid = $4 " } else { "" }
|
||||
)
|
||||
} else {
|
||||
// English: use PostgreSQL full-text search
|
||||
format!(
|
||||
"SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, \
|
||||
CASE \
|
||||
WHEN to_tsvector('english', text_content) @@ plainto_tsquery('english', $1) \
|
||||
THEN ts_rank(to_tsvector('english', text_content), plainto_tsquery('english', $1))::float8 \
|
||||
ELSE 0.1::float8 \
|
||||
END as score \
|
||||
FROM {} \
|
||||
WHERE text_content ILIKE $2 AND text_content != '' \
|
||||
{}\
|
||||
ORDER BY score DESC \
|
||||
LIMIT $3",
|
||||
table,
|
||||
if file_uuid.is_some() { "AND file_uuid = $4 " } else { "" }
|
||||
)
|
||||
};
|
||||
|
||||
let rows = if let Some(u) = file_uuid {
|
||||
sqlx::query(&sql)
|
||||
|
||||
Reference in New Issue
Block a user