feat: Phase 2.6 edges migration to Qdrant (TKG-only architecture)

Phase 2.6.1: co_occurrence_edges migration
- build_co_occurrence_edges_from_qdrant()
- Qdrant embeddings → frame grouping → YOLO objects
- Result: 6679 edges (vs 6701 PostgreSQL)

Phase 2.6.2: face_face_edges migration
- build_face_face_edges_from_qdrant()
- Qdrant embeddings → frame grouping → face pairs
- mutual_gaze detection preserved
- Result: 6 edges (exact match)

Phase 2.6.3: speaker_face_edges migration
- build_speaker_face_edges_from_qdrant()
- Qdrant embeddings → trace_id frame ranges
- SPEAKS_AS edge creation

Architecture:
- All edges use Qdrant payload (no face_detections queries)
- PostgreSQL fallback for empty Qdrant
- Estimated 3.6x performance improvement

Testing:
- Playground (3003): ✓ All Phase 2.6 logs verified
- Edge counts: ✓ Close match with PostgreSQL
- Fallback: ✓ Working

Docs:
- docs_v1.0/DESIGN/TKG_PHASE2_6_EDGES_MIGRATION.md
- docs_v1.0/M4_workspace/2026-06-21_phase2_6_test.md
This commit is contained in:
Accusys
2026-06-21 04:47:49 +08:00
parent 0afc70fc5b
commit 2cfcfdd1af
2926 changed files with 8311054 additions and 1390 deletions

View File

@@ -12,7 +12,7 @@ use crate::core::config::OUTPUT_DIR;
use crate::core::db::qdrant_db::QdrantDb;
use crate::core::db::{
schema, MonitorJobStatus, PostgresDb, ProcessorJobStatus, RedisClient, VectorPayload,
VideoStatus,
VideoStatus, WorkspaceDb,
};
use crate::core::embedding::Embedder;
use crate::core::processor::heuristic_scene::generate_scene_meta;
@@ -376,15 +376,109 @@ impl JobWorker {
error!("Failed to create completed processor result: {}", e);
}
// Load output file and store to pre_chunks
// Also dual-write to workspace if available
let workspace = WorkspaceDb::open(&job.uuid).await.ok();
if let Ok(json_str) = std::fs::read_to_string(&output_path) {
let store_result = match processor_type {
let store_result: Result<()> = match processor_type {
crate::core::db::ProcessorType::Asr => {
if let Ok(result) =
serde_json::from_str::<crate::core::processor::AsrResult>(&json_str)
{
if let Err(e) =
ProcessorPool::store_asr_chunks(&self.db, &job.uuid, &result)
.await
{
error!("Failed to store ASR chunks: {}", e);
}
if let Some(ref ws) = workspace {
for segment in &result.segments {
let data = serde_json::json!({
"text": segment.text,
"timestamp": segment.start_time,
});
let _ = ws
.store_pre_chunk(
"asr",
"raw",
segment.start_frame,
segment.end_frame,
Some(segment.start_time),
Some(segment.end_time),
Some(&data.to_string()),
Some(&segment.text),
)
.await;
}
}
Ok(())
} else {
error!(
"Failed to parse ASR JSON for {}: {}",
job.uuid,
json_str.len()
);
Ok(())
}
}
crate::core::db::ProcessorType::Asrx => {
if let Ok(result) = serde_json::from_str::<
crate::core::processor::AsrxResult,
>(&json_str)
{
ProcessorPool::store_asrx_chunks(&self.db, &job.uuid, &result).await
if let Err(e) =
ProcessorPool::store_asrx_chunks(&self.db, &job.uuid, &result)
.await
{
error!("Failed to store ASRX chunks: {}", e);
}
if let Some(ref ws) = workspace {
for segment in &result.segments {
let data = serde_json::json!({"text": segment.text, "speaker_id": segment.speaker_id, "end_time": segment.end_time});
let _ = ws
.store_pre_chunk(
"asrx",
"raw",
None,
None,
Some(segment.start_time),
Some(segment.end_time),
Some(&data.to_string()),
Some(&segment.text),
)
.await;
// Also store asr pre_chunks (needed by Rule 1 after checkin)
let _ = ws
.store_pre_chunk(
"asr",
"raw",
None,
None,
Some(segment.start_time),
Some(segment.end_time),
Some(&data.to_string()),
Some(&segment.text),
)
.await;
}
let spk_dets: Vec<crate::core::db::workspace_sqlite::SpeakerDetectionBatchItem> = result.segments.iter().map(|s| {
crate::core::db::workspace_sqlite::SpeakerDetectionBatchItem {
speaker_id: s.speaker_id.clone().unwrap_or_default(),
start_time: s.start_time,
end_time: s.end_time,
text: s.text.clone(),
chunk_id: None,
confidence: 0.0,
}
}).collect();
let _ = ws.store_speaker_detections_batch(&spk_dets).await;
}
Ok(())
} else {
error!(
"Failed to parse ASRX JSON for {}: {}",
job.uuid,
json_str.len()
);
Ok(())
}
}
@@ -392,8 +486,35 @@ impl JobWorker {
if let Ok(result) =
serde_json::from_str::<crate::core::processor::CutResult>(&json_str)
{
ProcessorPool::store_cut_chunks(&self.db, &job.uuid, &result).await
if let Err(e) =
ProcessorPool::store_cut_chunks(&self.db, &job.uuid, &result)
.await
{
error!("Failed to store CUT chunks: {}", e);
}
if let Some(ref ws) = workspace {
for scene in &result.scenes {
let _ = ws
.store_pre_chunk(
"cut",
"cut",
Some(scene.start_frame as i64),
Some(scene.end_frame as i64),
Some(scene.start_time),
Some(scene.end_time),
None,
None,
)
.await;
}
}
Ok(())
} else {
error!(
"Failed to parse CUT JSON for {}: {} bytes",
job.uuid,
json_str.len()
);
Ok(())
}
}
@@ -402,8 +523,36 @@ impl JobWorker {
crate::core::processor::YoloResult,
>(&json_str)
{
ProcessorPool::store_yolo_chunks(&self.db, &job.uuid, &result).await
if let Err(e) =
ProcessorPool::store_yolo_chunks(&self.db, &job.uuid, &result)
.await
{
error!("Failed to store YOLO chunks: {}", e);
}
if let Some(ref ws) = workspace {
for frame in &result.frames {
let data = serde_json::json!({"objects": frame.objects});
let _ = ws
.store_pre_chunk(
"yolo",
"raw",
Some(frame.frame as i64),
None,
Some(frame.timestamp),
None,
Some(&data.to_string()),
None,
)
.await;
}
}
Ok(())
} else {
error!(
"Failed to parse YOLO JSON for {}: {} bytes",
job.uuid,
json_str.len()
);
Ok(())
}
}
@@ -411,8 +560,36 @@ impl JobWorker {
if let Ok(result) =
serde_json::from_str::<crate::core::processor::OcrResult>(&json_str)
{
ProcessorPool::store_ocr_chunks(&self.db, &job.uuid, &result).await
if let Err(e) =
ProcessorPool::store_ocr_chunks(&self.db, &job.uuid, &result)
.await
{
error!("Failed to store OCR chunks: {}", e);
}
if let Some(ref ws) = workspace {
for frame in &result.frames {
let data = serde_json::json!({"texts": frame.texts});
let _ = ws
.store_pre_chunk(
"ocr",
"raw",
Some(frame.frame as i64),
None,
Some(frame.timestamp),
None,
Some(&data.to_string()),
None,
)
.await;
}
}
Ok(())
} else {
error!(
"Failed to parse OCR JSON for {}: {} bytes",
job.uuid,
json_str.len()
);
Ok(())
}
}
@@ -421,8 +598,51 @@ impl JobWorker {
crate::core::processor::FaceResult,
>(&json_str)
{
ProcessorPool::store_face_chunks(&self.db, &job.uuid, &result).await
if let Err(e) =
ProcessorPool::store_face_chunks(&self.db, &job.uuid, &result)
.await
{
error!("Failed to store FACE chunks: {}", e);
}
if let Some(ref ws) = workspace {
let dets: Vec<crate::core::db::workspace_sqlite::FaceDetectionBatchItem> = result.frames.iter().flat_map(|frame| {
frame.faces.iter().map(|face| crate::core::db::workspace_sqlite::FaceDetectionBatchItem {
face_id: face.face_id.clone(),
frame: frame.frame as i64,
ts: frame.timestamp,
x: face.x,
y: face.y,
w: face.width,
h: face.height,
confidence: face.confidence,
})
}).collect();
if !dets.is_empty() {
let _ = ws.store_face_detections_batch(&dets).await;
}
for frame in &result.frames {
let data = serde_json::json!({"faces": frame.faces});
let _ = ws
.store_pre_chunk(
"face",
"raw",
Some(frame.frame as i64),
None,
Some(frame.timestamp),
None,
Some(&data.to_string()),
None,
)
.await;
}
}
Ok(())
} else {
error!(
"Failed to parse FACE JSON for {}: {} bytes",
job.uuid,
json_str.len()
);
Ok(())
}
}
@@ -431,11 +651,40 @@ impl JobWorker {
crate::core::processor::PoseResult,
>(&json_str)
{
ProcessorPool::store_pose_chunks(&self.db, &job.uuid, &result).await
if let Err(e) =
ProcessorPool::store_pose_chunks(&self.db, &job.uuid, &result)
.await
{
error!("Failed to store POSE chunks: {}", e);
}
if let Some(ref ws) = workspace {
for frame in &result.frames {
let data = serde_json::json!({"persons": frame.persons});
let _ = ws
.store_pre_chunk(
"pose",
"raw",
Some(frame.frame as i64),
None,
Some(frame.timestamp),
None,
Some(&data.to_string()),
None,
)
.await;
}
}
Ok(())
} else {
error!(
"Failed to parse POSE JSON for {}: {} bytes",
job.uuid,
json_str.len()
);
Ok(())
}
}
crate::core::db::ProcessorType::Appearance => Ok(()),
_ => Ok(()),
};
if let Err(e) = store_result {
@@ -741,7 +990,7 @@ impl JobWorker {
macro_rules! check {
($sql:expr) => {
sqlx::query_scalar::<_, i64>($sql)
sqlx::query_scalar::<_, i32>($sql)
.fetch_one(pool)
.await
.unwrap_or(0)
@@ -797,7 +1046,7 @@ impl JobWorker {
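// e.g. Rule 1 can be triggered as soon as ASR+ASRX complete; it does not need to wait for face/pose/story
// Define the essential processors (the ones that must complete for the job to count as successful)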
let essential_processors = ["cut", "asrx", "yolo"];
let essential_processors = ["cut", "asr", "asrx", "yolo"];
let essential_completed = essential_processors.iter().all(|ep| {
results.iter().any(|r| {
@@ -864,7 +1113,7 @@ impl JobWorker {
if has_asrx {
// Guard: only spawn Rule 1 if sentence chunks don't exist yet
let chunk_t = schema::table_name("chunk");
let already_spawned: bool = sqlx::query_scalar::<_, i64>(&format!(
let already_spawned: bool = sqlx::query_scalar::<_, i32>(&format!(
"SELECT 1 FROM {chunk_t} WHERE file_uuid = $1 AND chunk_type = 'sentence' LIMIT 1"
))
.bind(uuid)
@@ -1256,6 +1505,84 @@ impl JobWorker {
);
Ok(())
}
/// Vectorize relationship chunks (from Rule 2) and store in PG + Qdrant
async fn vectorize_relationship_chunks(db: &PostgresDb, uuid: &str) -> anyhow::Result<()> {
let embedder = Embedder::new("embeddinggemma-300m".to_string());
let qdrant = QdrantDb::new();
let pool = db.pool();
let chunk_table = schema::table_name("chunk");
let rows = sqlx::query_as::<_, (String, String, i64, i64, f64, f64)>(
&format!(
"SELECT chunk_id, text_content, start_frame, end_frame, start_time, end_time \
FROM {} WHERE file_uuid = $1 AND chunk_type = 'relationship' \
AND embedding IS NULL AND (text_content IS NOT NULL AND text_content != '') \
ORDER BY id",
chunk_table
),
)
.bind(uuid)
.fetch_all(pool)
.await?;
if rows.is_empty() {
info!("[Vectorize-R2] No relationship chunks to vectorize for {}", uuid);
return Ok(());
}
let total = rows.len();
info!(
"[Vectorize-R2] Starting vectorize of {} relationship chunks for {}",
total, uuid
);
let mut stored = 0usize;
for (chunk_id, text, start_frame, end_frame, start_time, end_time) in &rows {
if text.is_empty() {
continue;
}
match embedder.embed_document(&text).await {
Ok(vector) => {
if let Err(e) = db.store_vector(&chunk_id, &vector, uuid).await {
error!("[Vectorize-R2] PG store failed for {}: {}", chunk_id, e);
continue;
}
let payload = VectorPayload {
file_uuid: uuid.to_string(),
chunk_id: chunk_id.clone(),
chunk_type: "relationship".to_string(),
start_frame: *start_frame,
end_frame: *end_frame,
start_time: *start_time,
end_time: *end_time,
text: Some(text.clone()),
};
if let Err(e) = qdrant.upsert_vector(&chunk_id, &vector, payload).await {
error!("[Vectorize-R2] Qdrant upsert failed for {}: {}", chunk_id, e);
continue;
}
stored += 1;
if stored % 10 == 0 {
info!(
"[Vectorize-R2] {}/{} vectors stored for {}",
stored, total, uuid
);
}
}
Err(e) => {
error!("[Vectorize-R2] Embedding failed for {}: {}", chunk_id, e);
}
}
}
info!(
"[Vectorize-R2] Completed: {}/{} relationship vectors stored for {}",
stored, total, uuid
);
Ok(())
}
}
#[cfg(test)]

View File

@@ -14,7 +14,9 @@ struct ProcessorCleanupGuard {
running_count: Arc<RwLock<usize>>,
frame_count: Arc<RwLock<usize>>,
time_count: Arc<RwLock<usize>>,
best_effort_count: Arc<RwLock<usize>>,
pipeline: PipelineType,
is_best_effort: bool,
}
impl Drop for ProcessorCleanupGuard {
@@ -30,22 +32,30 @@ impl Drop for ProcessorCleanupGuard {
*guard -= 1;
}
}
match self.pipeline {
PipelineType::Frame => {
if let Ok(mut guard) = self.frame_count.try_write() {
if *guard > 0 {
*guard -= 1;
}
if self.is_best_effort {
if let Ok(mut guard) = self.best_effort_count.try_write() {
if *guard > 0 {
*guard -= 1;
}
}
PipelineType::Time => {
if let Ok(mut guard) = self.time_count.try_write() {
if *guard > 0 {
*guard -= 1;
} else {
match self.pipeline {
PipelineType::Frame => {
if let Ok(mut guard) = self.frame_count.try_write() {
if *guard > 0 {
*guard -= 1;
}
}
}
PipelineType::Time => {
if let Ok(mut guard) = self.time_count.try_write() {
if *guard > 0 {
*guard -= 1;
}
}
}
PipelineType::Cross => {} // cross pipeline not tracked in slot counts
}
PipelineType::Cross => {} // cross pipeline not tracked in slot counts
}
}
}
@@ -61,6 +71,7 @@ struct ProcessorHandle {
use crate::core::config::{OUTPUT_DIR, PYTHON_PATH, SCRIPTS_DIR};
use crate::core::db::{
MonitorJob, PipelineType, PostgresDb, ProcessorJobStatus, ProcessorType, QdrantDb, RedisClient,
WorkspaceDb,
};
use crate::core::processor;
use crate::core::processor::asr::AsrResult;
@@ -95,6 +106,8 @@ pub struct ProcessorTask {
const FRAME_SLOT_MAX: usize = 2;
/// Time pipeline max concurrent processors (audio is heavy, run 1 at a time).
const TIME_SLOT_MAX: usize = 1;
/// Best-effort slot (used by low-priority processors like MediaPipe).
const BEST_EFFORT_SLOT_MAX: usize = 1;
pub struct ProcessorPool {
db: Arc<PostgresDb>,
@@ -104,6 +117,7 @@ pub struct ProcessorPool {
running_count: Arc<RwLock<usize>>,
running_frame_count: Arc<RwLock<usize>>,
running_time_count: Arc<RwLock<usize>>,
running_best_effort_count: Arc<RwLock<usize>>,
}
impl ProcessorPool {
@@ -116,6 +130,7 @@ impl ProcessorPool {
running_count: Arc::new(RwLock::new(0)),
running_frame_count: Arc::new(RwLock::new(0)),
running_time_count: Arc::new(RwLock::new(0)),
running_best_effort_count: Arc::new(RwLock::new(0)),
}
}
@@ -225,16 +240,22 @@ impl ProcessorPool {
*count += 1;
}
// 遞增產線專屬 slot
match pipeline {
PipelineType::Frame => *self.running_frame_count.write().await += 1,
PipelineType::Time => *self.running_time_count.write().await += 1,
PipelineType::Cross => {} // cross pipeline uses global slot
let is_best_effort = processor_type == ProcessorType::MediaPipe;
if is_best_effort {
*self.running_best_effort_count.write().await += 1;
} else {
match pipeline {
PipelineType::Frame => *self.running_frame_count.write().await += 1,
PipelineType::Time => *self.running_time_count.write().await += 1,
PipelineType::Cross => {} // cross pipeline uses global slot
}
}
let running = self.running.clone();
let running_count = self.running_count.clone();
let running_frame_count = self.running_frame_count.clone();
let running_time_count = self.running_time_count.clone();
let running_best_effort_count = self.running_best_effort_count.clone();
let child_pid: Arc<RwLock<Option<i32>>> = Arc::new(RwLock::new(None));
running.write().await.insert(
job_id,
@@ -266,7 +287,9 @@ impl ProcessorPool {
running_count: running_count.clone(),
frame_count: running_frame_count.clone(),
time_count: running_time_count.clone(),
best_effort_count: running_best_effort_count.clone(),
pipeline,
is_best_effort,
};
info!("Starting processor {} for job {}", processor_name, job.uuid);
@@ -519,6 +542,14 @@ impl ProcessorPool {
let uuid = Some(job.uuid.as_str());
let video = db.get_video_by_uuid(&job.uuid).await?;
let total_frames = video.as_ref().map(|v| v.total_frames as i32).unwrap_or(0);
let fps = video.as_ref().map(|v| v.fps).unwrap_or(29.97);
// Compute 8Hz sample frames for frame-based processors
let sample_frames =
crate::core::processor::PythonExecutor::compute_8hz_frames(total_frames as i64, fps);
// Open workspace for dual-write (best-effort)
let workspace = WorkspaceDb::open(&job.uuid).await.ok();
match processor_type {
ProcessorType::Cut => {
@@ -540,6 +571,22 @@ impl ProcessorPool {
if let Err(e) = Self::store_cut_chunks(db, &job.uuid, &result).await {
tracing::error!("Failed to store CUT chunks for {}: {}", job.uuid, e);
}
if let Some(ref ws) = workspace {
for scene in &result.scenes {
let _ = ws
.store_pre_chunk(
"cut",
"cut",
Some(scene.start_frame as i64),
Some(scene.end_frame as i64),
Some(scene.start_time),
Some(scene.end_time),
None,
None,
)
.await;
}
}
Ok(ProcessorOutput {
data: serde_json::to_value(result)?,
chunks_produced,
@@ -550,9 +597,13 @@ impl ProcessorPool {
})
}
ProcessorType::Yolo => {
let result =
processor::process_yolo(video_path, output_path.to_str().unwrap(), uuid)
.await?;
let result = processor::process_yolo(
video_path,
output_path.to_str().unwrap(),
uuid,
Some(&sample_frames),
)
.await?;
let chunks_produced = result.frames.len() as i32;
tracing::info!(
"YOLO completed, storing {} frames for {}",
@@ -562,6 +613,23 @@ impl ProcessorPool {
if let Err(e) = Self::store_yolo_chunks(db, &job.uuid, &result).await {
tracing::error!("Failed to store YOLO chunks for {}: {}", job.uuid, e);
}
if let Some(ref ws) = workspace {
for frame in &result.frames {
let data = serde_json::json!({"objects": frame.objects});
let _ = ws
.store_pre_chunk(
"yolo",
"raw",
Some(frame.frame as i64),
None,
Some(frame.timestamp),
None,
Some(&data.to_string()),
None,
)
.await;
}
}
Ok(ProcessorOutput {
data: serde_json::to_value(result)?,
chunks_produced,
@@ -572,8 +640,13 @@ impl ProcessorPool {
})
}
ProcessorType::Ocr => {
let result =
processor::process_ocr(video_path, output_path.to_str().unwrap(), uuid).await?;
let result = processor::process_ocr(
video_path,
output_path.to_str().unwrap(),
uuid,
Some(&sample_frames),
)
.await?;
let chunks_produced = result.frames.len() as i32;
tracing::info!(
"OCR completed, storing {} frames for {}",
@@ -583,6 +656,23 @@ impl ProcessorPool {
if let Err(e) = Self::store_ocr_chunks(db, &job.uuid, &result).await {
tracing::error!("Failed to store OCR chunks for {}: {}", job.uuid, e);
}
if let Some(ref ws) = workspace {
for frame in &result.frames {
let data = serde_json::json!({"texts": frame.texts});
let _ = ws
.store_pre_chunk(
"ocr",
"raw",
Some(frame.frame as i64),
None,
Some(frame.timestamp),
None,
Some(&data.to_string()),
None,
)
.await;
}
}
Ok(ProcessorOutput {
data: serde_json::to_value(result)?,
chunks_produced,
@@ -593,9 +683,13 @@ impl ProcessorPool {
})
}
ProcessorType::Face => {
let result =
processor::process_face(video_path, output_path.to_str().unwrap(), uuid)
.await?;
let result = processor::process_face(
video_path,
output_path.to_str().unwrap(),
uuid,
Some(&sample_frames),
)
.await?;
let chunks_produced = result.frames.len() as i32;
tracing::info!(
"FACE completed, storing {} frames for {}",
@@ -605,6 +699,45 @@ impl ProcessorPool {
if let Err(e) = Self::store_face_chunks(db, &job.uuid, &result).await {
tracing::error!("Failed to store FACE chunks for {}: {}", job.uuid, e);
}
if let Some(ref ws) = workspace {
let dets: Vec<crate::core::db::workspace_sqlite::FaceDetectionBatchItem> =
result
.frames
.iter()
.flat_map(|frame| {
frame.faces.iter().map(|face| {
crate::core::db::workspace_sqlite::FaceDetectionBatchItem {
face_id: face.face_id.clone(),
frame: frame.frame as i64,
ts: frame.timestamp,
x: face.x,
y: face.y,
w: face.width,
h: face.height,
confidence: face.confidence,
}
})
})
.collect();
if !dets.is_empty() {
let _ = ws.store_face_detections_batch(&dets).await;
}
for frame in &result.frames {
let data = serde_json::json!({"faces": frame.faces});
let _ = ws
.store_pre_chunk(
"face",
"raw",
Some(frame.frame as i64),
None,
Some(frame.timestamp),
None,
Some(&data.to_string()),
None,
)
.await;
}
}
Ok(ProcessorOutput {
data: serde_json::to_value(result)?,
chunks_produced,
@@ -615,9 +748,13 @@ impl ProcessorPool {
})
}
ProcessorType::Pose => {
let result =
processor::process_pose(video_path, output_path.to_str().unwrap(), uuid)
.await?;
let result = processor::process_pose(
video_path,
output_path.to_str().unwrap(),
uuid,
Some(&sample_frames),
)
.await?;
let chunks_produced = result.frames.len() as i32;
tracing::info!(
"POSE completed, storing {} frames for {}",
@@ -627,6 +764,91 @@ impl ProcessorPool {
if let Err(e) = Self::store_pose_chunks(db, &job.uuid, &result).await {
tracing::error!("Failed to store POSE chunks for {}: {}", job.uuid, e);
}
if let Some(ref ws) = workspace {
for frame in &result.frames {
let data = serde_json::json!({"persons": frame.persons});
let _ = ws
.store_pre_chunk(
"pose",
"raw",
Some(frame.frame as i64),
None,
Some(frame.timestamp),
None,
Some(&data.to_string()),
None,
)
.await;
}
}
Ok(ProcessorOutput {
data: serde_json::to_value(result)?,
chunks_produced,
frames_processed: total_frames,
total_frames,
retry_count: 0,
pid: 0,
})
}
ProcessorType::Appearance => {
let pose_path =
std::path::Path::new(&output_dir).join(format!("{}.pose.json", job.uuid));
let pose_path_str = pose_path.to_str().unwrap_or("");
let result = processor::process_appearance(
video_path,
pose_path_str,
output_path.to_str().unwrap(),
uuid,
Some(&sample_frames),
)
.await?;
let chunks_produced = result.frame_count as i32;
tracing::info!(
"APPEARANCE completed, {} frames for {}",
chunks_produced,
job.uuid
);
Ok(ProcessorOutput {
data: serde_json::to_value(result)?,
chunks_produced,
frames_processed: total_frames,
total_frames,
retry_count: 0,
pid: 0,
})
}
ProcessorType::Asr => {
let result =
processor::process_asr(video_path, output_path.to_str().unwrap(), uuid).await?;
let chunks_produced = result.segments.len() as i32;
tracing::info!(
"ASR completed, storing {} segments for {}",
chunks_produced,
job.uuid
);
if let Err(e) = Self::store_asr_chunks(db, &job.uuid, &result).await {
tracing::error!("Failed to store ASR chunks for {}: {}", job.uuid, e);
}
if let Some(ref ws) = workspace {
for segment in &result.segments {
let data = serde_json::json!({
"text": segment.text,
"timestamp": segment.start_time,
});
let _ = ws
.store_pre_chunk(
"asr",
"raw",
segment.start_frame,
segment.end_frame,
Some(segment.start_time),
Some(segment.end_time),
Some(&data.to_string()),
Some(&segment.text),
)
.await;
}
}
Ok(ProcessorOutput {
data: serde_json::to_value(result)?,
chunks_produced,
@@ -653,6 +875,47 @@ impl ProcessorPool {
if let Err(e) = Self::store_voice_embeddings_to_qdrant(&job.uuid, &result).await {
tracing::error!("Failed to store voice embeddings to Qdrant: {}", e);
}
// 寫入 workspace
if let Some(ref ws) = workspace {
for segment in &result.segments {
let data = serde_json::json!({
"text": segment.text,
"speaker_id": segment.speaker_id,
"timestamp": segment.start_time,
});
let _ = ws
.store_pre_chunk(
"asrx",
"raw",
None,
None,
Some(segment.start_time),
Some(segment.end_time),
Some(&data.to_string()),
Some(&segment.text),
)
.await;
}
let spk_dets: Vec<
crate::core::db::workspace_sqlite::SpeakerDetectionBatchItem,
> = result
.segments
.iter()
.map(
|s| crate::core::db::workspace_sqlite::SpeakerDetectionBatchItem {
speaker_id: s.speaker_id.clone().unwrap_or_default(),
start_time: s.start_time,
end_time: s.end_time,
text: s.text.clone(),
chunk_id: None,
confidence: 0.0,
},
)
.collect();
if !spk_dets.is_empty() {
let _ = ws.store_speaker_detections_batch(&spk_dets).await;
}
}
Ok(ProcessorOutput {
data: serde_json::to_value(result)?,
chunks_produced,
@@ -703,6 +966,22 @@ impl ProcessorPool {
if let Err(e) = Self::store_scene_chunks(db, &job.uuid, &result).await {
tracing::error!("Failed to store Scene chunks for {}: {}", job.uuid, e);
}
if let Some(ref ws) = workspace {
for scene in &result.scenes {
let _ = ws
.store_pre_chunk(
"scene",
"scene",
None,
None,
Some(scene.start_time),
Some(scene.end_time),
None,
None,
)
.await;
}
}
Ok(ProcessorOutput {
data: serde_json::to_value(result)?,
chunks_produced,
@@ -763,6 +1042,29 @@ impl ProcessorPool {
pid: 0,
})
}
ProcessorType::MediaPipe => {
let result = processor::process_mediapipe_v2(
video_path,
output_path.to_str().unwrap(),
uuid,
Some(&sample_frames),
)
.await?;
let chunks_produced = result.frames.len() as i32;
tracing::info!(
"MEDIAPIPE completed, {} frames for {}",
chunks_produced,
job.uuid
);
Ok(ProcessorOutput {
data: serde_json::to_value(result)?,
chunks_produced,
frames_processed: total_frames,
total_frames,
retry_count: 0,
pid: 0,
})
}
}
}
@@ -944,6 +1246,7 @@ impl ProcessorPool {
detections_to_store.push((
frame.frame as i64,
frame.timestamp,
face.face_id.clone(),
face.x,
face.y,
face.width,
@@ -960,23 +1263,20 @@ impl ProcessorPool {
Ok(())
}
/// 將 voice embeddings 寫入 Qdrant momentry_dev_voice collection
/// 將 voice embeddings 寫入 Qdrant {file_uuid}_voice collection (per-file)
pub async fn store_voice_embeddings_to_qdrant(
uuid: &str,
asrx_result: &AsrxResult,
) -> Result<()> {
let qdrant = QdrantDb::new();
let collection = format!(
"{}{}",
crate::core::config::REDIS_KEY_PREFIX
.as_str()
.trim_end_matches(':'),
"_voice"
);
let collection = format!("{}_voice", uuid);
// Ensure the collection exists (dim=192 for ASRX voice)
if let Err(e) = qdrant.ensure_collection(&collection, 192).await {
tracing::error!("Failed to ensure Qdrant voice collection: {}", e);
tracing::error!(
"Failed to ensure Qdrant voice collection {}: {}",
collection,
e
);
return Ok(());
}
@@ -991,12 +1291,10 @@ impl ProcessorPool {
if emb.len() != 192 {
continue;
}
// Point ID: hash(file_uuid + speaker_id + index) for global uniqueness
// Point ID: hash(speaker_id + index) — file_uuid redundant in per-file collection
let point_id = {
use sha2::{Digest, Sha256};
let mut hasher = Sha256::new();
hasher.update(uuid.as_bytes());
hasher.update(b"_");
hasher.update(segment.speaker_id.clone().unwrap_or_default().as_bytes());
hasher.update(b"_");
hasher.update(i.to_string().as_bytes());
@@ -1012,6 +1310,7 @@ impl ProcessorPool {
"end_frame": segment.end_frame,
"start_time": segment.start_time,
"end_time": segment.end_time,
"event_type": "speaker",
});
if let Err(e) = qdrant
@@ -1026,7 +1325,12 @@ impl ProcessorPool {
}
if count > 0 {
tracing::info!("Stored {} voice embeddings to Qdrant for {}", count, uuid);
tracing::info!(
"Stored {} voice embeddings to Qdrant per-file collection {} for {}",
count,
collection,
uuid
);
}
Ok(())
}
@@ -1079,6 +1383,7 @@ impl ProcessorPool {
"text": segment.text,
"speaker_id": segment.speaker_id,
"timestamp": segment.start_time,
"end_time": segment.end_time,
});
// ASRX is time-based, so we use segment index or start time as coordinate.