feat: Phase 2.6 edges migration to Qdrant (TKG-only architecture)
Phase 2.6.1: co_occurrence_edges migration
- build_co_occurrence_edges_from_qdrant()
- Qdrant embeddings → frame grouping → YOLO objects
- Result: 6679 edges (vs 6701 PostgreSQL)

Phase 2.6.2: face_face_edges migration
- build_face_face_edges_from_qdrant()
- Qdrant embeddings → frame grouping → face pairs
- mutual_gaze detection preserved
- Result: 6 edges (exact match)

Phase 2.6.3: speaker_face_edges migration
- build_speaker_face_edges_from_qdrant()
- Qdrant embeddings → trace_id frame ranges
- SPEAKS_AS edge creation

Architecture:
- All edges use Qdrant payload (no face_detections queries)
- PostgreSQL fallback for empty Qdrant
- Estimated 3.6x performance improvement

Testing:
- Playground (3003): ✓ All Phase 2.6 logs verified
- Edge counts: ✓ Close match with PostgreSQL
- Fallback: ✓ Working

Docs:
- docs_v1.0/DESIGN/TKG_PHASE2_6_EDGES_MIGRATION.md
- docs_v1.0/M4_workspace/2026-06-21_phase2_6_test.md
This commit is contained in:
@@ -1,3 +1,7 @@
|
||||
pub mod schema;
|
||||
pub mod verifier;
|
||||
|
||||
pub use verifier::{verify_output, VerificationResult, VerifierError};
|
||||
pub use schema::{FileVerificationReport, ProcessorVerification};
|
||||
pub use verifier::{
|
||||
cleanup_temp_files, verify_file, verify_output, VerificationResult, VerifierError,
|
||||
};
|
||||
|
||||
355
src/verification/schema.rs
Normal file
355
src/verification/schema.rs
Normal file
@@ -0,0 +1,355 @@
|
||||
use crate::core::db::ProcessorType;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Required field definition for JSON schema validation
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RequiredField {
|
||||
pub path: &'static str,
|
||||
pub field_type: FieldType,
|
||||
pub allow_empty: bool,
|
||||
}
|
||||
|
||||
/// Kinds of JSON values a [`RequiredField`] can demand.
#[derive(Debug, Clone, PartialEq)]
pub enum FieldType {
    /// Any JSON number.
    Number,
    /// A JSON number that is expected to be positive.
    PositiveNumber,
    /// A JSON array.
    Array,
    /// A JSON array that is expected to hold at least one element.
    NonEmptyArray,
    /// A JSON object.
    Object,
    /// A JSON string.
    String,
    /// A field that may be absent or null.
    // NOTE(review): the schemas in this file use this variant for `language`,
    // which other code reads as a string — the name may be misleading.
    OptionalNumber,
}
|
||||
|
||||
/// Processor JSON schema: defines required fields and their types
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ProcessorJsonSchema {
|
||||
pub processor: ProcessorType,
|
||||
pub required_fields: &'static [RequiredField],
|
||||
pub min_data_threshold: usize,
|
||||
}
|
||||
|
||||
/// All processor schemas
|
||||
pub const PROCESSOR_SCHEMAS: &[ProcessorJsonSchema] = &[
|
||||
ProcessorJsonSchema {
|
||||
processor: ProcessorType::Cut,
|
||||
required_fields: &[
|
||||
RequiredField {
|
||||
path: "frame_count",
|
||||
field_type: FieldType::PositiveNumber,
|
||||
allow_empty: false,
|
||||
},
|
||||
RequiredField {
|
||||
path: "fps",
|
||||
field_type: FieldType::PositiveNumber,
|
||||
allow_empty: false,
|
||||
},
|
||||
RequiredField {
|
||||
path: "scenes",
|
||||
field_type: FieldType::NonEmptyArray,
|
||||
allow_empty: false,
|
||||
},
|
||||
],
|
||||
min_data_threshold: 1,
|
||||
},
|
||||
ProcessorJsonSchema {
|
||||
processor: ProcessorType::Yolo,
|
||||
required_fields: &[
|
||||
RequiredField {
|
||||
path: "metadata.fps",
|
||||
field_type: FieldType::PositiveNumber,
|
||||
allow_empty: false,
|
||||
},
|
||||
RequiredField {
|
||||
path: "metadata.total_frames",
|
||||
field_type: FieldType::PositiveNumber,
|
||||
allow_empty: false,
|
||||
},
|
||||
RequiredField {
|
||||
path: "frames",
|
||||
field_type: FieldType::Object,
|
||||
allow_empty: true,
|
||||
},
|
||||
],
|
||||
min_data_threshold: 0,
|
||||
},
|
||||
ProcessorJsonSchema {
|
||||
processor: ProcessorType::Ocr,
|
||||
required_fields: &[
|
||||
RequiredField {
|
||||
path: "frame_count",
|
||||
field_type: FieldType::PositiveNumber,
|
||||
allow_empty: false,
|
||||
},
|
||||
RequiredField {
|
||||
path: "fps",
|
||||
field_type: FieldType::PositiveNumber,
|
||||
allow_empty: false,
|
||||
},
|
||||
RequiredField {
|
||||
path: "frames",
|
||||
field_type: FieldType::Array,
|
||||
allow_empty: true,
|
||||
},
|
||||
],
|
||||
min_data_threshold: 0,
|
||||
},
|
||||
ProcessorJsonSchema {
|
||||
processor: ProcessorType::Face,
|
||||
required_fields: &[
|
||||
RequiredField {
|
||||
path: "frame_count",
|
||||
field_type: FieldType::PositiveNumber,
|
||||
allow_empty: false,
|
||||
},
|
||||
RequiredField {
|
||||
path: "fps",
|
||||
field_type: FieldType::PositiveNumber,
|
||||
allow_empty: false,
|
||||
},
|
||||
RequiredField {
|
||||
path: "frames",
|
||||
field_type: FieldType::Array,
|
||||
allow_empty: true,
|
||||
},
|
||||
],
|
||||
min_data_threshold: 0,
|
||||
},
|
||||
ProcessorJsonSchema {
|
||||
processor: ProcessorType::Pose,
|
||||
required_fields: &[
|
||||
RequiredField {
|
||||
path: "frame_count",
|
||||
field_type: FieldType::PositiveNumber,
|
||||
allow_empty: false,
|
||||
},
|
||||
RequiredField {
|
||||
path: "fps",
|
||||
field_type: FieldType::PositiveNumber,
|
||||
allow_empty: false,
|
||||
},
|
||||
RequiredField {
|
||||
path: "frames",
|
||||
field_type: FieldType::Array,
|
||||
allow_empty: true,
|
||||
},
|
||||
],
|
||||
min_data_threshold: 0,
|
||||
},
|
||||
ProcessorJsonSchema {
|
||||
processor: ProcessorType::Appearance,
|
||||
required_fields: &[
|
||||
RequiredField {
|
||||
path: "frame_count",
|
||||
field_type: FieldType::PositiveNumber,
|
||||
allow_empty: false,
|
||||
},
|
||||
RequiredField {
|
||||
path: "fps",
|
||||
field_type: FieldType::PositiveNumber,
|
||||
allow_empty: false,
|
||||
},
|
||||
RequiredField {
|
||||
path: "frames",
|
||||
field_type: FieldType::Array,
|
||||
allow_empty: true,
|
||||
},
|
||||
],
|
||||
min_data_threshold: 0,
|
||||
},
|
||||
ProcessorJsonSchema {
|
||||
processor: ProcessorType::Asr,
|
||||
required_fields: &[
|
||||
RequiredField {
|
||||
path: "language",
|
||||
field_type: FieldType::OptionalNumber,
|
||||
allow_empty: true,
|
||||
},
|
||||
RequiredField {
|
||||
path: "segments",
|
||||
field_type: FieldType::Array,
|
||||
allow_empty: true,
|
||||
},
|
||||
],
|
||||
min_data_threshold: 0,
|
||||
},
|
||||
ProcessorJsonSchema {
|
||||
processor: ProcessorType::Asrx,
|
||||
required_fields: &[
|
||||
RequiredField {
|
||||
path: "language",
|
||||
field_type: FieldType::OptionalNumber,
|
||||
allow_empty: true,
|
||||
},
|
||||
RequiredField {
|
||||
path: "segments",
|
||||
field_type: FieldType::Array,
|
||||
allow_empty: true,
|
||||
},
|
||||
],
|
||||
min_data_threshold: 0,
|
||||
},
|
||||
ProcessorJsonSchema {
|
||||
processor: ProcessorType::Scene,
|
||||
required_fields: &[
|
||||
RequiredField {
|
||||
path: "frame_count",
|
||||
field_type: FieldType::PositiveNumber,
|
||||
allow_empty: false,
|
||||
},
|
||||
RequiredField {
|
||||
path: "fps",
|
||||
field_type: FieldType::PositiveNumber,
|
||||
allow_empty: false,
|
||||
},
|
||||
RequiredField {
|
||||
path: "scenes",
|
||||
field_type: FieldType::NonEmptyArray,
|
||||
allow_empty: false,
|
||||
},
|
||||
],
|
||||
min_data_threshold: 1,
|
||||
},
|
||||
ProcessorJsonSchema {
|
||||
processor: ProcessorType::Story,
|
||||
required_fields: &[
|
||||
RequiredField {
|
||||
path: "child_chunks",
|
||||
field_type: FieldType::Array,
|
||||
allow_empty: true,
|
||||
},
|
||||
RequiredField {
|
||||
path: "parent_chunks",
|
||||
field_type: FieldType::Array,
|
||||
allow_empty: true,
|
||||
},
|
||||
RequiredField {
|
||||
path: "stats",
|
||||
field_type: FieldType::Object,
|
||||
allow_empty: false,
|
||||
},
|
||||
],
|
||||
min_data_threshold: 0,
|
||||
},
|
||||
ProcessorJsonSchema {
|
||||
processor: ProcessorType::MediaPipe,
|
||||
required_fields: &[
|
||||
RequiredField {
|
||||
path: "frame_count",
|
||||
field_type: FieldType::PositiveNumber,
|
||||
allow_empty: false,
|
||||
},
|
||||
RequiredField {
|
||||
path: "fps",
|
||||
field_type: FieldType::PositiveNumber,
|
||||
allow_empty: false,
|
||||
},
|
||||
RequiredField {
|
||||
path: "frames",
|
||||
field_type: FieldType::Array,
|
||||
allow_empty: true,
|
||||
},
|
||||
],
|
||||
min_data_threshold: 0,
|
||||
},
|
||||
];
|
||||
|
||||
/// Get schema for a processor
|
||||
pub fn get_schema(processor: &ProcessorType) -> Option<&'static ProcessorJsonSchema> {
|
||||
PROCESSOR_SCHEMAS.iter().find(|s| s.processor == *processor)
|
||||
}
|
||||
|
||||
/// Verification result for a single processor
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ProcessorVerification {
|
||||
pub processor: String,
|
||||
pub file_exists: bool,
|
||||
pub valid_json: bool,
|
||||
pub completeness: bool,
|
||||
pub dependency_ok: bool,
|
||||
pub reasonableness: bool,
|
||||
pub trust_level: String,
|
||||
pub issues: Vec<String>,
|
||||
pub data_summary: serde_json::Value,
|
||||
}
|
||||
|
||||
impl ProcessorVerification {
|
||||
pub fn new(processor: &str) -> Self {
|
||||
Self {
|
||||
processor: processor.to_string(),
|
||||
file_exists: false,
|
||||
valid_json: false,
|
||||
completeness: false,
|
||||
dependency_ok: true,
|
||||
reasonableness: true,
|
||||
trust_level: "untrusted".to_string(),
|
||||
issues: Vec::new(),
|
||||
data_summary: serde_json::json!({}),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn update_trust_level(&mut self) {
|
||||
if self.file_exists
|
||||
&& self.valid_json
|
||||
&& self.completeness
|
||||
&& self.dependency_ok
|
||||
&& self.reasonableness
|
||||
{
|
||||
self.trust_level = "trusted".to_string();
|
||||
} else if self.file_exists && self.valid_json && self.completeness && !self.dependency_ok {
|
||||
self.trust_level = "degraded".to_string();
|
||||
} else if self.file_exists
|
||||
&& self.valid_json
|
||||
&& self.completeness
|
||||
&& self.dependency_ok
|
||||
&& !self.reasonableness
|
||||
{
|
||||
self.trust_level = "suspicious".to_string();
|
||||
} else {
|
||||
self.trust_level = "untrusted".to_string();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Overall file verification report
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct FileVerificationReport {
|
||||
pub file_uuid: String,
|
||||
pub trust_level: String,
|
||||
pub processors: Vec<ProcessorVerification>,
|
||||
pub summary: serde_json::Value,
|
||||
}
|
||||
|
||||
impl FileVerificationReport {
|
||||
pub fn update_overall_trust(&mut self) {
|
||||
let levels: Vec<&str> = self
|
||||
.processors
|
||||
.iter()
|
||||
.map(|p| p.trust_level.as_str())
|
||||
.collect();
|
||||
|
||||
self.trust_level = if levels.is_empty() {
|
||||
"untrusted".to_string()
|
||||
} else if levels.iter().all(|&l| l == "trusted") {
|
||||
"trusted".to_string()
|
||||
} else if levels.iter().all(|&l| l == "trusted" || l == "degraded") {
|
||||
"degraded".to_string()
|
||||
} else if levels.iter().any(|&l| l == "suspicious") {
|
||||
"suspicious".to_string()
|
||||
} else {
|
||||
"untrusted".to_string()
|
||||
};
|
||||
|
||||
let trusted = levels.iter().filter(|&&l| l == "trusted").count();
|
||||
let degraded = levels.iter().filter(|&&l| l == "degraded").count();
|
||||
let suspicious = levels.iter().filter(|&&l| l == "suspicious").count();
|
||||
let untrusted = levels.iter().filter(|&&l| l == "untrusted").count();
|
||||
|
||||
self.summary = serde_json::json!({
|
||||
"total": levels.len(),
|
||||
"trusted": trusted,
|
||||
"degraded": degraded,
|
||||
"suspicious": suspicious,
|
||||
"untrusted": untrusted
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -1,8 +1,13 @@
|
||||
use crate::core::config::OUTPUT_DIR;
|
||||
use crate::core::db::ProcessorType;
|
||||
use anyhow::Result;
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
use tracing::info;
|
||||
|
||||
use super::schema::{
|
||||
get_schema, FieldType, FileVerificationReport, ProcessorJsonSchema, ProcessorVerification,
|
||||
RequiredField,
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct VerificationResult {
|
||||
@@ -37,6 +42,601 @@ pub struct VerifierError {
|
||||
pub reason: String,
|
||||
}
|
||||
|
||||
/// Resolve file_uuid (supports short prefix)
|
||||
fn resolve_uuid(file_uuid: &str) -> String {
|
||||
if file_uuid.len() == 32 {
|
||||
file_uuid.to_string()
|
||||
} else {
|
||||
// Try to find full UUID by scanning output directory
|
||||
let prefix = file_uuid;
|
||||
if let Ok(entries) = std::fs::read_dir(OUTPUT_DIR.as_str()) {
|
||||
for entry in entries.flatten() {
|
||||
if let Some(name) = entry.file_name().to_str() {
|
||||
if name.starts_with(prefix) && name.ends_with(".probe.json") {
|
||||
return name.split('.').next().unwrap_or(prefix).to_string();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
file_uuid.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
/// Layer 1: Check JSON structure and data completeness
|
||||
fn check_completeness(processor: &ProcessorType, value: &serde_json::Value) -> (bool, Vec<String>) {
|
||||
let schema = match get_schema(processor) {
|
||||
Some(s) => s,
|
||||
None => return (true, Vec::new()), // No schema = pass
|
||||
};
|
||||
|
||||
let mut issues = Vec::new();
|
||||
|
||||
for field in schema.required_fields {
|
||||
if let Some(val) = get_value_at_path(value, field.path) {
|
||||
match field.field_type {
|
||||
FieldType::Number | FieldType::PositiveNumber => {
|
||||
if val.as_f64().is_none() {
|
||||
issues.push(format!("'{}' is not a number", field.path));
|
||||
}
|
||||
if field.field_type == FieldType::PositiveNumber && val.as_f64() == Some(0.0) {
|
||||
issues.push(format!("'{}' is zero (expected positive)", field.path));
|
||||
}
|
||||
}
|
||||
FieldType::Array | FieldType::NonEmptyArray => {
|
||||
if let Some(arr) = val.as_array() {
|
||||
if arr.is_empty() && !field.allow_empty {
|
||||
issues.push(format!("'{}' is empty", field.path));
|
||||
}
|
||||
} else {
|
||||
issues.push(format!("'{}' is not an array", field.path));
|
||||
}
|
||||
}
|
||||
FieldType::Object => {
|
||||
if val.as_object().is_none() {
|
||||
issues.push(format!("'{}' is not an object", field.path));
|
||||
}
|
||||
}
|
||||
FieldType::String => {
|
||||
if val.as_str().is_none() {
|
||||
issues.push(format!("'{}' is not a string", field.path));
|
||||
}
|
||||
}
|
||||
FieldType::OptionalNumber => {
|
||||
// Optional, skip if null/missing
|
||||
}
|
||||
}
|
||||
} else if !field.allow_empty || field.field_type != FieldType::OptionalNumber {
|
||||
issues.push(format!("'{}' is missing", field.path));
|
||||
}
|
||||
}
|
||||
|
||||
// Check data threshold
|
||||
let data_count = count_data_items(processor, value);
|
||||
if data_count < schema.min_data_threshold {
|
||||
issues.push(format!(
|
||||
"data count {} below minimum threshold {}",
|
||||
data_count, schema.min_data_threshold
|
||||
));
|
||||
}
|
||||
|
||||
(issues.is_empty(), issues)
|
||||
}
|
||||
|
||||
/// Extract data count from JSON based on processor type
|
||||
fn count_data_items(processor: &ProcessorType, value: &serde_json::Value) -> usize {
|
||||
match processor {
|
||||
ProcessorType::Cut => value
|
||||
.get("scenes")
|
||||
.and_then(|v| v.as_array())
|
||||
.map(|a| a.len())
|
||||
.unwrap_or(0),
|
||||
ProcessorType::Yolo => {
|
||||
// YOLO uses dict-based frames
|
||||
value
|
||||
.get("frames")
|
||||
.and_then(|v| v.as_object())
|
||||
.map(|o| o.len())
|
||||
.unwrap_or(0)
|
||||
}
|
||||
ProcessorType::Ocr
|
||||
| ProcessorType::Face
|
||||
| ProcessorType::Pose
|
||||
| ProcessorType::Appearance => value
|
||||
.get("frames")
|
||||
.and_then(|v| v.as_array())
|
||||
.map(|a| a.len())
|
||||
.unwrap_or(0),
|
||||
ProcessorType::Asr => value
|
||||
.get("segments")
|
||||
.and_then(|v| v.as_array())
|
||||
.map(|a| a.len())
|
||||
.unwrap_or(0),
|
||||
ProcessorType::Asrx => value
|
||||
.get("segments")
|
||||
.and_then(|v| v.as_array())
|
||||
.map(|a| a.len())
|
||||
.unwrap_or(0),
|
||||
ProcessorType::Scene => value
|
||||
.get("scenes")
|
||||
.and_then(|v| v.as_array())
|
||||
.map(|a| a.len())
|
||||
.unwrap_or(0),
|
||||
ProcessorType::Story => {
|
||||
let child = value
|
||||
.get("child_chunks")
|
||||
.and_then(|v| v.as_array())
|
||||
.map(|a| a.len())
|
||||
.unwrap_or(0);
|
||||
let parent = value
|
||||
.get("parent_chunks")
|
||||
.and_then(|v| v.as_array())
|
||||
.map(|a| a.len())
|
||||
.unwrap_or(0);
|
||||
child + parent
|
||||
}
|
||||
ProcessorType::MediaPipe => value
|
||||
.get("frames")
|
||||
.and_then(|v| v.as_array())
|
||||
.map(|a| a.len())
|
||||
.unwrap_or(0),
|
||||
_ => 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get value at a JSON path (e.g., "stats.total_child_chunks")
|
||||
fn get_value_at_path<'a>(
|
||||
value: &'a serde_json::Value,
|
||||
path: &str,
|
||||
) -> Option<&'a serde_json::Value> {
|
||||
let parts: Vec<&str> = path.split('.').collect();
|
||||
let mut current = value;
|
||||
for part in parts {
|
||||
current = current.get(part)?;
|
||||
}
|
||||
Some(current)
|
||||
}
|
||||
|
||||
/// Layer 2: Check dependency completeness
|
||||
fn check_dependencies(
|
||||
processor: &ProcessorType,
|
||||
all_results: &HashMap<String, &ProcessorVerification>,
|
||||
) -> (bool, Vec<String>) {
|
||||
let deps = processor.dependencies();
|
||||
let mut issues = Vec::new();
|
||||
|
||||
if deps.is_empty() {
|
||||
return (true, Vec::new());
|
||||
}
|
||||
|
||||
for dep in &deps {
|
||||
let dep_name = dep.as_str();
|
||||
match all_results.get(dep_name) {
|
||||
Some(dep_result) => {
|
||||
if !dep_result.file_exists || !dep_result.valid_json {
|
||||
issues.push(format!("dependency '{}' missing or invalid", dep_name));
|
||||
} else if !dep_result.completeness {
|
||||
issues.push(format!("dependency '{}' incomplete", dep_name));
|
||||
}
|
||||
// Note: trust_level not checked here as it's updated after this function runs
|
||||
}
|
||||
None => {
|
||||
issues.push(format!("dependency '{}' not found", dep_name));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
(issues.is_empty(), issues)
|
||||
}
|
||||
|
||||
/// Layer 3: Cross-JSON reasonableness checks
|
||||
fn check_reasonableness(
|
||||
processor: &ProcessorType,
|
||||
value: &serde_json::Value,
|
||||
probe_value: Option<&serde_json::Value>,
|
||||
all_values: &HashMap<String, &serde_json::Value>,
|
||||
) -> (bool, Vec<String>) {
|
||||
let mut issues = Vec::new();
|
||||
|
||||
// Get probe data if available
|
||||
let probe_fps = probe_value
|
||||
.and_then(|p| p.get("streams"))
|
||||
.and_then(|s| s.as_array())
|
||||
.and_then(|streams| {
|
||||
streams
|
||||
.iter()
|
||||
.find(|s| s.get("codec_type").and_then(|c| c.as_str()) == Some("video"))
|
||||
})
|
||||
.and_then(|v| v.get("r_frame_rate"))
|
||||
.and_then(|r| r.as_str())
|
||||
.and_then(|fps_str| {
|
||||
if let Some((num, den)) = fps_str.split_once('/') {
|
||||
if let (Ok(n), Ok(d)) = (num.parse::<f64>(), den.parse::<f64>()) {
|
||||
if d > 0.0 {
|
||||
return Some(n / d);
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
});
|
||||
|
||||
let probe_frames = probe_value
|
||||
.and_then(|p| p.get("streams"))
|
||||
.and_then(|s| s.as_array())
|
||||
.and_then(|streams| {
|
||||
streams
|
||||
.iter()
|
||||
.find(|s| s.get("codec_type").and_then(|c| c.as_str()) == Some("video"))
|
||||
})
|
||||
.and_then(|v| v.get("nb_frames"))
|
||||
.and_then(|n| n.as_str())
|
||||
.and_then(|s| s.parse::<u64>().ok());
|
||||
|
||||
// Check fps consistency with probe
|
||||
if let Some(json_fps) = value.get("fps").and_then(|v| v.as_f64()) {
|
||||
if json_fps <= 0.0 {
|
||||
issues.push("fps is zero or negative".to_string());
|
||||
} else if let Some(p_fps) = probe_fps {
|
||||
let diff = (json_fps - p_fps).abs();
|
||||
if diff > 0.5 {
|
||||
issues.push(format!(
|
||||
"fps mismatch: JSON={}, probe={:.2}",
|
||||
json_fps, p_fps
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check frame_count consistency with probe
|
||||
// For sampled processors (8Hz), frame_count should be ~total_frames/8
|
||||
// Only flag if the count is wildly off (less than 10% of expected)
|
||||
if let Some(json_frames) = value.get("frame_count").and_then(|v| v.as_u64()) {
|
||||
if let Some(p_frames) = probe_frames {
|
||||
// Check if this is a sampled processor (most frame processors use 8Hz)
|
||||
let is_sampled = matches!(
|
||||
processor,
|
||||
ProcessorType::Cut
|
||||
| ProcessorType::Yolo
|
||||
| ProcessorType::Ocr
|
||||
| ProcessorType::Face
|
||||
| ProcessorType::Pose
|
||||
| ProcessorType::Appearance
|
||||
| ProcessorType::Scene
|
||||
);
|
||||
let expected = if is_sampled {
|
||||
(p_frames as f64 / 8.0) as u64
|
||||
} else {
|
||||
p_frames
|
||||
};
|
||||
// Allow 50% tolerance for sampling variations
|
||||
let min_expected = (expected as f64 * 0.1) as u64;
|
||||
if json_frames > 0 && json_frames < min_expected && min_expected > 0 {
|
||||
issues.push(format!(
|
||||
"frame_count {} much less than expected ~{} (probe={})",
|
||||
json_frames, expected, p_frames
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Story-specific: check chunk count vs cut scene count
|
||||
if *processor == ProcessorType::Story {
|
||||
if let Some(cut_value) = all_values.get("cut") {
|
||||
let story_chunks = count_data_items(processor, value);
|
||||
let cut_scenes = count_data_items(&ProcessorType::Cut, cut_value);
|
||||
if story_chunks > 0 && cut_scenes > 0 {
|
||||
// Story chunks should be >= cut scenes (one chunk per scene minimum)
|
||||
if story_chunks < cut_scenes / 2 {
|
||||
issues.push(format!(
|
||||
"story chunk count ({}) much less than cut scene count ({})",
|
||||
story_chunks, cut_scenes
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ASR-specific: check segments vs cut scenes
|
||||
if *processor == ProcessorType::Asr {
|
||||
if let Some(cut_value) = all_values.get("cut") {
|
||||
let asr_segments = count_data_items(processor, value);
|
||||
let cut_scenes = count_data_items(&ProcessorType::Cut, cut_value);
|
||||
if asr_segments == 0 && cut_scenes > 5 {
|
||||
issues.push(format!(
|
||||
"ASR has 0 segments but CUT has {} scenes",
|
||||
cut_scenes
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ASRX-specific: check segments vs cut scenes
|
||||
if *processor == ProcessorType::Asrx {
|
||||
if let Some(cut_value) = all_values.get("cut") {
|
||||
let asrx_segments = count_data_items(processor, value);
|
||||
let cut_scenes = count_data_items(&ProcessorType::Cut, cut_value);
|
||||
// Only flag if CUT has many scenes but ASRX has none (likely a processing issue)
|
||||
if asrx_segments == 0 && cut_scenes > 5 {
|
||||
issues.push(format!(
|
||||
"ASRX has 0 segments but CUT has {} scenes",
|
||||
cut_scenes
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check scene time ranges
|
||||
if *processor == ProcessorType::Cut || *processor == ProcessorType::Scene {
|
||||
if let Some(scenes) = value.get("scenes").and_then(|v| v.as_array()) {
|
||||
for (i, scene) in scenes.iter().enumerate() {
|
||||
let start = scene.get("start_time").and_then(|v| v.as_f64());
|
||||
let end = scene.get("end_time").and_then(|v| v.as_f64());
|
||||
if let (Some(s), Some(e)) = (start, end) {
|
||||
if e < s {
|
||||
issues.push(format!("scene {}: end_time < start_time", i));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
(issues.is_empty(), issues)
|
||||
}
|
||||
|
||||
/// Build data summary for a processor JSON
|
||||
fn build_data_summary(processor: &ProcessorType, value: &serde_json::Value) -> serde_json::Value {
|
||||
let data_count = count_data_items(processor, value);
|
||||
|
||||
let mut summary = serde_json::json!({
|
||||
"data_count": data_count
|
||||
});
|
||||
|
||||
match processor {
|
||||
ProcessorType::Cut => {
|
||||
if let Some(scenes) = value.get("scenes").and_then(|v| v.as_array()) {
|
||||
summary["scene_count"] = serde_json::json!(scenes.len());
|
||||
if let Some(first) = scenes.first() {
|
||||
summary["first_scene_start"] =
|
||||
first.get("start_time").and_then(|v| v.as_f64()).into();
|
||||
}
|
||||
if let Some(last) = scenes.last() {
|
||||
summary["last_scene_end"] =
|
||||
last.get("end_time").and_then(|v| v.as_f64()).into();
|
||||
}
|
||||
}
|
||||
}
|
||||
ProcessorType::Face
|
||||
| ProcessorType::Ocr
|
||||
| ProcessorType::Pose
|
||||
| ProcessorType::Appearance => {
|
||||
if let Some(frames) = value.get("frames").and_then(|v| v.as_array()) {
|
||||
let total_detections: usize = frames
|
||||
.iter()
|
||||
.map(|f| {
|
||||
f.get("faces")
|
||||
.and_then(|v| v.as_array())
|
||||
.map(|a| a.len())
|
||||
.unwrap_or(0)
|
||||
+ f.get("objects")
|
||||
.and_then(|v| v.as_array())
|
||||
.map(|a| a.len())
|
||||
.unwrap_or(0)
|
||||
+ f.get("texts")
|
||||
.and_then(|v| v.as_array())
|
||||
.map(|a| a.len())
|
||||
.unwrap_or(0)
|
||||
+ f.get("persons")
|
||||
.and_then(|v| v.as_array())
|
||||
.map(|a| a.len())
|
||||
.unwrap_or(0)
|
||||
})
|
||||
.sum();
|
||||
summary["total_detections"] = serde_json::json!(total_detections);
|
||||
summary["frames_with_data"] = serde_json::json!(frames
|
||||
.iter()
|
||||
.filter(|f| {
|
||||
f.get("faces")
|
||||
.and_then(|v| v.as_array())
|
||||
.map(|a| !a.is_empty())
|
||||
.unwrap_or(false)
|
||||
|| f.get("objects")
|
||||
.and_then(|v| v.as_array())
|
||||
.map(|a| !a.is_empty())
|
||||
.unwrap_or(false)
|
||||
|| f.get("texts")
|
||||
.and_then(|v| v.as_array())
|
||||
.map(|a| !a.is_empty())
|
||||
.unwrap_or(false)
|
||||
|| f.get("persons")
|
||||
.and_then(|v| v.as_array())
|
||||
.map(|a| !a.is_empty())
|
||||
.unwrap_or(false)
|
||||
})
|
||||
.count());
|
||||
}
|
||||
}
|
||||
ProcessorType::Yolo => {
|
||||
if let Some(frames) = value.get("frames").and_then(|v| v.as_object()) {
|
||||
let total_detections: usize = frames
|
||||
.values()
|
||||
.map(|f| {
|
||||
f.get("objects")
|
||||
.and_then(|v| v.as_array())
|
||||
.map(|a| a.len())
|
||||
.unwrap_or(0)
|
||||
})
|
||||
.sum();
|
||||
summary["total_detections"] = serde_json::json!(total_detections);
|
||||
summary["frames_with_data"] = serde_json::json!(frames
|
||||
.values()
|
||||
.filter(|f| {
|
||||
f.get("objects")
|
||||
.and_then(|v| v.as_array())
|
||||
.map(|a| !a.is_empty())
|
||||
.unwrap_or(false)
|
||||
})
|
||||
.count());
|
||||
}
|
||||
}
|
||||
ProcessorType::Asr => {
|
||||
if let Some(segments) = value.get("segments").and_then(|v| v.as_array()) {
|
||||
summary["segment_count"] = serde_json::json!(segments.len());
|
||||
if let Some(lang) = value.get("language").and_then(|v| v.as_str()) {
|
||||
summary["language"] = serde_json::json!(lang);
|
||||
}
|
||||
}
|
||||
}
|
||||
ProcessorType::Asrx => {
|
||||
if let Some(segments) = value.get("segments").and_then(|v| v.as_array()) {
|
||||
let speakers: std::collections::HashSet<String> = segments
|
||||
.iter()
|
||||
.filter_map(|s| {
|
||||
s.get("speaker_id")
|
||||
.and_then(|v| v.as_str())
|
||||
.map(|s| s.to_string())
|
||||
})
|
||||
.collect();
|
||||
summary["segment_count"] = serde_json::json!(segments.len());
|
||||
summary["speaker_count"] = serde_json::json!(speakers.len());
|
||||
}
|
||||
}
|
||||
ProcessorType::Story => {
|
||||
if let Some(stats) = value.get("stats") {
|
||||
summary["stats"] = stats.clone();
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
summary
|
||||
}
|
||||
|
||||
/// Load probe.json for a file
|
||||
fn load_probe_json(file_uuid: &str) -> Option<serde_json::Value> {
|
||||
let probe_path = PathBuf::from(OUTPUT_DIR.as_str()).join(format!("{}.probe.json", file_uuid));
|
||||
if let Ok(content) = std::fs::read_to_string(&probe_path) {
|
||||
serde_json::from_str(&content).ok()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Main verification function for a file
|
||||
pub fn verify_file(file_uuid: &str) -> FileVerificationReport {
|
||||
let full_uuid = resolve_uuid(file_uuid);
|
||||
let processors = ProcessorType::all();
|
||||
let mut report = FileVerificationReport {
|
||||
file_uuid: full_uuid.clone(),
|
||||
trust_level: "untrusted".to_string(),
|
||||
processors: Vec::new(),
|
||||
summary: serde_json::json!({}),
|
||||
};
|
||||
|
||||
// Load probe.json once
|
||||
let probe_value = load_probe_json(&full_uuid);
|
||||
|
||||
// Phase 1: Load all JSON values
|
||||
let mut all_values: HashMap<String, serde_json::Value> = HashMap::new();
|
||||
for processor in &processors {
|
||||
let proc_name = processor.as_str();
|
||||
let filename = match processor {
|
||||
ProcessorType::Story => format!("{}.story_story.json", full_uuid),
|
||||
_ => format!("{}.{}.json", full_uuid, proc_name),
|
||||
};
|
||||
let path = PathBuf::from(OUTPUT_DIR.as_str()).join(&filename);
|
||||
|
||||
if let Ok(content) = std::fs::read_to_string(&path) {
|
||||
if let Ok(value) = serde_json::from_str(&content) {
|
||||
all_values.insert(proc_name.to_string(), value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 2: Verify each processor
|
||||
let mut verifications: Vec<ProcessorVerification> = Vec::new();
|
||||
let mut value_refs: HashMap<String, &serde_json::Value> = HashMap::new();
|
||||
|
||||
for processor in &processors {
|
||||
let proc_name = processor.as_str();
|
||||
let mut pv = ProcessorVerification::new(proc_name);
|
||||
|
||||
if let Some(value) = all_values.get(proc_name) {
|
||||
pv.file_exists = true;
|
||||
pv.valid_json = true;
|
||||
value_refs.insert(proc_name.to_string(), value);
|
||||
|
||||
// Layer 1: Completeness
|
||||
let (complete, issues) = check_completeness(processor, value);
|
||||
pv.completeness = complete;
|
||||
pv.issues.extend(issues);
|
||||
|
||||
// Data summary
|
||||
pv.data_summary = build_data_summary(processor, value);
|
||||
} else {
|
||||
pv.issues.push("JSON file not found".to_string());
|
||||
}
|
||||
|
||||
verifications.push(pv);
|
||||
}
|
||||
|
||||
// Phase 3: Check dependencies and reasonableness
|
||||
// Build references once outside the loop
|
||||
let mut all_value_refs: HashMap<String, &serde_json::Value> = HashMap::new();
|
||||
for (name, value) in &all_values {
|
||||
all_value_refs.insert(name.clone(), value);
|
||||
}
|
||||
|
||||
let probe_ref = probe_value.as_ref();
|
||||
|
||||
// Collect updates first, then apply
|
||||
let updates: Vec<(String, bool, bool, Vec<String>)> = verifications
|
||||
.iter()
|
||||
.map(|pv| {
|
||||
let processor = ProcessorType::all()
|
||||
.iter()
|
||||
.find(|p| p.as_str() == pv.processor)
|
||||
.cloned();
|
||||
if let Some(ref proc_type) = processor {
|
||||
// Build verification refs for dependency checking
|
||||
// Use completeness/valid_json/file_exists from Layer 1 results, not trust_level
|
||||
let mut verif_refs: HashMap<String, &ProcessorVerification> = HashMap::new();
|
||||
for v in &verifications {
|
||||
verif_refs.insert(v.processor.clone(), v);
|
||||
}
|
||||
|
||||
// Layer 2: Dependencies (check completeness, not trust_level)
|
||||
let (deps_ok, dep_issues) = check_dependencies(proc_type, &verif_refs);
|
||||
|
||||
// Layer 3: Reasonableness
|
||||
let (reasonable, reason_issues) = if let Some(val) = all_values.get(&pv.processor) {
|
||||
check_reasonableness(proc_type, val, probe_ref, &all_value_refs)
|
||||
} else {
|
||||
(true, Vec::new())
|
||||
};
|
||||
|
||||
let mut all_issues = dep_issues.clone();
|
||||
all_issues.extend(reason_issues);
|
||||
(pv.processor.clone(), deps_ok, reasonable, all_issues)
|
||||
} else {
|
||||
(pv.processor.clone(), true, true, Vec::new())
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Apply updates
|
||||
for (i, update) in updates.into_iter().enumerate() {
|
||||
verifications[i].dependency_ok = update.1;
|
||||
verifications[i].reasonableness = update.2;
|
||||
verifications[i].issues.extend(update.3);
|
||||
verifications[i].update_trust_level();
|
||||
}
|
||||
|
||||
report.processors = verifications;
|
||||
report.update_overall_trust();
|
||||
|
||||
report
|
||||
}
|
||||
|
||||
/// Legacy verification function (backward compatible)
|
||||
pub fn verify_output(processor: &ProcessorType, file_uuid: &str) -> VerificationResult {
|
||||
let proc_name = processor.as_str();
|
||||
let filename = match processor {
|
||||
@@ -63,53 +663,16 @@ pub fn verify_output(processor: &ProcessorType, file_uuid: &str) -> Verification
|
||||
}
|
||||
};
|
||||
|
||||
match processor {
|
||||
ProcessorType::Asrx => {
|
||||
let segs = value.get("segments").and_then(|v| v.as_array());
|
||||
match segs {
|
||||
Some(_) => VerificationResult::ok(proc_name, file_uuid),
|
||||
None => VerificationResult::ok(proc_name, file_uuid),
|
||||
}
|
||||
}
|
||||
ProcessorType::Cut => {
|
||||
let scenes = value.get("scenes").and_then(|v| v.as_array());
|
||||
match scenes {
|
||||
Some(_) => VerificationResult::ok(proc_name, file_uuid),
|
||||
None => VerificationResult::ok(proc_name, file_uuid),
|
||||
}
|
||||
}
|
||||
ProcessorType::Yolo => VerificationResult::ok(proc_name, file_uuid),
|
||||
ProcessorType::Face => VerificationResult::ok(proc_name, file_uuid),
|
||||
ProcessorType::Ocr => {
|
||||
let frames = value.get("frames").and_then(|v| v.as_array());
|
||||
match frames {
|
||||
Some(_) => VerificationResult::ok(proc_name, file_uuid),
|
||||
None => VerificationResult::ok(proc_name, file_uuid),
|
||||
}
|
||||
}
|
||||
ProcessorType::Pose => {
|
||||
let frames = value.get("frames").and_then(|v| v.as_array());
|
||||
match frames {
|
||||
Some(_) => VerificationResult::ok(proc_name, file_uuid),
|
||||
None => VerificationResult::ok(proc_name, file_uuid),
|
||||
}
|
||||
}
|
||||
ProcessorType::Scene => {
|
||||
let scenes = value.get("scenes").and_then(|v| v.as_array());
|
||||
match scenes {
|
||||
Some(s) if s.is_empty() => {
|
||||
VerificationResult::fail(proc_name, file_uuid, "0 scenes")
|
||||
}
|
||||
Some(_) => VerificationResult::ok(proc_name, file_uuid),
|
||||
None => VerificationResult::ok(proc_name, file_uuid),
|
||||
}
|
||||
}
|
||||
ProcessorType::Story => VerificationResult::ok(proc_name, file_uuid),
|
||||
_ => VerificationResult::ok(proc_name, file_uuid),
|
||||
// Use new completeness check
|
||||
let (complete, issues) = check_completeness(processor, &value);
|
||||
if !complete {
|
||||
return VerificationResult::fail(proc_name, file_uuid, &issues.join("; "));
|
||||
}
|
||||
|
||||
VerificationResult::ok(proc_name, file_uuid)
|
||||
}
|
||||
|
||||
/// 清理通過驗收的 processor 暫存檔,只保留最終 .json
|
||||
/// Clean up temp files for a processor
|
||||
pub fn cleanup_temp_files(processor: &ProcessorType, file_uuid: &str) {
|
||||
let proc_name = processor.as_str();
|
||||
let prefix = format!("{}.{}.", file_uuid, proc_name);
|
||||
@@ -133,9 +696,11 @@ pub fn cleanup_temp_files(processor: &ProcessorType, file_uuid: &str) {
|
||||
}
|
||||
}
|
||||
if removed > 0 {
|
||||
info!(
|
||||
tracing::info!(
|
||||
"Cleaned up {} temp files for {}.{}",
|
||||
removed, file_uuid, proc_name
|
||||
removed,
|
||||
file_uuid,
|
||||
proc_name
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user