Files
momentry_core/tests/visual_chunk_concept.rs
Warren b54c2def30 feat: add migrations, test scripts, and utility tools
- Add database migrations (006-028) for face recognition, identity, file_uuid
- Add test scripts for ASR, face, search, processing
- Add portal frontend (Tauri)
- Add config, benchmark, and monitoring utilities
- Add model checkpoints and pretrained model references
2026-04-30 15:11:53 +08:00

452 lines
14 KiB
Rust
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! Proof-of-concept test for visual chunks.
//!
//! This test validates the data structures and basic functionality of visual chunks.
/// Chunk type.
///
/// `Eq` and `Hash` are derived alongside `PartialEq` so that a chunk type can
/// be used as a `HashMap` key or stored in a `HashSet`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ChunkType {
    TimeBased,
    Sentence,
    Cut,
    Trace,
    Story,
    /// Visual chunk (Phase 2.1).
    Visual,
}
impl ChunkType {
pub fn as_str(&self) -> &'static str {
match self {
ChunkType::TimeBased => "time",
ChunkType::Sentence => "sentence",
ChunkType::Cut => "cut",
ChunkType::Trace => "trace",
ChunkType::Story => "story",
ChunkType::Visual => "visual",
}
}
}
/// A detected object.
///
/// `PartialEq` is derived so detections can be compared directly, e.g. with
/// `assert_eq!` in tests.
#[derive(Debug, Clone, PartialEq)]
pub struct DetectedObject {
    /// Class name of the object.
    pub class_name: String,
    /// Class ID of the object.
    pub class_id: u32,
    /// Confidence value (0.0-1.0).
    pub confidence: f32,
    /// Bounding box as `(x, y, width, height)`.
    pub bbox: Option<(i32, i32, i32, i32)>,
}
/// The list of objects detected in one keyframe.
#[derive(Clone, Debug)]
pub struct KeyframeObjects {
    /// Keyframe time, in seconds.
    pub timestamp: f64,
    /// Frame number of the keyframe.
    pub frame_number: u64,
    /// Objects detected in this keyframe.
    pub objects: Vec<DetectedObject>,
}
/// Content of a visual chunk.
#[derive(Clone, Debug)]
pub struct VisualChunkContent {
    /// Start time of the chunk.
    pub start_time: f64,
    /// End time of the chunk.
    pub end_time: f64,
    /// Per-keyframe detections belonging to the chunk.
    pub keyframe_objects: Vec<KeyframeObjects>,
    /// Class names considered dominant for the chunk.
    pub dominant_objects: Vec<String>,
    /// Optional textual description of the scene.
    pub scene_description: Option<String>,
    /// Aggregate statistics about the detections in the chunk.
    pub metadata: VisualMetadata,
}
/// Visual metadata.
///
/// `PartialEq` is derived so metadata values can be compared directly, e.g.
/// with `assert_eq!` in tests.
#[derive(Debug, Clone, PartialEq)]
pub struct VisualMetadata {
    /// Number of detected objects.
    pub object_count: u32,
    /// Distinct class names.
    pub unique_classes: Vec<String>,
    /// Highest detection confidence.
    pub max_confidence: f32,
    /// Average detection confidence.
    pub avg_confidence: f32,
    /// Objects per frame.
    pub spatial_density: f32,
}
impl VisualChunkContent {
    /// Builds a one-line, human-readable summary of the chunk.
    ///
    /// The summary reports the start time, end time and duration (one decimal
    /// place each), the number of keyframes, the total and unique object
    /// counts taken from `metadata`, and the dominant objects joined with
    /// `", "`. The summary text itself is emitted in Chinese.
    pub fn summary(&self) -> String {
        let meta = &self.metadata;

        // An empty dominant-object list is rendered as an empty string.
        let dominant = match self.dominant_objects.as_slice() {
            [] => String::new(),
            names => names.join(", "),
        };

        format!(
            "視覺分片: {:.1}s 到 {:.1}s (持續時間: {:.1}s, {} 幀). 物件: {} 個總計, {} 個唯一. 主要物件: {}",
            self.start_time,
            self.end_time,
            self.end_time - self.start_time,
            self.keyframe_objects.len(),
            meta.object_count,
            meta.unique_classes.len(),
            dominant
        )
    }

    /// Returns `true` when any object in any keyframe has a class name exactly
    /// equal to `class_name`.
    pub fn contains_object(&self, class_name: &str) -> bool {
        self.keyframe_objects
            .iter()
            .flat_map(|keyframe| keyframe.objects.iter())
            .any(|object| object.class_name == class_name)
    }
}
/// Mock YOLO result.
#[derive(Clone, Debug)]
pub struct MockYoloResult {
    /// Frames contained in the result.
    pub frames: Vec<MockYoloFrame>,
}
/// One frame of a mock YOLO result.
#[derive(Clone, Debug)]
pub struct MockYoloFrame {
    /// Frame number.
    pub frame: u64,
    /// Timestamp of the frame.
    pub timestamp: f64,
    /// Objects reported for this frame.
    pub objects: Vec<MockYoloObject>,
}
/// One object inside a mock YOLO frame.
///
/// `PartialEq` is derived so mock detections can be compared directly, e.g.
/// with `assert_eq!` in tests.
#[derive(Debug, Clone, PartialEq)]
pub struct MockYoloObject {
    /// Class name of the object.
    pub class_name: String,
    /// Class ID of the object.
    pub class_id: u32,
    /// Bounding-box `x` value.
    pub x: i32,
    /// Bounding-box `y` value.
    pub y: i32,
    /// Bounding-box width.
    pub width: i32,
    /// Bounding-box height.
    pub height: i32,
    /// Detection confidence.
    pub confidence: f32,
}
impl MockYoloResult {
    /// Builds a visual chunk from the mock YOLO frames whose frame number lies
    /// in the inclusive range `start_frame..=end_frame`.
    ///
    /// Returns `None` when no frame falls inside the range.
    ///
    /// The resulting chunk is filled as follows:
    /// - `start_time` / `end_time`: timestamps of the first and last selected
    ///   frame, in the order the frames are stored (no sorting is performed);
    /// - `keyframe_objects`: one entry per selected frame, each detection
    ///   carrying its bounding box as `(x, y, width, height)`;
    /// - `dominant_objects`: classes present in more than 50% of the selected
    ///   frames, sorted by name;
    /// - `metadata.unique_classes`: every class seen, sorted by name so the
    ///   output is deterministic;
    /// - `metadata.max_confidence`: the largest confidence, starting from
    ///   `0.0`; `metadata.avg_confidence`: the mean confidence, or `0.0` when
    ///   the selected frames hold no detections;
    /// - `metadata.spatial_density`: detections per selected frame;
    /// - `scene_description`: always `None` (it can be generated later by an
    ///   LLM).
    pub fn to_visual_chunk(&self, start_frame: u64, end_frame: u64) -> Option<VisualChunkContent> {
        let frames: Vec<_> = self
            .frames
            .iter()
            .filter(|f| (start_frame..=end_frame).contains(&f.frame))
            .collect();

        // `?` on `first`/`last` doubles as the empty-selection check, so
        // `frame_count` is guaranteed to be non-zero below.
        let start_time = frames.first()?.timestamp;
        let end_time = frames.last()?.timestamp;
        let frame_count = frames.len();

        // Convert every selected frame into its keyframe representation.
        let keyframe_objects: Vec<KeyframeObjects> = frames
            .iter()
            .map(|frame| KeyframeObjects {
                timestamp: frame.timestamp,
                frame_number: frame.frame,
                objects: frame
                    .objects
                    .iter()
                    .map(|obj| DetectedObject {
                        class_name: obj.class_name.clone(),
                        class_id: obj.class_id,
                        confidence: obj.confidence,
                        bbox: Some((obj.x, obj.y, obj.width, obj.height)),
                    })
                    .collect(),
            })
            .collect();

        // Confidence statistics over every detection in the selection.
        let total_objects: u32 = frames.iter().map(|f| f.objects.len() as u32).sum();
        let mut confidence_sum = 0.0f32;
        let mut max_confidence = 0.0f32;
        for obj in frames.iter().flat_map(|f| f.objects.iter()) {
            confidence_sum += obj.confidence;
            max_confidence = max_confidence.max(obj.confidence);
        }
        let avg_confidence = if total_objects > 0 {
            confidence_sum / total_objects as f32
        } else {
            0.0
        };

        // Count, per class, the number of frames containing it. Each frame
        // contributes at most once per class. The ordered map yields both the
        // unique classes and the dominant classes already sorted by name.
        let mut frames_per_class: std::collections::BTreeMap<&str, usize> =
            std::collections::BTreeMap::new();
        for frame in &frames {
            let classes_in_frame: std::collections::BTreeSet<&str> = frame
                .objects
                .iter()
                .map(|obj| obj.class_name.as_str())
                .collect();
            for class in classes_in_frame {
                *frames_per_class.entry(class).or_insert(0) += 1;
            }
        }

        let unique_classes: Vec<String> = frames_per_class
            .keys()
            .map(|class| (*class).to_owned())
            .collect();

        // Dominant: present in strictly more than half of the frames. The
        // integer comparison is the exact form of `count / frames > 0.5`.
        let dominant_objects: Vec<String> = frames_per_class
            .iter()
            .filter(|&(_, &count)| count * 2 > frame_count)
            .map(|(class, _)| (*class).to_owned())
            .collect();

        Some(VisualChunkContent {
            start_time,
            end_time,
            keyframe_objects,
            dominant_objects,
            scene_description: None, // can be generated later by an LLM
            metadata: VisualMetadata {
                object_count: total_objects,
                unique_classes,
                max_confidence,
                avg_confidence,
                spatial_density: total_objects as f32 / frame_count as f32,
            },
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a mock YOLO detection; `bbox` is `(x, y, width, height)`.
    fn mock_object(
        class_name: &str,
        class_id: u32,
        bbox: (i32, i32, i32, i32),
        confidence: f32,
    ) -> MockYoloObject {
        let (x, y, width, height) = bbox;
        MockYoloObject {
            class_name: class_name.to_string(),
            class_id,
            x,
            y,
            width,
            height,
            confidence,
        }
    }

    /// Builds a mock YOLO frame.
    fn mock_frame(frame: u64, timestamp: f64, objects: Vec<MockYoloObject>) -> MockYoloFrame {
        MockYoloFrame {
            frame,
            timestamp,
            objects,
        }
    }

    /// Builds a detected object; `bbox` is `(x, y, width, height)`.
    fn detected(
        class_name: &str,
        class_id: u32,
        bbox: (i32, i32, i32, i32),
        confidence: f32,
    ) -> DetectedObject {
        DetectedObject {
            class_name: class_name.to_string(),
            class_id,
            confidence,
            bbox: Some(bbox),
        }
    }

    #[test]
    fn test_chunk_type_visual() {
        let chunk_type = ChunkType::Visual;
        assert_eq!(chunk_type.as_str(), "visual");
        assert_eq!(chunk_type, ChunkType::Visual);
    }

    #[test]
    fn test_visual_chunk_creation() {
        // Build a mock YOLO result.
        let yolo_result = MockYoloResult {
            frames: vec![
                mock_frame(
                    0,
                    0.0,
                    vec![
                        mock_object("person", 0, (100, 200, 50, 100), 0.95),
                        mock_object("car", 2, (300, 150, 80, 60), 0.87),
                    ],
                ),
                // 0.033 s is roughly 1/30 of a second.
                mock_frame(
                    1,
                    0.033,
                    vec![mock_object("person", 0, (110, 210, 52, 102), 0.92)],
                ),
            ],
        };

        // Create a visual chunk from the YOLO result.
        let chunk = yolo_result.to_visual_chunk(0, 1).unwrap();

        // Verify the chunk's properties.
        assert_eq!(chunk.start_time, 0.0);
        assert_eq!(chunk.end_time, 0.033);
        assert_eq!(chunk.metadata.object_count, 3);
        assert_eq!(chunk.metadata.unique_classes.len(), 2);
        assert!(chunk
            .metadata
            .unique_classes
            .contains(&"person".to_string()));
        assert!(chunk.metadata.unique_classes.contains(&"car".to_string()));
        assert_eq!(chunk.dominant_objects, vec!["person"]);
        assert_eq!(chunk.keyframe_objects.len(), 2);
    }

    #[test]
    fn test_visual_chunk_content_methods() {
        let content = VisualChunkContent {
            start_time: 0.0,
            end_time: 5.0,
            keyframe_objects: vec![KeyframeObjects {
                timestamp: 0.0,
                frame_number: 0,
                objects: vec![
                    detected("person", 0, (100, 200, 50, 100), 0.95),
                    detected("car", 2, (300, 150, 80, 60), 0.87),
                ],
            }],
            dominant_objects: vec!["person".to_string()],
            scene_description: Some("一個人站在車旁".to_string()),
            metadata: VisualMetadata {
                object_count: 2,
                unique_classes: vec!["person".to_string(), "car".to_string()],
                max_confidence: 0.95,
                avg_confidence: 0.91,
                spatial_density: 2.0,
            },
        };

        // Test the summary method. Every assertion checks a non-empty
        // substring; `contains("")` would be true for any string.
        let summary = content.summary();
        assert!(summary.contains("視覺分片"));
        assert!(summary.contains("person"));
        assert!(summary.contains("5.0s"));
        assert!(summary.contains("1 幀"));
        assert!(summary.contains("2 個總計"));
        assert!(summary.contains("2 個唯一"));

        // Test the contains_object method.
        assert!(content.contains_object("person"));
        assert!(content.contains_object("car"));
        assert!(!content.contains_object("dog"));
    }

    #[test]
    fn test_frame_similarity_concept() {
        // Exercise the frame-similarity idea: Jaccard index over class names.
        let frame1_objects = vec![
            detected("person", 0, (100, 200, 50, 100), 0.95),
            detected("car", 2, (300, 150, 80, 60), 0.87),
        ];
        let frame2_objects = vec![
            detected("person", 0, (110, 210, 52, 102), 0.92),
            detected("car", 2, (310, 155, 82, 62), 0.85),
        ];

        // Build the class-name sets.
        let set1: std::collections::HashSet<String> = frame1_objects
            .iter()
            .map(|o| o.class_name.clone())
            .collect();
        let set2: std::collections::HashSet<String> = frame2_objects
            .iter()
            .map(|o| o.class_name.clone())
            .collect();

        // Compute the intersection and the union.
        let intersection = set1.intersection(&set2).count();
        let union = set1.union(&set2).count();

        // Verify the similarity.
        assert_eq!(intersection, 2); // person, car
        assert_eq!(union, 2); // person, car
        assert_eq!(intersection as f32 / union as f32, 1.0); // identical
    }

    #[test]
    fn test_dominant_objects_detection() {
        let yolo_result = MockYoloResult {
            frames: vec![
                mock_frame(
                    0,
                    0.0,
                    vec![mock_object("person", 0, (100, 200, 50, 100), 0.95)],
                ),
                mock_frame(
                    1,
                    0.033,
                    vec![mock_object("person", 0, (110, 210, 52, 102), 0.92)],
                ),
                mock_frame(
                    2,
                    0.066,
                    vec![mock_object("car", 2, (300, 150, 80, 60), 0.87)],
                ),
            ],
        };

        let chunk = yolo_result.to_visual_chunk(0, 2).unwrap();

        // "person" appears in 2/3 of the frames (67% > 50%); "car" appears in
        // 1/3 of the frames (33% < 50%).
        assert_eq!(chunk.dominant_objects, vec!["person"]);
        assert!(!chunk.dominant_objects.contains(&"car".to_string()));
    }
}