// momentry_core/tests/visual_chunk_concept.rs

//! Proof-of-concept tests for visual chunks.
//!
//! These tests verify the visual chunk data structures and their basic functionality.

/// Chunk type tag.
///
/// `Eq` and `Hash` are derived next to `PartialEq` so the tag can be used as a
/// `HashMap`/`HashSet` key; equality on a field-less enum is total, so both
/// derives are always sound here.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ChunkType {
    TimeBased,
    Sentence,
    Cut,
    Trace,
    Story,
    /// Visual chunk (Phase 2.1).
    Visual,
}

impl ChunkType {
    pub fn as_str(&self) -> &'static str {
        match self {
            ChunkType::TimeBased => "time",
            ChunkType::Sentence => "sentence",
            ChunkType::Cut => "cut",
            ChunkType::Trace => "trace",
            ChunkType::Story => "story",
            ChunkType::Visual => "visual",
        }
    }
}

/// A single detected object.
#[derive(Debug, Clone)]
pub struct DetectedObject {
    /// Object class name.
    pub class_name: String,
    /// Object class ID.
    pub class_id: u32,
    /// Confidence score (0.0-1.0).
    pub confidence: f32,
    /// Bounding box as (x, y, width, height).
    pub bbox: Option<(i32, i32, i32, i32)>,
}

/// List of objects detected in one keyframe.
#[derive(Debug, Clone)]
pub struct KeyframeObjects {
    /// Keyframe time (seconds).
    pub timestamp: f64,
    /// Keyframe frame number.
    pub frame_number: u64,
    /// Detected objects.
    pub objects: Vec<DetectedObject>,
}

/// Content of a visual chunk.
#[derive(Debug, Clone)]
pub struct VisualChunkContent {
    /// Start time of the chunk; `summary` prints it with an `s` suffix.
    pub start_time: f64,
    /// End time of the chunk; `summary` prints it with an `s` suffix.
    pub end_time: f64,
    /// Per-keyframe detections that make up the chunk.
    pub keyframe_objects: Vec<KeyframeObjects>,
    /// Class names considered dominant. `MockYoloResult::to_visual_chunk` fills
    /// this with the classes that appear in more than 50% of the frames.
    pub dominant_objects: Vec<String>,
    /// Optional scene description. `MockYoloResult::to_visual_chunk` leaves it
    /// as `None`; it may be generated later by an LLM.
    pub scene_description: Option<String>,
    /// Aggregate statistics for the chunk.
    pub metadata: VisualMetadata,
}

/// Visual metadata.
#[derive(Debug, Clone)]
pub struct VisualMetadata {
    /// Total number of detections; `MockYoloResult::to_visual_chunk` sums the
    /// object count of every frame in the chunk.
    pub object_count: u32,
    /// Distinct class names seen in the chunk.
    pub unique_classes: Vec<String>,
    /// Highest detection confidence in the chunk.
    pub max_confidence: f32,
    /// Mean detection confidence in the chunk.
    pub avg_confidence: f32,
    /// Average number of detections per frame.
    pub spatial_density: f32, // objects per frame
}

impl VisualChunkContent {
    /// Builds a one-line, human-readable summary of this chunk.
    ///
    /// The summary lists the start time, end time and duration, the number of
    /// keyframes, the total and unique object counts taken from `metadata`, and
    /// the dominant objects (`無` when there are none).
    ///
    /// When a non-blank `scene_description` is present it is appended as
    /// `. 場景: <description>`, so the description is not silently dropped from
    /// the summary. Without a description the output is the base sentence only.
    pub fn summary(&self) -> String {
        let duration = self.end_time - self.start_time;
        let frame_count = self.keyframe_objects.len();
        let dominant = if self.dominant_objects.is_empty() {
            "無".to_string()
        } else {
            self.dominant_objects.join(", ")
        };

        let mut text = format!(
            "視覺分片: {:.1}s 到 {:.1}s (持續時間: {:.1}s, {} 幀). 物件: {} 個總計, {} 個唯一. 主要物件: {}",
            self.start_time,
            self.end_time,
            duration,
            frame_count,
            self.metadata.object_count,
            self.metadata.unique_classes.len(),
            dominant
        );

        // A blank description carries no information, so it is skipped just like `None`.
        if let Some(scene) = self
            .scene_description
            .as_deref()
            .filter(|scene| !scene.trim().is_empty())
        {
            text.push_str(". 場景: ");
            text.push_str(scene);
        }

        text
    }

    /// Returns `true` when any keyframe holds an object whose class name is
    /// exactly `class_name` (case-sensitive comparison).
    pub fn contains_object(&self, class_name: &str) -> bool {
        self.keyframe_objects
            .iter()
            .flat_map(|keyframe| keyframe.objects.iter())
            .any(|object| object.class_name == class_name)
    }
}

/// Mock YOLO result.
#[derive(Debug, Clone)]
pub struct MockYoloResult {
    /// Frames contained in the mock result.
    pub frames: Vec<MockYoloFrame>,
}

/// One frame of a mock YOLO result.
#[derive(Debug, Clone)]
pub struct MockYoloFrame {
    /// Frame number; `MockYoloResult::to_visual_chunk` selects frames by it.
    pub frame: u64,
    /// Frame timestamp; copied into `KeyframeObjects::timestamp` (seconds).
    pub timestamp: f64,
    /// Objects detected in this frame.
    pub objects: Vec<MockYoloObject>,
}

/// One detection inside a mock YOLO frame.
///
/// `x`, `y`, `width` and `height` are copied, in that order, into
/// `DetectedObject::bbox` by `MockYoloResult::to_visual_chunk`.
#[derive(Debug, Clone)]
pub struct MockYoloObject {
    /// Object class name.
    pub class_name: String,
    /// Object class ID.
    pub class_id: u32,
    /// Bounding-box x coordinate.
    pub x: i32,
    /// Bounding-box y coordinate.
    pub y: i32,
    /// Bounding-box width.
    pub width: i32,
    /// Bounding-box height.
    pub height: i32,
    /// Detection confidence.
    pub confidence: f32,
}

impl MockYoloResult {
    /// Builds a visual chunk from the frames whose frame number lies in the
    /// inclusive range `start_frame..=end_frame`.
    ///
    /// Returns `None` when no frame falls inside the range, which includes the
    /// case `start_frame > end_frame`.
    ///
    /// * `start_time` / `end_time` are the timestamps of the first and last
    ///   matching frame, in the order the frames are stored.
    /// * `unique_classes` and `dominant_objects` are returned in ascending
    ///   lexicographic order, so the result is deterministic.
    /// * A class is dominant when it appears in strictly more than half of the
    ///   matching frames; several detections of a class in one frame count once.
    /// * `max_confidence` and `avg_confidence` are `0.0` when the matching
    ///   frames hold no objects at all.
    /// * `scene_description` is left as `None`; it can be generated later by an LLM.
    pub fn to_visual_chunk(&self, start_frame: u64, end_frame: u64) -> Option<VisualChunkContent> {
        let frames: Vec<_> = self
            .frames
            .iter()
            .filter(|f| f.frame >= start_frame && f.frame <= end_frame)
            .collect();

        // `?` doubles as the empty-selection guard: past this point there is
        // at least one frame, so the per-frame division below is safe.
        let start_time = frames.first()?.timestamp;
        let end_time = frames.last()?.timestamp;
        let frame_count = frames.len();

        // Convert every frame into its keyframe representation.
        let keyframe_objects: Vec<KeyframeObjects> = frames
            .iter()
            .map(|frame| KeyframeObjects {
                timestamp: frame.timestamp,
                frame_number: frame.frame,
                objects: frame
                    .objects
                    .iter()
                    .map(|obj| DetectedObject {
                        class_name: obj.class_name.clone(),
                        class_id: obj.class_id,
                        confidence: obj.confidence,
                        bbox: Some((obj.x, obj.y, obj.width, obj.height)),
                    })
                    .collect(),
            })
            .collect();

        // Aggregate metadata.
        let total_objects: u32 = frames.iter().map(|f| f.objects.len() as u32).sum();

        // A BTreeSet of borrowed names de-duplicates and sorts in one pass and
        // only allocates a `String` for each distinct class.
        let unique_classes: Vec<String> = frames
            .iter()
            .flat_map(|f| f.objects.iter().map(|o| o.class_name.as_str()))
            .collect::<std::collections::BTreeSet<&str>>()
            .into_iter()
            .map(str::to_owned)
            .collect();

        let confidences: Vec<f32> = frames
            .iter()
            .flat_map(|f| f.objects.iter().map(|o| o.confidence))
            .collect();
        let max_confidence = confidences.iter().copied().fold(0.0f32, f32::max);
        let avg_confidence = if confidences.is_empty() {
            0.0
        } else {
            confidences.iter().sum::<f32>() / confidences.len() as f32
        };

        // Count, per class, the number of frames it appears in (at most once
        // per frame). The BTreeMap keeps the classes sorted by name.
        let mut frames_per_class = std::collections::BTreeMap::<&str, usize>::new();
        for frame in &frames {
            let classes_in_frame: std::collections::BTreeSet<&str> =
                frame.objects.iter().map(|o| o.class_name.as_str()).collect();
            for class in classes_in_frame {
                *frames_per_class.entry(class).or_insert(0) += 1;
            }
        }

        // Dominant = present in more than 50% of the frames. The integer form
        // `count * 2 > frames` is exact, unlike a rounded f32 ratio.
        let dominant_objects: Vec<String> = frames_per_class
            .into_iter()
            .filter(|&(_, seen_in)| seen_in * 2 > frame_count)
            .map(|(class, _)| class.to_owned())
            .collect();

        Some(VisualChunkContent {
            start_time,
            end_time,
            keyframe_objects,
            dominant_objects,
            scene_description: None, // may be generated later by an LLM
            metadata: VisualMetadata {
                object_count: total_objects,
                unique_classes,
                max_confidence,
                avg_confidence,
                spatial_density: total_objects as f32 / frame_count as f32,
            },
        })
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_chunk_type_visual() {
        // The visual variant must keep both its identity and its label.
        let visual = ChunkType::Visual;
        assert_eq!(visual, ChunkType::Visual);
        assert_eq!(visual.as_str(), "visual");
    }

    #[test]
    fn test_visual_chunk_creation() {
        // Shorthand for one mock detection; `bbox` is (x, y, width, height).
        fn detection(
            class_name: &str,
            class_id: u32,
            bbox: (i32, i32, i32, i32),
            confidence: f32,
        ) -> MockYoloObject {
            let (x, y, width, height) = bbox;
            MockYoloObject {
                class_name: class_name.to_string(),
                class_id,
                x,
                y,
                width,
                height,
                confidence,
            }
        }

        // Mock YOLO output: a person and a car in frame 0, the person alone in frame 1.
        let first_frame = MockYoloFrame {
            frame: 0,
            timestamp: 0.0,
            objects: vec![
                detection("person", 0, (100, 200, 50, 100), 0.95),
                detection("car", 2, (300, 150, 80, 60), 0.87),
            ],
        };
        let second_frame = MockYoloFrame {
            frame: 1,
            timestamp: 0.033, // 1/30 s
            objects: vec![detection("person", 0, (110, 210, 52, 102), 0.92)],
        };
        let yolo_result = MockYoloResult {
            frames: vec![first_frame, second_frame],
        };

        // Build the visual chunk from both frames.
        let chunk = yolo_result.to_visual_chunk(0, 1).unwrap();

        // Time span and sizes.
        assert_eq!(chunk.start_time, 0.0);
        assert_eq!(chunk.end_time, 0.033);
        assert_eq!(chunk.keyframe_objects.len(), 2);
        assert_eq!(chunk.metadata.object_count, 3);

        // Both classes are reported, but only the person is dominant.
        let classes = &chunk.metadata.unique_classes;
        assert_eq!(classes.len(), 2);
        assert!(classes.contains(&"person".to_string()));
        assert!(classes.contains(&"car".to_string()));
        assert_eq!(chunk.dominant_objects, vec!["person"]);
    }

    #[test]
    fn test_visual_chunk_content_methods() {
        // Fixture: a single keyframe holding a person and a car.
        let person = DetectedObject {
            class_name: "person".to_string(),
            class_id: 0,
            confidence: 0.95,
            bbox: Some((100, 200, 50, 100)),
        };
        let car = DetectedObject {
            class_name: "car".to_string(),
            class_id: 2,
            confidence: 0.87,
            bbox: Some((300, 150, 80, 60)),
        };
        let keyframe = KeyframeObjects {
            timestamp: 0.0,
            frame_number: 0,
            objects: vec![person, car],
        };
        let metadata = VisualMetadata {
            object_count: 2,
            unique_classes: vec!["person".to_string(), "car".to_string()],
            max_confidence: 0.95,
            avg_confidence: 0.91,
            spatial_density: 2.0,
        };
        let content = VisualChunkContent {
            start_time: 0.0,
            end_time: 5.0,
            keyframe_objects: vec![keyframe],
            dominant_objects: vec!["person".to_string()],
            scene_description: Some("一個人站在車旁".to_string()),
            metadata,
        };

        // Summary text.
        let summary = content.summary();
        assert!(summary.contains("視覺分片"));
        assert!(summary.contains("person"));
        assert!(summary.contains("車"));

        // Class lookup: present classes are found, absent ones are not.
        assert!(content.contains_object("person"));
        assert!(content.contains_object("car"));
        assert!(!content.contains_object("dog"));
    }

    #[test]
    fn test_frame_similarity_concept() {
        // Shorthand for a detection that carries a bounding box.
        fn detected(
            class_name: &str,
            class_id: u32,
            confidence: f32,
            bbox: (i32, i32, i32, i32),
        ) -> DetectedObject {
            DetectedObject {
                class_name: class_name.to_string(),
                class_id,
                confidence,
                bbox: Some(bbox),
            }
        }

        // Set of class names present in a frame.
        fn class_set(objects: &[DetectedObject]) -> std::collections::HashSet<String> {
            objects.iter().map(|o| o.class_name.clone()).collect()
        }

        // Two frames with the same classes but slightly different boxes and scores.
        let frame1_objects = vec![
            detected("person", 0, 0.95, (100, 200, 50, 100)),
            detected("car", 2, 0.87, (300, 150, 80, 60)),
        ];
        let frame2_objects = vec![
            detected("person", 0, 0.92, (110, 210, 52, 102)),
            detected("car", 2, 0.85, (310, 155, 82, 62)),
        ];

        let set1 = class_set(&frame1_objects);
        let set2 = class_set(&frame2_objects);

        // Jaccard similarity = |intersection| / |union|.
        let shared = set1.intersection(&set2).count();
        let combined = set1.union(&set2).count();

        assert_eq!(shared, 2); // person, car
        assert_eq!(combined, 2); // person, car
        assert_eq!(shared as f32 / combined as f32, 1.0); // identical class sets
    }

    #[test]
    fn test_dominant_objects_detection() {
        // Builds a frame that holds exactly one detection.
        fn single_object_frame(frame: u64, timestamp: f64, object: MockYoloObject) -> MockYoloFrame {
            MockYoloFrame {
                frame,
                timestamp,
                objects: vec![object],
            }
        }

        let person_at_start = MockYoloObject {
            class_name: "person".to_string(),
            class_id: 0,
            x: 100,
            y: 200,
            width: 50,
            height: 100,
            confidence: 0.95,
        };
        let person_moved = MockYoloObject {
            class_name: "person".to_string(),
            class_id: 0,
            x: 110,
            y: 210,
            width: 52,
            height: 102,
            confidence: 0.92,
        };
        let car = MockYoloObject {
            class_name: "car".to_string(),
            class_id: 2,
            x: 300,
            y: 150,
            width: 80,
            height: 60,
            confidence: 0.87,
        };

        let yolo_result = MockYoloResult {
            frames: vec![
                single_object_frame(0, 0.0, person_at_start),
                single_object_frame(1, 0.033, person_moved),
                single_object_frame(2, 0.066, car),
            ],
        };

        let chunk = yolo_result.to_visual_chunk(0, 2).unwrap();

        // person is in 2 of 3 frames (67% > 50%); car is in 1 of 3 frames (33% < 50%).
        assert_eq!(chunk.dominant_objects, vec!["person"]);
        assert!(!chunk.dominant_objects.contains(&"car".to_string()));
    }
}