use anyhow::{Context, Result}; use clap::Parser; use crossterm::event::{self, Event, KeyCode}; use crossterm::terminal as crossterm_terminal; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::io::{self, IsTerminal, Write}; use std::path::PathBuf; use std::process::{Command, Stdio}; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::thread; use std::time::Duration; #[derive(Parser, Debug)] #[command(name = "integrated_player")] #[command(about = "Integrated player for ASR, Face, ASRX, and Pose")] struct Args { #[arg(short, long)] video: PathBuf, #[arg(short = 'r', long)] asr: Option, #[arg(short = 'f', long)] face: Option, #[arg(short = 'x', long)] asrx: Option, #[arg(short = 'p', long)] pose: Option, #[arg(short = 's', long, default_value = "0.0")] start: f64, #[arg(long)] speaker_name: Option, #[arg(long)] auto_play_speaker: bool, #[arg(long)] demo: bool, #[arg(long, default_value = "3")] demo_segments_per_speaker: usize, #[arg(long, default_value = "2.0")] demo_speed: f64, #[arg(long)] show_video: bool, #[arg(long, default_value = "800")] video_width: u32, #[arg(long, default_value = "600")] video_height: u32, #[arg(long)] continuous_demo: bool, } #[derive(Debug, Clone, Serialize, Deserialize)] struct AsrSegment { start: f64, end: f64, text: String, } #[derive(Debug, Clone, Serialize, Deserialize)] struct AsrData { language: Option, segments: Vec, } #[derive(Debug, Clone, Serialize, Deserialize)] struct FaceInfo { face_id: Option, x: i32, y: i32, width: i32, height: i32, confidence: f64, } #[derive(Debug, Clone, Serialize, Deserialize)] struct FaceFrame { frame: u64, timestamp: f64, faces: Vec, } #[derive(Debug, Clone, Serialize, Deserialize)] struct FaceData { fps: f64, frame_count: u64, frames: Vec, } #[derive(Debug, Clone, Serialize, Deserialize)] struct AsrxSegment { index: usize, start: f64, end: f64, duration: f64, speaker: String, } #[derive(Debug, Clone, Serialize, Deserialize)] struct AsrxData { segments: Vec, speaker_stats: HashMap, } #[derive(Debug, Clone, Serialize, Deserialize)] struct SpeakerStats { count: usize, duration: f64, } #[derive(Debug, Clone, Serialize, Deserialize)] struct Keypoint { name: String, x: f32, y: f32, confidence: f32, } #[derive(Debug, Clone, Serialize, Deserialize)] struct PersonPose { keypoints: Vec, bbox: Bbox, } #[derive(Debug, Clone, Serialize, Deserialize)] struct Bbox { x: i32, y: i32, width: i32, height: i32, } #[derive(Debug, Clone, Serialize, Deserialize)] struct PoseFrame { frame: u64, timestamp: f64, persons: Vec, } #[derive(Debug, Clone, Serialize, Deserialize)] struct PoseData { frames: Vec, } #[derive(Debug, Clone)] struct IntegratedSegment { start: f64, end: f64, text: Option, speaker: Option, face: Option, mouth_landmarks: Option>, } struct IntegratedPlayer { asr_data: Option, face_data: Option, asrx_data: Option, pose_data: Option, current_time: f64, speaker_names: HashMap, } impl IntegratedPlayer { fn new() -> Self { let mut speaker_names = HashMap::new(); speaker_names.insert( "SPEAKER_0".to_string(), ("Cary Grant".to_string(), "Peter Joshua".to_string()), ); speaker_names.insert( "SPEAKER_1".to_string(), ("Audrey Hepburn".to_string(), "Regina Lampert".to_string()), ); speaker_names.insert( "SPEAKER_2".to_string(), ( "Walter Matthau".to_string(), "Hamilton Bartholomew".to_string(), ), ); speaker_names.insert( "SPEAKER_4".to_string(), ("James Coburn".to_string(), "Tex Panthollow".to_string()), ); Self { asr_data: None, face_data: None, asrx_data: None, pose_data: None, current_time: 0.0, speaker_names, } } fn load_asr(&mut self, path: &PathBuf) -> Result<()> { let content = std::fs::read_to_string(path) .with_context(|| format!("Failed to read ASR file: {:?}", path))?; self.asr_data = Some(serde_json::from_str(&content)?); println!( "βœ“ Loaded {} ASR segments", self.asr_data.as_ref().unwrap().segments.len() ); Ok(()) } fn load_face(&mut self, path: &PathBuf) -> Result<()> { let content = std::fs::read_to_string(path) .with_context(|| format!("Failed to read Face file: {:?}", path))?; self.face_data = Some(serde_json::from_str(&content)?); let total_faces = self .face_data .as_ref() .unwrap() .frames .iter() .map(|f| f.faces.len()) .sum::(); println!( "βœ“ Loaded {} face frames, {} total detections", self.face_data.as_ref().unwrap().frames.len(), total_faces ); Ok(()) } fn load_asrx(&mut self, path: &PathBuf) -> Result<()> { let content = std::fs::read_to_string(path) .with_context(|| format!("Failed to read ASRX file: {:?}", path))?; self.asrx_data = Some(serde_json::from_str(&content)?); println!( "βœ“ Loaded {} ASRX segments, {} speakers", self.asrx_data.as_ref().unwrap().segments.len(), self.asrx_data.as_ref().unwrap().speaker_stats.len() ); Ok(()) } fn load_pose(&mut self, path: &PathBuf) -> Result<()> { let content = std::fs::read_to_string(path) .with_context(|| format!("Failed to read Pose file: {:?}", path))?; self.pose_data = Some(serde_json::from_str(&content)?); println!( "βœ“ Loaded {} pose frames", self.pose_data.as_ref().unwrap().frames.len() ); Ok(()) } fn get_current_segment(&self, time: f64) -> Option { let mut segment = IntegratedSegment { start: 0.0, end: 0.0, text: None, speaker: None, face: None, mouth_landmarks: None, }; if let Some(asr) = &self.asr_data { for seg in &asr.segments { if time >= seg.start && time <= seg.end { segment.start = seg.start; segment.end = seg.end; segment.text = Some(seg.text.clone()); break; } } } if let Some(asrx) = &self.asrx_data { for seg in &asrx.segments { if time >= seg.start && time <= seg.end { segment.start = seg.start; segment.end = seg.end; segment.speaker = Some(seg.speaker.clone()); break; } } } if let Some(face) = &self.face_data { for frame in &face.frames { if (frame.timestamp - time).abs() < 1.0 { if let Some(face_info) = frame.faces.first() { segment.face = Some(face_info.clone()); break; } } } } if let Some(pose) = &self.pose_data { for frame in &pose.frames { if (frame.timestamp - time).abs() < 0.5 { if let Some(person) = frame.persons.first() { let mouth_points: Vec = person .keypoints .iter() .filter(|kp| { kp.name.contains("mouth") || kp.name.contains("lip") || kp.name == "nose" }) .cloned() .collect(); if !mouth_points.is_empty() { segment.mouth_landmarks = Some(mouth_points); break; } } } } } if segment.text.is_some() || segment.speaker.is_some() || segment.face.is_some() || segment.mouth_landmarks.is_some() { Some(segment) } else { None } } fn get_speaker_info(&self, speaker_id: &str) -> (String, String) { self.speaker_names .get(speaker_id) .cloned() .unwrap_or_else(|| ("Unknown".to_string(), "Unknown".to_string())) } fn list_speakers(&self) { if let Some(asrx) = &self.asrx_data { println!("\nπŸ“Š Speaker Statistics:"); println!("{:-<80}", ""); println!( "{:15} {:20} {:20} {:>10} {:>10}", "Speaker ID", "Actor", "Character", "Segments", "Duration" ); println!("{:-<80}", ""); for (speaker_id, stats) in &asrx.speaker_stats { let (actor, character) = self.get_speaker_info(speaker_id); println!( "{:15} {:20} {:20} {:>10} {:>9.1}s", speaker_id, actor, character, stats.count, stats.duration ); } println!("{:-<80}", ""); } } } fn run_continuous_demo(player: &IntegratedPlayer, args: &Args) -> Result<()> { println!("\n🎬 Continuous Demo Mode"); println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); let is_interactive = io::stdin().is_terminal(); if is_interactive { println!("Controls:"); println!(" SPACE - Pause/Resume"); println!(" Q - Quit"); } else { println!("Running in non-interactive mode (no keyboard control)"); println!("Use Ctrl+C to stop"); } println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); println!(); let paused = Arc::new(AtomicBool::new(false)); let quit = Arc::new(AtomicBool::new(false)); let paused_clone = paused.clone(); let quit_clone = quit.clone(); let raw_mode_enabled = if is_interactive { crossterm_terminal::enable_raw_mode().ok().is_some() } else { false }; if is_interactive && raw_mode_enabled { thread::spawn(move || loop { if let Ok(Event::Key(key_event)) = event::read() { if key_event.code == KeyCode::Char(' ') { paused_clone.fetch_xor(true, Ordering::SeqCst); } else if key_event.code == KeyCode::Char('q') || key_event.code == KeyCode::Char('Q') || key_event.code == KeyCode::Esc { quit_clone.store(true, Ordering::SeqCst); break; } } if quit_clone.load(Ordering::SeqCst) { break; } thread::sleep(Duration::from_millis(50)); }); } if let Some(asr) = &player.asr_data { let total_segments = asr.segments.len(); for (i, seg) in asr.segments.iter().enumerate() { if quit.load(Ordering::SeqCst) { println!("\n⏹️ Stopped by user"); break; } while paused.load(Ordering::SeqCst) { println!("\r⏸️ Paused - Press SPACE to resume"); io::stdout().flush()?; thread::sleep(Duration::from_millis(100)); if quit.load(Ordering::SeqCst) { println!("\n⏹️ Stopped by user"); if raw_mode_enabled { crossterm_terminal::disable_raw_mode().ok(); } return Ok(()); } } println!("\n[{}/{}] Segment", i + 1, total_segments); println!("{:=<80}", ""); println!("πŸ“ ASR Text: {}", seg.text); println!("⏱ Time: {:.2}s - {:.2}s", seg.start, seg.end); if let Some(asrx) = &player.asrx_data { for asrx_seg in &asrx.segments { if seg.start >= asrx_seg.start && seg.start <= asrx_seg.end { let (actor, character) = player.get_speaker_info(&asrx_seg.speaker); println!( "🎀 Speaker: {} β†’ {} ({})", asrx_seg.speaker, actor, character ); break; } } } if let Some(segment) = player.get_current_segment(seg.start + 0.01) { if let Some(face) = &segment.face { println!( "πŸ‘€ Face: bbox=({},{}) {}x{}, conf={:.2}", face.x, face.y, face.width, face.height, face.confidence ); } if let Some(landmarks) = &segment.mouth_landmarks { println!("πŸ‘„ Mouth landmarks: {} points", landmarks.len()); } } let duration = seg.end - seg.start; println!( "▢️ Playing: {:.2}s - {:.2}s ({:.2}s)", seg.start, seg.end, duration ); let mut cmd = Command::new("ffplay"); if args.show_video { cmd.args([ "-ss", &format!("{:.2}", seg.start), "-t", &format!("{:.2}", duration), "-autoexit", "-x", &format!("{}", args.video_width), "-y", &format!("{}", args.video_height), args.video.to_str().unwrap(), ]); } else { cmd.args([ "-ss", &format!("{:.2}", seg.start), "-t", &format!("{:.2}", duration), "-autoexit", "-nodisp", args.video.to_str().unwrap(), ]); } let _child = cmd .stdout(Stdio::null()) .stderr(Stdio::null()) .spawn() .context("Failed to start ffplay")?; thread::sleep(Duration::from_millis((duration * 1000.0) as u64 + 100)); } println!("\n{:=<80}", ""); println!("βœ… Demo completed! Played {} segments", total_segments); println!("{:=<80}", ""); } else if let Some(asrx) = &player.asrx_data { let total_segments = asrx.segments.len(); println!( "Playing {} ASRX segments (no ASR text available)", total_segments ); for (i, seg) in asrx.segments.iter().enumerate() { if quit.load(Ordering::SeqCst) { println!("\n⏹️ Stopped by user"); break; } while paused.load(Ordering::SeqCst) { println!("\r⏸️ Paused - Press SPACE to resume"); io::stdout().flush()?; thread::sleep(Duration::from_millis(100)); if quit.load(Ordering::SeqCst) { println!("\n⏹️ Stopped by user"); if raw_mode_enabled { crossterm_terminal::disable_raw_mode().ok(); } return Ok(()); } } let (actor, character) = player.get_speaker_info(&seg.speaker); println!("\n[{}/{}] Segment", i + 1, total_segments); println!("{:=<80}", ""); println!( "⏱ Time: {:.2}s - {:.2}s ({:.2}s)", seg.start, seg.end, seg.duration ); println!("🎀 Speaker: {} β†’ {} ({})", seg.speaker, actor, character); if let Some(segment) = player.get_current_segment(seg.start + 0.01) { if let Some(face) = &segment.face { println!( "πŸ‘€ Face: bbox=({},{}) {}x{}, conf={:.2}", face.x, face.y, face.width, face.height, face.confidence ); } if let Some(landmarks) = &segment.mouth_landmarks { println!("πŸ‘„ Mouth landmarks: {} points", landmarks.len()); } } println!("▢️ Playing audio segment"); let mut cmd = Command::new("ffplay"); if args.show_video { cmd.args([ "-ss", &format!("{:.2}", seg.start), "-t", &format!("{:.2}", seg.duration), "-autoexit", "-x", &format!("{}", args.video_width), "-y", &format!("{}", args.video_height), args.video.to_str().unwrap(), ]); } else { cmd.args([ "-ss", &format!("{:.2}", seg.start), "-t", &format!("{:.2}", seg.duration), "-autoexit", "-nodisp", args.video.to_str().unwrap(), ]); } let _child = cmd .stdout(Stdio::null()) .stderr(Stdio::null()) .spawn() .context("Failed to start ffplay")?; thread::sleep(Duration::from_millis((seg.duration * 1000.0) as u64 + 100)); } println!("\n{:=<80}", ""); println!("βœ… Demo completed! Played {} segments", total_segments); println!("{:=<80}", ""); } else { println!("⚠️ No ASR or ASRX data loaded"); } if raw_mode_enabled { crossterm_terminal::disable_raw_mode().ok(); } Ok(()) } fn main() -> Result<()> { let args = Args::parse(); if !args.video.exists() { anyhow::bail!("Video file not found: {:?}", args.video); } println!("🎬 Integrated Player for ASR/Face/ASRX/Pose"); println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); println!("Video: {:?}", args.video); let mut player = IntegratedPlayer::new(); if let Some(asr_path) = &args.asr { if asr_path.exists() { player.load_asr(asr_path)?; } } if let Some(face_path) = &args.face { if face_path.exists() { player.load_face(face_path)?; } } if let Some(asrx_path) = &args.asrx { if asrx_path.exists() { player.load_asrx(asrx_path)?; } } if let Some(pose_path) = &args.pose { if pose_path.exists() { player.load_pose(pose_path)?; } } player.list_speakers(); if args.continuous_demo { run_continuous_demo(&player, &args)?; } else { println!("\n⚠️ Please use --continuous-demo flag"); } Ok(()) }