// Source: momentry_core/src/bin/integrated_player.rs
// (660 lines, 20 KiB, Rust)

use anyhow::{Context, Result};
use clap::Parser;
use crossterm::event::{self, Event, KeyCode};
use crossterm::terminal as crossterm_terminal;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::io::{self, IsTerminal, Write};
use std::path::PathBuf;
use std::process::{Command, Stdio};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::thread;
use std::time::Duration;
// Command-line options for the integrated player, parsed with clap's derive API.
// Plain `//` comments are used here on purpose: clap turns `///` doc comments on
// a Parser struct and its fields into help text, which would change `--help` output.
#[derive(Parser, Debug)]
#[command(name = "integrated_player")]
#[command(about = "Integrated player for ASR, Face, ASRX, and Pose")]
struct Args {
// Video file handed to ffplay; `main` bails out if it does not exist.
#[arg(short, long)]
video: PathBuf,
// Optional ASR JSON file (-r / --asr), loaded via `IntegratedPlayer::load_asr`.
#[arg(short = 'r', long)]
asr: Option<PathBuf>,
// Optional face-detection JSON file (-f / --face), loaded via `load_face`.
#[arg(short = 'f', long)]
face: Option<PathBuf>,
// Optional ASRX (speaker segment) JSON file (-x / --asrx), loaded via `load_asrx`.
#[arg(short = 'x', long)]
asrx: Option<PathBuf>,
// Optional pose JSON file (-p / --pose), loaded via `load_pose`.
#[arg(short = 'p', long)]
pose: Option<PathBuf>,
// Start offset, default 0.0. Not read by any code visible in this file.
#[arg(short = 's', long, default_value = "0.0")]
start: f64,
// Not read by any code visible in this file.
#[arg(long)]
speaker_name: Option<String>,
// Not read by any code visible in this file.
#[arg(long)]
auto_play_speaker: bool,
// Not read by any code visible in this file.
#[arg(long)]
demo: bool,
// Default 3. Not read by any code visible in this file.
#[arg(long, default_value = "3")]
demo_segments_per_speaker: usize,
// Default 2.0. Not read by any code visible in this file.
#[arg(long, default_value = "2.0")]
demo_speed: f64,
// When set, ffplay is given `-x`/`-y` window sizes; otherwise it runs with `-nodisp`.
#[arg(long)]
show_video: bool,
// Value passed to ffplay's `-x` option when `show_video` is set (default 800).
#[arg(long, default_value = "800")]
video_width: u32,
// Value passed to ffplay's `-y` option when `show_video` is set (default 600).
#[arg(long, default_value = "600")]
video_height: u32,
// Selects `run_continuous_demo`; without it `main` only prints a usage hint.
#[arg(long)]
continuous_demo: bool,
}
/// One transcribed span of speech as read from the ASR JSON file.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct AsrSegment {
/// Span start; compared against playback time and passed to ffplay as `-ss`
/// (printed with an `s` suffix, i.e. treated as seconds).
start: f64,
/// Span end, same time base as `start`.
end: f64,
/// Transcribed text shown for this span.
text: String,
}
/// Top-level shape of the ASR JSON file.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct AsrData {
/// Optional language tag; not read by any code visible in this file.
language: Option<String>,
/// Transcribed spans, iterated in file order by the demo.
segments: Vec<AsrSegment>,
}
/// A single detected face inside a frame.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FaceInfo {
/// Optional identifier; not read by any code visible in this file.
face_id: Option<String>,
/// Bounding-box origin, printed by the demo as `bbox=(x,y)`.
x: i32,
y: i32,
/// Bounding-box size, printed by the demo as `{width}x{height}`.
width: i32,
height: i32,
/// Detection confidence, printed with two decimals.
confidence: f64,
}
/// Face detections for one video frame.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FaceFrame {
/// Frame number; not read by any code visible in this file.
frame: u64,
/// Frame time, compared against playback time in `get_current_segment`.
timestamp: f64,
/// Detected faces; only the first entry is used by `get_current_segment`.
faces: Vec<FaceInfo>,
}
/// Top-level shape of the face-detection JSON file.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FaceData {
/// Not read by any code visible in this file.
fps: f64,
/// Not read by any code visible in this file.
frame_count: u64,
/// Per-frame detections, scanned in file order.
frames: Vec<FaceFrame>,
}
/// One speaker-attributed span as read from the ASRX JSON file.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct AsrxSegment {
/// Not read by any code visible in this file.
index: usize,
/// Span start; compared against playback time and passed to ffplay as `-ss`.
start: f64,
/// Span end, same time base as `start`.
end: f64,
/// Span length; passed to ffplay as `-t` when the demo plays ASRX segments.
duration: f64,
/// Speaker id, used as the key into the player's speaker-name table.
speaker: String,
}
/// Top-level shape of the ASRX JSON file.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct AsrxData {
/// Speaker-attributed spans, iterated in file order.
segments: Vec<AsrxSegment>,
/// Per-speaker totals keyed by speaker id; printed by `list_speakers`.
speaker_stats: HashMap<String, SpeakerStats>,
}
/// Aggregate figures for one speaker, as stored in the ASRX file.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct SpeakerStats {
/// Printed under the "Segments" column by `list_speakers`.
count: usize,
/// Printed under the "Duration" column with one decimal and an `s` suffix.
duration: f64,
}
/// A named pose keypoint.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Keypoint {
/// Keypoint label; `get_current_segment` keeps those containing "mouth" or
/// "lip", or equal to "nose".
name: String,
/// Coordinates and confidence; not read by any code visible in this file.
x: f32,
y: f32,
confidence: f32,
}
/// Pose information for one person in a frame.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PersonPose {
/// Keypoints for this person, filtered by name in `get_current_segment`.
keypoints: Vec<Keypoint>,
/// Bounding box; not read by any code visible in this file.
bbox: Bbox,
}
/// Integer bounding box attached to a `PersonPose`.
/// None of its fields are read by code visible in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Bbox {
x: i32,
y: i32,
width: i32,
height: i32,
}
/// Pose detections for one video frame.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PoseFrame {
/// Frame number; not read by any code visible in this file.
frame: u64,
/// Frame time, compared against playback time in `get_current_segment`.
timestamp: f64,
/// Detected persons; only the first entry is used by `get_current_segment`.
persons: Vec<PersonPose>,
}
/// Top-level shape of the pose JSON file.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PoseData {
/// Per-frame pose detections, scanned in file order.
frames: Vec<PoseFrame>,
}
/// Combined view of every annotation source at one playback time, as built by
/// `IntegratedPlayer::get_current_segment`.
#[derive(Debug, Clone)]
struct IntegratedSegment {
/// Bounds of the matching ASR span, overwritten by the matching ASRX span when
/// there is one; both stay 0.0 when neither source matches.
start: f64,
end: f64,
/// Text of the matching ASR span, if any.
text: Option<String>,
/// Speaker id of the matching ASRX span, if any.
speaker: Option<String>,
/// Face taken from a face frame close to the requested time, if any.
face: Option<FaceInfo>,
/// Mouth/lip/nose keypoints taken from a pose frame close to the requested time, if any.
mouth_landmarks: Option<Vec<Keypoint>>,
}
/// Holds every optionally loaded annotation source plus the speaker-name table.
struct IntegratedPlayer {
/// Parsed ASR file, `None` until `load_asr` succeeds.
asr_data: Option<AsrData>,
/// Parsed face file, `None` until `load_face` succeeds.
face_data: Option<FaceData>,
/// Parsed ASRX file, `None` until `load_asrx` succeeds.
asrx_data: Option<AsrxData>,
/// Parsed pose file, `None` until `load_pose` succeeds.
pose_data: Option<PoseData>,
/// Initialised to 0.0 by `new`; not otherwise read or written by code visible in this file.
current_time: f64,
/// Speaker id -> (actor, character) display names.
speaker_names: HashMap<String, (String, String)>,
}
impl IntegratedPlayer {
fn new() -> Self {
let mut speaker_names = HashMap::new();
speaker_names.insert(
"SPEAKER_0".to_string(),
("Cary Grant".to_string(), "Peter Joshua".to_string()),
);
speaker_names.insert(
"SPEAKER_1".to_string(),
("Audrey Hepburn".to_string(), "Regina Lampert".to_string()),
);
speaker_names.insert(
"SPEAKER_2".to_string(),
(
"Walter Matthau".to_string(),
"Hamilton Bartholomew".to_string(),
),
);
speaker_names.insert(
"SPEAKER_4".to_string(),
("James Coburn".to_string(), "Tex Panthollow".to_string()),
);
Self {
asr_data: None,
face_data: None,
asrx_data: None,
pose_data: None,
current_time: 0.0,
speaker_names,
}
}
fn load_asr(&mut self, path: &PathBuf) -> Result<()> {
let content = std::fs::read_to_string(path)
.with_context(|| format!("Failed to read ASR file: {:?}", path))?;
self.asr_data = Some(serde_json::from_str(&content)?);
println!(
"✓ Loaded {} ASR segments",
self.asr_data.as_ref().unwrap().segments.len()
);
Ok(())
}
fn load_face(&mut self, path: &PathBuf) -> Result<()> {
let content = std::fs::read_to_string(path)
.with_context(|| format!("Failed to read Face file: {:?}", path))?;
self.face_data = Some(serde_json::from_str(&content)?);
let total_faces = self
.face_data
.as_ref()
.unwrap()
.frames
.iter()
.map(|f| f.faces.len())
.sum::<usize>();
println!(
"✓ Loaded {} face frames, {} total detections",
self.face_data.as_ref().unwrap().frames.len(),
total_faces
);
Ok(())
}
fn load_asrx(&mut self, path: &PathBuf) -> Result<()> {
let content = std::fs::read_to_string(path)
.with_context(|| format!("Failed to read ASRX file: {:?}", path))?;
self.asrx_data = Some(serde_json::from_str(&content)?);
println!(
"✓ Loaded {} ASRX segments, {} speakers",
self.asrx_data.as_ref().unwrap().segments.len(),
self.asrx_data.as_ref().unwrap().speaker_stats.len()
);
Ok(())
}
fn load_pose(&mut self, path: &PathBuf) -> Result<()> {
let content = std::fs::read_to_string(path)
.with_context(|| format!("Failed to read Pose file: {:?}", path))?;
self.pose_data = Some(serde_json::from_str(&content)?);
println!(
"✓ Loaded {} pose frames",
self.pose_data.as_ref().unwrap().frames.len()
);
Ok(())
}
fn get_current_segment(&self, time: f64) -> Option<IntegratedSegment> {
let mut segment = IntegratedSegment {
start: 0.0,
end: 0.0,
text: None,
speaker: None,
face: None,
mouth_landmarks: None,
};
if let Some(asr) = &self.asr_data {
for seg in &asr.segments {
if time >= seg.start && time <= seg.end {
segment.start = seg.start;
segment.end = seg.end;
segment.text = Some(seg.text.clone());
break;
}
}
}
if let Some(asrx) = &self.asrx_data {
for seg in &asrx.segments {
if time >= seg.start && time <= seg.end {
segment.start = seg.start;
segment.end = seg.end;
segment.speaker = Some(seg.speaker.clone());
break;
}
}
}
if let Some(face) = &self.face_data {
for frame in &face.frames {
if (frame.timestamp - time).abs() < 1.0 {
if let Some(face_info) = frame.faces.first() {
segment.face = Some(face_info.clone());
break;
}
}
}
}
if let Some(pose) = &self.pose_data {
for frame in &pose.frames {
if (frame.timestamp - time).abs() < 0.5 {
if let Some(person) = frame.persons.first() {
let mouth_points: Vec<Keypoint> = person
.keypoints
.iter()
.filter(|kp| {
kp.name.contains("mouth")
|| kp.name.contains("lip")
|| kp.name == "nose"
})
.cloned()
.collect();
if !mouth_points.is_empty() {
segment.mouth_landmarks = Some(mouth_points);
break;
}
}
}
}
}
if segment.text.is_some()
|| segment.speaker.is_some()
|| segment.face.is_some()
|| segment.mouth_landmarks.is_some()
{
Some(segment)
} else {
None
}
}
fn get_speaker_info(&self, speaker_id: &str) -> (String, String) {
self.speaker_names
.get(speaker_id)
.cloned()
.unwrap_or_else(|| ("Unknown".to_string(), "Unknown".to_string()))
}
fn list_speakers(&self) {
if let Some(asrx) = &self.asrx_data {
println!("\n📊 Speaker Statistics:");
println!("{:-<80}", "");
println!(
"{:15} {:20} {:20} {:>10} {:>10}",
"Speaker ID", "Actor", "Character", "Segments", "Duration"
);
println!("{:-<80}", "");
for (speaker_id, stats) in &asrx.speaker_stats {
let (actor, character) = self.get_speaker_info(speaker_id);
println!(
"{:15} {:20} {:20} {:>10} {:>9.1}s",
speaker_id, actor, character, stats.count, stats.duration
);
}
println!("{:-<80}", "");
}
}
}
fn run_continuous_demo(player: &IntegratedPlayer, args: &Args) -> Result<()> {
println!("\n🎬 Continuous Demo Mode");
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
let is_interactive = io::stdin().is_terminal();
if is_interactive {
println!("Controls:");
println!(" SPACE - Pause/Resume");
println!(" Q - Quit");
} else {
println!("Running in non-interactive mode (no keyboard control)");
println!("Use Ctrl+C to stop");
}
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
println!();
let paused = Arc::new(AtomicBool::new(false));
let quit = Arc::new(AtomicBool::new(false));
let paused_clone = paused.clone();
let quit_clone = quit.clone();
let raw_mode_enabled = if is_interactive {
crossterm_terminal::enable_raw_mode().ok().is_some()
} else {
false
};
if is_interactive && raw_mode_enabled {
thread::spawn(move || loop {
if let Ok(Event::Key(key_event)) = event::read() {
if key_event.code == KeyCode::Char(' ') {
paused_clone.fetch_xor(true, Ordering::SeqCst);
} else if key_event.code == KeyCode::Char('q')
|| key_event.code == KeyCode::Char('Q')
|| key_event.code == KeyCode::Esc
{
quit_clone.store(true, Ordering::SeqCst);
break;
}
}
if quit_clone.load(Ordering::SeqCst) {
break;
}
thread::sleep(Duration::from_millis(50));
});
}
if let Some(asr) = &player.asr_data {
let total_segments = asr.segments.len();
for (i, seg) in asr.segments.iter().enumerate() {
if quit.load(Ordering::SeqCst) {
println!("\n⏹️ Stopped by user");
break;
}
while paused.load(Ordering::SeqCst) {
println!("\r⏸️ Paused - Press SPACE to resume");
io::stdout().flush()?;
thread::sleep(Duration::from_millis(100));
if quit.load(Ordering::SeqCst) {
println!("\n⏹️ Stopped by user");
if raw_mode_enabled {
crossterm_terminal::disable_raw_mode().ok();
}
return Ok(());
}
}
println!("\n[{}/{}] Segment", i + 1, total_segments);
println!("{:=<80}", "");
println!("📝 ASR Text: {}", seg.text);
println!("⏱ Time: {:.2}s - {:.2}s", seg.start, seg.end);
if let Some(asrx) = &player.asrx_data {
for asrx_seg in &asrx.segments {
if seg.start >= asrx_seg.start && seg.start <= asrx_seg.end {
let (actor, character) = player.get_speaker_info(&asrx_seg.speaker);
println!(
"🎤 Speaker: {}{} ({})",
asrx_seg.speaker, actor, character
);
break;
}
}
}
if let Some(segment) = player.get_current_segment(seg.start + 0.01) {
if let Some(face) = &segment.face {
println!(
"👤 Face: bbox=({},{}) {}x{}, conf={:.2}",
face.x, face.y, face.width, face.height, face.confidence
);
}
if let Some(landmarks) = &segment.mouth_landmarks {
println!("👄 Mouth landmarks: {} points", landmarks.len());
}
}
let duration = seg.end - seg.start;
println!(
"▶️ Playing: {:.2}s - {:.2}s ({:.2}s)",
seg.start, seg.end, duration
);
let mut cmd = Command::new("ffplay");
if args.show_video {
cmd.args([
"-ss",
&format!("{:.2}", seg.start),
"-t",
&format!("{:.2}", duration),
"-autoexit",
"-x",
&format!("{}", args.video_width),
"-y",
&format!("{}", args.video_height),
args.video.to_str().unwrap(),
]);
} else {
cmd.args([
"-ss",
&format!("{:.2}", seg.start),
"-t",
&format!("{:.2}", duration),
"-autoexit",
"-nodisp",
args.video.to_str().unwrap(),
]);
}
let _child = cmd
.stdout(Stdio::null())
.stderr(Stdio::null())
.spawn()
.context("Failed to start ffplay")?;
thread::sleep(Duration::from_millis((duration * 1000.0) as u64 + 100));
}
println!("\n{:=<80}", "");
println!("✅ Demo completed! Played {} segments", total_segments);
println!("{:=<80}", "");
} else if let Some(asrx) = &player.asrx_data {
let total_segments = asrx.segments.len();
println!(
"Playing {} ASRX segments (no ASR text available)",
total_segments
);
for (i, seg) in asrx.segments.iter().enumerate() {
if quit.load(Ordering::SeqCst) {
println!("\n⏹️ Stopped by user");
break;
}
while paused.load(Ordering::SeqCst) {
println!("\r⏸️ Paused - Press SPACE to resume");
io::stdout().flush()?;
thread::sleep(Duration::from_millis(100));
if quit.load(Ordering::SeqCst) {
println!("\n⏹️ Stopped by user");
if raw_mode_enabled {
crossterm_terminal::disable_raw_mode().ok();
}
return Ok(());
}
}
let (actor, character) = player.get_speaker_info(&seg.speaker);
println!("\n[{}/{}] Segment", i + 1, total_segments);
println!("{:=<80}", "");
println!(
"⏱ Time: {:.2}s - {:.2}s ({:.2}s)",
seg.start, seg.end, seg.duration
);
println!("🎤 Speaker: {}{} ({})", seg.speaker, actor, character);
if let Some(segment) = player.get_current_segment(seg.start + 0.01) {
if let Some(face) = &segment.face {
println!(
"👤 Face: bbox=({},{}) {}x{}, conf={:.2}",
face.x, face.y, face.width, face.height, face.confidence
);
}
if let Some(landmarks) = &segment.mouth_landmarks {
println!("👄 Mouth landmarks: {} points", landmarks.len());
}
}
println!("▶️ Playing audio segment");
let mut cmd = Command::new("ffplay");
if args.show_video {
cmd.args([
"-ss",
&format!("{:.2}", seg.start),
"-t",
&format!("{:.2}", seg.duration),
"-autoexit",
"-x",
&format!("{}", args.video_width),
"-y",
&format!("{}", args.video_height),
args.video.to_str().unwrap(),
]);
} else {
cmd.args([
"-ss",
&format!("{:.2}", seg.start),
"-t",
&format!("{:.2}", seg.duration),
"-autoexit",
"-nodisp",
args.video.to_str().unwrap(),
]);
}
let _child = cmd
.stdout(Stdio::null())
.stderr(Stdio::null())
.spawn()
.context("Failed to start ffplay")?;
thread::sleep(Duration::from_millis((seg.duration * 1000.0) as u64 + 100));
}
println!("\n{:=<80}", "");
println!("✅ Demo completed! Played {} segments", total_segments);
println!("{:=<80}", "");
} else {
println!("⚠️ No ASR or ASRX data loaded");
}
if raw_mode_enabled {
crossterm_terminal::disable_raw_mode().ok();
}
Ok(())
}
fn main() -> Result<()> {
let args = Args::parse();
if !args.video.exists() {
anyhow::bail!("Video file not found: {:?}", args.video);
}
println!("🎬 Integrated Player for ASR/Face/ASRX/Pose");
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
println!("Video: {:?}", args.video);
let mut player = IntegratedPlayer::new();
if let Some(asr_path) = &args.asr {
if asr_path.exists() {
player.load_asr(asr_path)?;
}
}
if let Some(face_path) = &args.face {
if face_path.exists() {
player.load_face(face_path)?;
}
}
if let Some(asrx_path) = &args.asrx {
if asrx_path.exists() {
player.load_asrx(asrx_path)?;
}
}
if let Some(pose_path) = &args.pose {
if pose_path.exists() {
player.load_pose(pose_path)?;
}
}
player.list_speakers();
if args.continuous_demo {
run_continuous_demo(&player, &args)?;
} else {
println!("\n⚠️ Please use --continuous-demo flag");
}
Ok(())
}