660 lines
20 KiB
Rust
660 lines
20 KiB
Rust
use anyhow::{Context, Result};
|
|
use clap::Parser;
|
|
use crossterm::event::{self, Event, KeyCode};
|
|
use crossterm::terminal as crossterm_terminal;
|
|
use serde::{Deserialize, Serialize};
|
|
use std::collections::HashMap;
|
|
use std::io::{self, IsTerminal, Write};
|
|
use std::path::PathBuf;
|
|
use std::process::{Command, Stdio};
|
|
use std::sync::atomic::{AtomicBool, Ordering};
|
|
use std::sync::Arc;
|
|
use std::thread;
|
|
use std::time::Duration;
|
|
|
|
#[derive(Parser, Debug)]
|
|
#[command(name = "integrated_player")]
|
|
#[command(about = "Integrated player for ASR, Face, ASRX, and Pose")]
|
|
struct Args {
|
|
#[arg(short, long)]
|
|
video: PathBuf,
|
|
|
|
#[arg(short = 'r', long)]
|
|
asr: Option<PathBuf>,
|
|
|
|
#[arg(short = 'f', long)]
|
|
face: Option<PathBuf>,
|
|
|
|
#[arg(short = 'x', long)]
|
|
asrx: Option<PathBuf>,
|
|
|
|
#[arg(short = 'p', long)]
|
|
pose: Option<PathBuf>,
|
|
|
|
#[arg(short = 's', long, default_value = "0.0")]
|
|
start: f64,
|
|
|
|
#[arg(long)]
|
|
speaker_name: Option<String>,
|
|
|
|
#[arg(long)]
|
|
auto_play_speaker: bool,
|
|
|
|
#[arg(long)]
|
|
demo: bool,
|
|
|
|
#[arg(long, default_value = "3")]
|
|
demo_segments_per_speaker: usize,
|
|
|
|
#[arg(long, default_value = "2.0")]
|
|
demo_speed: f64,
|
|
|
|
#[arg(long)]
|
|
show_video: bool,
|
|
|
|
#[arg(long, default_value = "800")]
|
|
video_width: u32,
|
|
|
|
#[arg(long, default_value = "600")]
|
|
video_height: u32,
|
|
|
|
#[arg(long)]
|
|
continuous_demo: bool,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
struct AsrSegment {
|
|
start: f64,
|
|
end: f64,
|
|
text: String,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
struct AsrData {
|
|
language: Option<String>,
|
|
segments: Vec<AsrSegment>,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
struct FaceInfo {
|
|
face_id: Option<String>,
|
|
x: i32,
|
|
y: i32,
|
|
width: i32,
|
|
height: i32,
|
|
confidence: f64,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
struct FaceFrame {
|
|
frame: u64,
|
|
timestamp: f64,
|
|
faces: Vec<FaceInfo>,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
struct FaceData {
|
|
fps: f64,
|
|
frame_count: u64,
|
|
frames: Vec<FaceFrame>,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
struct AsrxSegment {
|
|
index: usize,
|
|
start: f64,
|
|
end: f64,
|
|
duration: f64,
|
|
speaker: String,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
struct AsrxData {
|
|
segments: Vec<AsrxSegment>,
|
|
speaker_stats: HashMap<String, SpeakerStats>,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
struct SpeakerStats {
|
|
count: usize,
|
|
duration: f64,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
struct Keypoint {
|
|
name: String,
|
|
x: f32,
|
|
y: f32,
|
|
confidence: f32,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
struct PersonPose {
|
|
keypoints: Vec<Keypoint>,
|
|
bbox: Bbox,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
struct Bbox {
|
|
x: i32,
|
|
y: i32,
|
|
width: i32,
|
|
height: i32,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
struct PoseFrame {
|
|
frame: u64,
|
|
timestamp: f64,
|
|
persons: Vec<PersonPose>,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
struct PoseData {
|
|
frames: Vec<PoseFrame>,
|
|
}
|
|
|
|
#[derive(Debug, Clone)]
|
|
struct IntegratedSegment {
|
|
start: f64,
|
|
end: f64,
|
|
text: Option<String>,
|
|
speaker: Option<String>,
|
|
face: Option<FaceInfo>,
|
|
mouth_landmarks: Option<Vec<Keypoint>>,
|
|
}
|
|
|
|
struct IntegratedPlayer {
|
|
asr_data: Option<AsrData>,
|
|
face_data: Option<FaceData>,
|
|
asrx_data: Option<AsrxData>,
|
|
pose_data: Option<PoseData>,
|
|
current_time: f64,
|
|
speaker_names: HashMap<String, (String, String)>,
|
|
}
|
|
|
|
impl IntegratedPlayer {
|
|
fn new() -> Self {
|
|
let mut speaker_names = HashMap::new();
|
|
speaker_names.insert(
|
|
"SPEAKER_0".to_string(),
|
|
("Cary Grant".to_string(), "Peter Joshua".to_string()),
|
|
);
|
|
speaker_names.insert(
|
|
"SPEAKER_1".to_string(),
|
|
("Audrey Hepburn".to_string(), "Regina Lampert".to_string()),
|
|
);
|
|
speaker_names.insert(
|
|
"SPEAKER_2".to_string(),
|
|
(
|
|
"Walter Matthau".to_string(),
|
|
"Hamilton Bartholomew".to_string(),
|
|
),
|
|
);
|
|
speaker_names.insert(
|
|
"SPEAKER_4".to_string(),
|
|
("James Coburn".to_string(), "Tex Panthollow".to_string()),
|
|
);
|
|
|
|
Self {
|
|
asr_data: None,
|
|
face_data: None,
|
|
asrx_data: None,
|
|
pose_data: None,
|
|
current_time: 0.0,
|
|
speaker_names,
|
|
}
|
|
}
|
|
|
|
fn load_asr(&mut self, path: &PathBuf) -> Result<()> {
|
|
let content = std::fs::read_to_string(path)
|
|
.with_context(|| format!("Failed to read ASR file: {:?}", path))?;
|
|
self.asr_data = Some(serde_json::from_str(&content)?);
|
|
println!(
|
|
"✓ Loaded {} ASR segments",
|
|
self.asr_data.as_ref().unwrap().segments.len()
|
|
);
|
|
Ok(())
|
|
}
|
|
|
|
fn load_face(&mut self, path: &PathBuf) -> Result<()> {
|
|
let content = std::fs::read_to_string(path)
|
|
.with_context(|| format!("Failed to read Face file: {:?}", path))?;
|
|
self.face_data = Some(serde_json::from_str(&content)?);
|
|
let total_faces = self
|
|
.face_data
|
|
.as_ref()
|
|
.unwrap()
|
|
.frames
|
|
.iter()
|
|
.map(|f| f.faces.len())
|
|
.sum::<usize>();
|
|
println!(
|
|
"✓ Loaded {} face frames, {} total detections",
|
|
self.face_data.as_ref().unwrap().frames.len(),
|
|
total_faces
|
|
);
|
|
Ok(())
|
|
}
|
|
|
|
fn load_asrx(&mut self, path: &PathBuf) -> Result<()> {
|
|
let content = std::fs::read_to_string(path)
|
|
.with_context(|| format!("Failed to read ASRX file: {:?}", path))?;
|
|
self.asrx_data = Some(serde_json::from_str(&content)?);
|
|
println!(
|
|
"✓ Loaded {} ASRX segments, {} speakers",
|
|
self.asrx_data.as_ref().unwrap().segments.len(),
|
|
self.asrx_data.as_ref().unwrap().speaker_stats.len()
|
|
);
|
|
Ok(())
|
|
}
|
|
|
|
fn load_pose(&mut self, path: &PathBuf) -> Result<()> {
|
|
let content = std::fs::read_to_string(path)
|
|
.with_context(|| format!("Failed to read Pose file: {:?}", path))?;
|
|
self.pose_data = Some(serde_json::from_str(&content)?);
|
|
println!(
|
|
"✓ Loaded {} pose frames",
|
|
self.pose_data.as_ref().unwrap().frames.len()
|
|
);
|
|
Ok(())
|
|
}
|
|
|
|
fn get_current_segment(&self, time: f64) -> Option<IntegratedSegment> {
|
|
let mut segment = IntegratedSegment {
|
|
start: 0.0,
|
|
end: 0.0,
|
|
text: None,
|
|
speaker: None,
|
|
face: None,
|
|
mouth_landmarks: None,
|
|
};
|
|
|
|
if let Some(asr) = &self.asr_data {
|
|
for seg in &asr.segments {
|
|
if time >= seg.start && time <= seg.end {
|
|
segment.start = seg.start;
|
|
segment.end = seg.end;
|
|
segment.text = Some(seg.text.clone());
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if let Some(asrx) = &self.asrx_data {
|
|
for seg in &asrx.segments {
|
|
if time >= seg.start && time <= seg.end {
|
|
segment.start = seg.start;
|
|
segment.end = seg.end;
|
|
segment.speaker = Some(seg.speaker.clone());
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if let Some(face) = &self.face_data {
|
|
for frame in &face.frames {
|
|
if (frame.timestamp - time).abs() < 1.0 {
|
|
if let Some(face_info) = frame.faces.first() {
|
|
segment.face = Some(face_info.clone());
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if let Some(pose) = &self.pose_data {
|
|
for frame in &pose.frames {
|
|
if (frame.timestamp - time).abs() < 0.5 {
|
|
if let Some(person) = frame.persons.first() {
|
|
let mouth_points: Vec<Keypoint> = person
|
|
.keypoints
|
|
.iter()
|
|
.filter(|kp| {
|
|
kp.name.contains("mouth")
|
|
|| kp.name.contains("lip")
|
|
|| kp.name == "nose"
|
|
})
|
|
.cloned()
|
|
.collect();
|
|
if !mouth_points.is_empty() {
|
|
segment.mouth_landmarks = Some(mouth_points);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if segment.text.is_some()
|
|
|| segment.speaker.is_some()
|
|
|| segment.face.is_some()
|
|
|| segment.mouth_landmarks.is_some()
|
|
{
|
|
Some(segment)
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
fn get_speaker_info(&self, speaker_id: &str) -> (String, String) {
|
|
self.speaker_names
|
|
.get(speaker_id)
|
|
.cloned()
|
|
.unwrap_or_else(|| ("Unknown".to_string(), "Unknown".to_string()))
|
|
}
|
|
|
|
fn list_speakers(&self) {
|
|
if let Some(asrx) = &self.asrx_data {
|
|
println!("\n📊 Speaker Statistics:");
|
|
println!("{:-<80}", "");
|
|
println!(
|
|
"{:15} {:20} {:20} {:>10} {:>10}",
|
|
"Speaker ID", "Actor", "Character", "Segments", "Duration"
|
|
);
|
|
println!("{:-<80}", "");
|
|
|
|
for (speaker_id, stats) in &asrx.speaker_stats {
|
|
let (actor, character) = self.get_speaker_info(speaker_id);
|
|
println!(
|
|
"{:15} {:20} {:20} {:>10} {:>9.1}s",
|
|
speaker_id, actor, character, stats.count, stats.duration
|
|
);
|
|
}
|
|
println!("{:-<80}", "");
|
|
}
|
|
}
|
|
}
|
|
|
|
fn run_continuous_demo(player: &IntegratedPlayer, args: &Args) -> Result<()> {
|
|
println!("\n🎬 Continuous Demo Mode");
|
|
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
|
|
|
|
let is_interactive = io::stdin().is_terminal();
|
|
if is_interactive {
|
|
println!("Controls:");
|
|
println!(" SPACE - Pause/Resume");
|
|
println!(" Q - Quit");
|
|
} else {
|
|
println!("Running in non-interactive mode (no keyboard control)");
|
|
println!("Use Ctrl+C to stop");
|
|
}
|
|
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
|
|
println!();
|
|
|
|
let paused = Arc::new(AtomicBool::new(false));
|
|
let quit = Arc::new(AtomicBool::new(false));
|
|
|
|
let paused_clone = paused.clone();
|
|
let quit_clone = quit.clone();
|
|
|
|
let raw_mode_enabled = if is_interactive {
|
|
crossterm_terminal::enable_raw_mode().ok().is_some()
|
|
} else {
|
|
false
|
|
};
|
|
|
|
if is_interactive && raw_mode_enabled {
|
|
thread::spawn(move || loop {
|
|
if let Ok(Event::Key(key_event)) = event::read() {
|
|
if key_event.code == KeyCode::Char(' ') {
|
|
paused_clone.fetch_xor(true, Ordering::SeqCst);
|
|
} else if key_event.code == KeyCode::Char('q')
|
|
|| key_event.code == KeyCode::Char('Q')
|
|
|| key_event.code == KeyCode::Esc
|
|
{
|
|
quit_clone.store(true, Ordering::SeqCst);
|
|
break;
|
|
}
|
|
}
|
|
if quit_clone.load(Ordering::SeqCst) {
|
|
break;
|
|
}
|
|
thread::sleep(Duration::from_millis(50));
|
|
});
|
|
}
|
|
|
|
if let Some(asr) = &player.asr_data {
|
|
let total_segments = asr.segments.len();
|
|
|
|
for (i, seg) in asr.segments.iter().enumerate() {
|
|
if quit.load(Ordering::SeqCst) {
|
|
println!("\n⏹️ Stopped by user");
|
|
break;
|
|
}
|
|
|
|
while paused.load(Ordering::SeqCst) {
|
|
println!("\r⏸️ Paused - Press SPACE to resume");
|
|
io::stdout().flush()?;
|
|
thread::sleep(Duration::from_millis(100));
|
|
|
|
if quit.load(Ordering::SeqCst) {
|
|
println!("\n⏹️ Stopped by user");
|
|
if raw_mode_enabled {
|
|
crossterm_terminal::disable_raw_mode().ok();
|
|
}
|
|
return Ok(());
|
|
}
|
|
}
|
|
|
|
println!("\n[{}/{}] Segment", i + 1, total_segments);
|
|
println!("{:=<80}", "");
|
|
println!("📝 ASR Text: {}", seg.text);
|
|
println!("⏱ Time: {:.2}s - {:.2}s", seg.start, seg.end);
|
|
|
|
if let Some(asrx) = &player.asrx_data {
|
|
for asrx_seg in &asrx.segments {
|
|
if seg.start >= asrx_seg.start && seg.start <= asrx_seg.end {
|
|
let (actor, character) = player.get_speaker_info(&asrx_seg.speaker);
|
|
println!(
|
|
"🎤 Speaker: {} → {} ({})",
|
|
asrx_seg.speaker, actor, character
|
|
);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if let Some(segment) = player.get_current_segment(seg.start + 0.01) {
|
|
if let Some(face) = &segment.face {
|
|
println!(
|
|
"👤 Face: bbox=({},{}) {}x{}, conf={:.2}",
|
|
face.x, face.y, face.width, face.height, face.confidence
|
|
);
|
|
}
|
|
if let Some(landmarks) = &segment.mouth_landmarks {
|
|
println!("👄 Mouth landmarks: {} points", landmarks.len());
|
|
}
|
|
}
|
|
|
|
let duration = seg.end - seg.start;
|
|
println!(
|
|
"▶️ Playing: {:.2}s - {:.2}s ({:.2}s)",
|
|
seg.start, seg.end, duration
|
|
);
|
|
|
|
let mut cmd = Command::new("ffplay");
|
|
if args.show_video {
|
|
cmd.args([
|
|
"-ss",
|
|
&format!("{:.2}", seg.start),
|
|
"-t",
|
|
&format!("{:.2}", duration),
|
|
"-autoexit",
|
|
"-x",
|
|
&format!("{}", args.video_width),
|
|
"-y",
|
|
&format!("{}", args.video_height),
|
|
args.video.to_str().unwrap(),
|
|
]);
|
|
} else {
|
|
cmd.args([
|
|
"-ss",
|
|
&format!("{:.2}", seg.start),
|
|
"-t",
|
|
&format!("{:.2}", duration),
|
|
"-autoexit",
|
|
"-nodisp",
|
|
args.video.to_str().unwrap(),
|
|
]);
|
|
}
|
|
|
|
let _child = cmd
|
|
.stdout(Stdio::null())
|
|
.stderr(Stdio::null())
|
|
.spawn()
|
|
.context("Failed to start ffplay")?;
|
|
|
|
thread::sleep(Duration::from_millis((duration * 1000.0) as u64 + 100));
|
|
}
|
|
|
|
println!("\n{:=<80}", "");
|
|
println!("✅ Demo completed! Played {} segments", total_segments);
|
|
println!("{:=<80}", "");
|
|
} else if let Some(asrx) = &player.asrx_data {
|
|
let total_segments = asrx.segments.len();
|
|
println!(
|
|
"Playing {} ASRX segments (no ASR text available)",
|
|
total_segments
|
|
);
|
|
|
|
for (i, seg) in asrx.segments.iter().enumerate() {
|
|
if quit.load(Ordering::SeqCst) {
|
|
println!("\n⏹️ Stopped by user");
|
|
break;
|
|
}
|
|
|
|
while paused.load(Ordering::SeqCst) {
|
|
println!("\r⏸️ Paused - Press SPACE to resume");
|
|
io::stdout().flush()?;
|
|
thread::sleep(Duration::from_millis(100));
|
|
|
|
if quit.load(Ordering::SeqCst) {
|
|
println!("\n⏹️ Stopped by user");
|
|
if raw_mode_enabled {
|
|
crossterm_terminal::disable_raw_mode().ok();
|
|
}
|
|
return Ok(());
|
|
}
|
|
}
|
|
|
|
let (actor, character) = player.get_speaker_info(&seg.speaker);
|
|
|
|
println!("\n[{}/{}] Segment", i + 1, total_segments);
|
|
println!("{:=<80}", "");
|
|
println!(
|
|
"⏱ Time: {:.2}s - {:.2}s ({:.2}s)",
|
|
seg.start, seg.end, seg.duration
|
|
);
|
|
println!("🎤 Speaker: {} → {} ({})", seg.speaker, actor, character);
|
|
|
|
if let Some(segment) = player.get_current_segment(seg.start + 0.01) {
|
|
if let Some(face) = &segment.face {
|
|
println!(
|
|
"👤 Face: bbox=({},{}) {}x{}, conf={:.2}",
|
|
face.x, face.y, face.width, face.height, face.confidence
|
|
);
|
|
}
|
|
if let Some(landmarks) = &segment.mouth_landmarks {
|
|
println!("👄 Mouth landmarks: {} points", landmarks.len());
|
|
}
|
|
}
|
|
|
|
println!("▶️ Playing audio segment");
|
|
|
|
let mut cmd = Command::new("ffplay");
|
|
if args.show_video {
|
|
cmd.args([
|
|
"-ss",
|
|
&format!("{:.2}", seg.start),
|
|
"-t",
|
|
&format!("{:.2}", seg.duration),
|
|
"-autoexit",
|
|
"-x",
|
|
&format!("{}", args.video_width),
|
|
"-y",
|
|
&format!("{}", args.video_height),
|
|
args.video.to_str().unwrap(),
|
|
]);
|
|
} else {
|
|
cmd.args([
|
|
"-ss",
|
|
&format!("{:.2}", seg.start),
|
|
"-t",
|
|
&format!("{:.2}", seg.duration),
|
|
"-autoexit",
|
|
"-nodisp",
|
|
args.video.to_str().unwrap(),
|
|
]);
|
|
}
|
|
|
|
let _child = cmd
|
|
.stdout(Stdio::null())
|
|
.stderr(Stdio::null())
|
|
.spawn()
|
|
.context("Failed to start ffplay")?;
|
|
|
|
thread::sleep(Duration::from_millis((seg.duration * 1000.0) as u64 + 100));
|
|
}
|
|
|
|
println!("\n{:=<80}", "");
|
|
println!("✅ Demo completed! Played {} segments", total_segments);
|
|
println!("{:=<80}", "");
|
|
} else {
|
|
println!("⚠️ No ASR or ASRX data loaded");
|
|
}
|
|
|
|
if raw_mode_enabled {
|
|
crossterm_terminal::disable_raw_mode().ok();
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
fn main() -> Result<()> {
|
|
let args = Args::parse();
|
|
|
|
if !args.video.exists() {
|
|
anyhow::bail!("Video file not found: {:?}", args.video);
|
|
}
|
|
|
|
println!("🎬 Integrated Player for ASR/Face/ASRX/Pose");
|
|
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
|
|
println!("Video: {:?}", args.video);
|
|
|
|
let mut player = IntegratedPlayer::new();
|
|
|
|
if let Some(asr_path) = &args.asr {
|
|
if asr_path.exists() {
|
|
player.load_asr(asr_path)?;
|
|
}
|
|
}
|
|
|
|
if let Some(face_path) = &args.face {
|
|
if face_path.exists() {
|
|
player.load_face(face_path)?;
|
|
}
|
|
}
|
|
|
|
if let Some(asrx_path) = &args.asrx {
|
|
if asrx_path.exists() {
|
|
player.load_asrx(asrx_path)?;
|
|
}
|
|
}
|
|
|
|
if let Some(pose_path) = &args.pose {
|
|
if pose_path.exists() {
|
|
player.load_pose(pose_path)?;
|
|
}
|
|
}
|
|
|
|
player.list_speakers();
|
|
|
|
if args.continuous_demo {
|
|
run_continuous_demo(&player, &args)?;
|
|
} else {
|
|
println!("\n⚠️ Please use --continuous-demo flag");
|
|
}
|
|
|
|
Ok(())
|
|
}
|