feat: add Vision LLM integration (CLIP + Qwen3-VL cascade)

- Add Qwen3-VL dynamic management (start/stop/status CLI)
- Add CLIP + Qwen3-VL cascade detection strategy
- Add Vision CLI commands (vision start/stop/status, detect)
- Add cascade_vision processor module
- Add clip processor module
- Add qwen_vl_manager module

Changes:
- scripts/start_qwen3vl.sh, scripts/stop_qwen3vl.sh: Qwen3-VL management scripts
- src/core/vision/: Qwen3-VL manager module
- src/core/processor/cascade_vision.rs: CLIP + Qwen3-VL cascade logic
- src/core/processor/clip.rs: CLIP classification and detection
- src/api/clip_api.rs: CLIP API endpoints
- src/cli/vision.rs: Vision CLI implementation
- src/cli/args.rs: Add Vision and Detect commands
- src/main.rs: Integrate Vision CLI
- src/core/mod.rs: Add vision module
- src/core/processor/mod.rs: Add cascade_vision module
This commit is contained in:
Accusys
2026-06-13 16:25:52 +08:00
parent 834b0d4865
commit 17e4e15860
37 changed files with 2185 additions and 294 deletions

194
src/api/clip_api.rs Normal file
View File

@@ -0,0 +1,194 @@
use axum::{
extract::{Query, State},
http::StatusCode,
response::{IntoResponse, Response},
routing::{get, post},
Json, Router,
};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use crate::core::processor::{classify_image, classify_images, detect_objects, ClipPrediction};
use crate::api::types::AppState;
/// JSON request body consumed by the `/api/v1/clip/classify` handler.
#[derive(Debug, Deserialize)]
pub struct ClassifyRequest {
/// Image location, forwarded unchanged to `classify_image`.
image_path: String,
/// Candidate labels packed into one comma-separated string; the handler
/// splits on `,` and trims each entry.
labels: String,
/// Forwarded to `classify_image` as `Some(top_k)` (presumably the number of
/// predictions to return, confirm against `classify_image`). Falls back to
/// `default_top_k()` when the field is absent from the request.
#[serde(default = "default_top_k")]
top_k: usize,
/// Optional model identifier forwarded to `classify_image`; what `None`
/// selects is decided there and is not visible from this file.
#[serde(default)]
model: Option<String>,
}
/// Serde fallback used for `top_k` when a request body omits the field.
fn default_top_k() -> usize {
    const DEFAULT_TOP_K: usize = 5;
    DEFAULT_TOP_K
}
/// JSON request body consumed by the `/api/v1/clip/detect` handler.
#[derive(Debug, Deserialize)]
pub struct DetectRequest {
/// Image location, forwarded unchanged to `detect_objects`.
image_path: String,
/// Object names packed into one comma-separated string; the handler splits
/// on `,` and trims each entry.
objects: String,
/// Forwarded to `detect_objects` as `Some(threshold)` (presumably a minimum
/// confidence, confirm against `detect_objects`). Falls back to
/// `default_threshold()` when the field is absent from the request.
#[serde(default = "default_threshold")]
threshold: f32,
/// Optional model identifier forwarded to `detect_objects`; what `None`
/// selects is decided there and is not visible from this file.
#[serde(default)]
model: Option<String>,
}
/// Serde fallback used for `threshold` when a request body omits the field.
fn default_threshold() -> f32 {
    const DEFAULT_THRESHOLD: f32 = 0.15;
    DEFAULT_THRESHOLD
}
/// JSON request body consumed by the `/api/v1/clip/batch` handler.
#[derive(Debug, Deserialize)]
pub struct BatchClassifyRequest {
/// Image locations packed into one comma-separated string; the handler
/// splits on `,` and trims each entry, so a path containing a comma cannot
/// be expressed.
image_paths: String,
/// Candidate labels packed into one comma-separated string; split and
/// trimmed the same way as `image_paths`.
labels: String,
/// Forwarded to `classify_images` as `Some(top_k)`. Falls back to
/// `default_top_k()` when the field is absent from the request.
#[serde(default = "default_top_k")]
top_k: usize,
/// Optional model identifier forwarded to `classify_images`; what `None`
/// selects is decided there and is not visible from this file.
#[serde(default)]
model: Option<String>,
}
/// JSON body returned by the classify handler when classification succeeds.
#[derive(Debug, Serialize)]
pub struct ClassifyResponse {
/// Set to `true` by the handler that builds this response.
success: bool,
/// Predictions exactly as returned by `classify_image`.
predictions: Vec<ClipPrediction>,
}
/// JSON body returned by the detect handler when detection succeeds.
#[derive(Debug, Serialize)]
pub struct DetectResponse {
/// Set to `true` by the handler that builds this response.
success: bool,
/// Entries exactly as returned by `detect_objects`; may be empty.
detected: Vec<ClipPrediction>,
}
/// JSON body returned by the batch handler when classification succeeds.
#[derive(Debug, Serialize)]
pub struct BatchClassifyResponse {
/// Set to `true` by the handler that builds this response.
success: bool,
/// Predictions keyed by the `image_path` of each result from
/// `classify_images`; results sharing a path collapse into one entry
/// because this is a map.
results: HashMap<String, Vec<ClipPrediction>>,
}
/// JSON body returned by the CLIP handlers when the underlying operation fails.
#[derive(Debug, Serialize)]
pub struct ErrorResponse {
/// Set to `false` by the handlers that build this response.
success: bool,
/// The error rendered with `to_string()`.
error: String,
}
/// Builds the router exposing the three CLIP endpoints, all registered as
/// `POST` routes under `/api/v1/clip/`.
pub fn clip_routes() -> Router<AppState> {
    // Register one route per step; each call consumes and returns the router.
    let router = Router::new();
    let router = router.route("/api/v1/clip/classify", post(classify_image_endpoint));
    let router = router.route("/api/v1/clip/detect", post(detect_objects_endpoint));
    router.route("/api/v1/clip/batch", post(batch_classify_endpoint))
}
async fn classify_image_endpoint(
State(_state): State<AppState>,
Json(req): Json<ClassifyRequest>,
) -> Response {
let labels: Vec<&str> = req.labels.split(',').map(|s| s.trim()).collect();
let result = classify_image(
&req.image_path,
&labels,
Some(req.top_k),
req.model.as_deref(),
).await;
match result {
Ok(predictions) => {
tracing::info!(
"[CLIP_API] Classified {} -> top: {} ({:.3})",
req.image_path,
predictions.first().map(|p| p.label.as_str()).unwrap_or("none"),
predictions.first().map(|p| p.confidence).unwrap_or(0.0)
);
Json(ClassifyResponse {
success: true,
predictions,
}).into_response()
}
Err(e) => {
tracing::error!("[CLIP_API] Classification failed: {}", e);
Json(ErrorResponse {
success: false,
error: e.to_string(),
}).into_response()
}
}
}
async fn detect_objects_endpoint(
State(_state): State<AppState>,
Json(req): Json<DetectRequest>,
) -> Response {
let objects: Vec<&str> = req.objects.split(',').map(|s| s.trim()).collect();
let result = detect_objects(
&req.image_path,
&objects,
Some(req.threshold),
req.model.as_deref(),
).await;
match result {
Ok(detected) => {
if !detected.is_empty() {
tracing::info!(
"[CLIP_API] Detected {} objects in {}: {}",
detected.len(),
req.image_path,
detected.iter().map(|p| p.label.as_str()).collect::<Vec<_>>().join(", ")
);
} else {
tracing::info!("[CLIP_API] No objects detected in {} (threshold: {:.2})", req.image_path, req.threshold);
}
Json(DetectResponse {
success: true,
detected,
}).into_response()
}
Err(e) => {
tracing::error!("[CLIP_API] Detection failed: {}", e);
Json(ErrorResponse {
success: false,
error: e.to_string(),
}).into_response()
}
}
}
async fn batch_classify_endpoint(
State(_state): State<AppState>,
Json(req): Json<BatchClassifyRequest>,
) -> Response {
let image_paths: Vec<&str> = req.image_paths.split(',').map(|s| s.trim()).collect();
let labels: Vec<&str> = req.labels.split(',').map(|s| s.trim()).collect();
let result = classify_images(
&image_paths,
&labels,
Some(req.top_k),
req.model.as_deref(),
).await;
match result {
Ok(results_vec) => {
let results: HashMap<String, Vec<ClipPrediction>> = results_vec
.into_iter()
.map(|r| (r.image_path, r.predictions))
.collect();
tracing::info!("[CLIP_API] Batch classified {} images", results.len());
Json(BatchClassifyResponse {
success: true,
results,
}).into_response()
}
Err(e) => {
tracing::error!("[CLIP_API] Batch classification failed: {}", e);
Json(ErrorResponse {
success: false,
error: e.to_string(),
}).into_response()
}
}
}