#!/usr/bin/env python3
"""
Scene Classification Processor

Performs scene recognition with Core ML + a Places365 model.

Optimized for Apple Silicon M4:
- Core ML model (native)
- PyTorch + MPS (fallback)
"""

import argparse
import json
import sys
import time
import traceback
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Any

# Optional backend: Core ML.
try:
    import coremltools as ct

    HAS_COREML = True
except ImportError:
    HAS_COREML = False

# Optional backend: PyTorch + torchvision (fallback).
try:
    import torch
    from torchvision import transforms, models

    HAS_TORCH = True
    DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
except ImportError:
    # ``torch`` itself may be the package that failed to import, so it must
    # not be referenced here; DEVICE is only read when HAS_TORCH is True.
    HAS_TORCH = False
    DEVICE = None

# Optional dependency: Pillow, used for image handling.
try:
    from PIL import Image

    HAS_PIL = True
except ImportError:
    HAS_PIL = False

# Optional dependency: OpenCV, used for video decoding.
try:
    import cv2

    HAS_CV = True
except ImportError:
    HAS_CV = False

# Load the Places365 category table that ships next to this file.
# The table is looked up by stringified class index, so it must be a JSON
# object; anything else is rejected and the table stays empty.
PLACES365_CATEGORIES = {}
try:
    categories_path = Path(__file__).parent / "places365_categories.json"
    if categories_path.exists():
        with open(categories_path, "r", encoding="utf-8") as f:
            _loaded_categories = json.load(f)
        if isinstance(_loaded_categories, dict):
            PLACES365_CATEGORIES = _loaded_categories
            print(f"[SCENE] Loaded {len(PLACES365_CATEGORIES)} Places365 categories")
        else:
            print(
                "[SCENE] Warning: Could not load Places365 categories: "
                "expected a JSON object"
            )
except (OSError, ValueError) as e:
    # OSError: unreadable file; ValueError: malformed JSON or bad encoding.
    print(f"[SCENE] Warning: Could not load Places365 categories: {e}")


# Single source of truth for the supported scene types (a Places365 subset):
# each row pairs the English scene label with its Chinese display name.
_SCENE_TABLE = (
    ("hospital_room", "醫院病房"),
    ("pharmacy", "藥房"),
    ("classroom", "教室"),
    ("office", "辦公室"),
    ("kitchen", "廚房"),
    ("living_room", "客廳"),
    ("bedroom", "臥室"),
    ("bathroom", "浴室"),
    ("restaurant", "餐廳"),
    ("gym", "健身房"),
    ("supermarket", "超市"),
    ("basketball_court", "籃球場"),
    ("football_field", "足球場"),
    ("tennis_court", "網球場"),
    ("swimming_pool", "游泳池"),
    ("park", "公園"),
    ("street", "街道"),
    ("beach", "海灘"),
    ("mountain", "山地"),
    ("forest", "森林"),
    ("airport", "機場"),
    ("train_station", "火車站"),
    ("subway_station", "地鐵站"),
    ("gas_station", "加油站"),
    ("parking_lot", "停車場"),
    ("auditorium", "禮堂"),
    ("library", "圖書館"),
    ("laboratory", "實驗室"),
    ("art_studio", "藝術工作室"),
    ("music_store", "音樂商店"),
    ("computer_room", "電腦室"),
    ("conference_room", "會議室"),
    ("playground", "遊樂場"),
    ("ski_slope", "滑雪坡"),
    ("ice_rink", "溜冰場"),
    ("boxing_ring", "拳擊場"),
    ("volleyball_court", "排球場"),
    ("baseball_field", "棒球場"),
)

# English scene label -> Chinese display name.
SCENE_TYPE_ZH = dict(_SCENE_TABLE)

# Scene labels only, in the same order as the table above.
SCENE_CATEGORIES = [label for label, _ in _SCENE_TABLE]


class SceneClassifier:
    """Classify scenes in single frames and in whole videos.

    A Core ML model is used when ``coremltools`` is importable and
    ``model_path`` points at an existing file. Otherwise a torchvision
    ResNet18 is used: loaded from a Places365 checkpoint when that file
    exists, and from ImageNet-pretrained weights as a last resort.
    """

    # Checkpoint used by the PyTorch backend when no explicit path is given.
    DEFAULT_PLACES365_MODEL_PATH = (
        "/Users/accusys/momentry/models/resnet18_places365.pth.tar"
    )
    # Number of ranked predictions reported per frame.
    TOP_K = 5

    def __init__(
        self,
        model_path: Optional[str] = None,
        places365_model_path: Optional[str] = None,
    ):
        """
        Initialise the classifier without loading any model yet.

        Args:
            model_path: Path to a Core ML model (optional).
            places365_model_path: Path to the Places365 ResNet18 checkpoint
                used by the PyTorch fallback; defaults to
                ``DEFAULT_PLACES365_MODEL_PATH``.
        """
        self.model_path = model_path
        self.places365_model_path = (
            places365_model_path or self.DEFAULT_PLACES365_MODEL_PATH
        )
        self.model = None
        self.coreml_model = None
        self.model_type = "unknown"

        # The preprocessing pipeline is only needed by the PyTorch backend and
        # torchvision may be absent, so build it only when it is importable.
        self.transform = None
        if HAS_TORCH:
            self.transform = transforms.Compose(
                [
                    transforms.Resize((224, 224)),
                    transforms.ToTensor(),
                    transforms.Normalize(
                        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
                    ),
                ]
            )

    def load_model(self) -> bool:
        """
        Load a model, preferring Core ML and falling back to PyTorch.

        Returns:
            bool: True when a model was loaded, False otherwise.
        """
        coreml_candidate = (
            HAS_COREML and self.model_path and Path(self.model_path).exists()
        )
        if coreml_candidate and self._load_coreml():
            return True

        if HAS_TORCH and self._load_pytorch():
            return True

        print("[SCENE] Error: No model available")
        return False

    def _load_coreml(self) -> bool:
        """Load the Core ML model at ``self.model_path``; return success."""
        try:
            print(f"[SCENE] Loading Core ML model: {self.model_path}")
            self.coreml_model = ct.models.MLModel(self.model_path)
        except Exception as e:
            # Best effort: the caller falls back to the PyTorch backend.
            print(f"[SCENE] Warning: Failed to load Core ML model: {e}")
            return False

        self.model_type = "coreml"
        print("[SCENE] Core ML model loaded successfully")
        return True

    def _load_pytorch(self) -> bool:
        """Load the ResNet18 fallback (Places365 or ImageNet); return success."""
        try:
            print(f"[SCENE] Loading PyTorch model on {DEVICE}")

            if Path(self.places365_model_path).exists():
                print(f"[SCENE] Loading Places365 model: {self.places365_model_path}")
                checkpoint = torch.load(
                    self.places365_model_path, map_location=DEVICE
                )
                # Places365 has 365 classes.
                net = models.resnet18(num_classes=365)
                # Accept both a full checkpoint and a bare state dict.
                state_dict = checkpoint.get("state_dict", checkpoint)
                net.load_state_dict(self._strip_module_prefix(state_dict))
                loaded_type = "places365"
                print("[SCENE] Places365 model loaded successfully (365 classes)")
            else:
                print("[SCENE] Places365 model not found, using ImageNet pretrained")
                net = models.resnet18(pretrained=True)
                loaded_type = "imagenet"

            net.to(DEVICE)
            net.eval()
        except Exception as e:
            print(f"[SCENE] Warning: Failed to load PyTorch model: {e}")
            traceback.print_exc()
            return False

        # Publish the model only once it is fully initialised, so a failed
        # load never leaves a half-built network behind for predict_frame.
        self.model = net
        self.model_type = loaded_type
        print("[SCENE] PyTorch model loaded successfully")
        return True

    @staticmethod
    def _strip_module_prefix(state_dict: Dict[str, Any]) -> Dict[str, Any]:
        """Drop the ``module.`` key prefix left by DataParallel training."""
        prefix = "module."
        return {
            (key[len(prefix):] if key.startswith(prefix) else key): value
            for key, value in state_dict.items()
        }

    def predict_frame(self, frame: Any) -> List[Dict[str, Any]]:
        """
        Predict the scene type of a single frame.

        Args:
            frame: An image file path (``str`` or ``Path``), an OpenCV BGR
                ndarray with three dimensions, or a PIL image.

        Returns:
            List[Dict]: Up to five ``{"scene_type", "confidence"}`` entries,
            best first. Empty when no model is loaded, the frame cannot be
            converted, or inference fails.
        """
        if self.coreml_model is None and self.model is None:
            print("[SCENE] Warning: No model loaded")
            return []

        img = self._to_pil_image(frame)
        if img is None:
            return []

        # Core ML takes precedence; there is no fallback to PyTorch on error.
        if self.coreml_model is not None:
            return self._predict_coreml(img)
        return self._predict_pytorch(img)

    @staticmethod
    def _to_pil_image(frame: Any) -> Any:
        """Convert a supported frame type to an RGB PIL image, else None."""
        if isinstance(frame, (str, Path)):
            try:
                return Image.open(frame).convert("RGB")
            except OSError as e:
                # Missing or unreadable image file.
                print(f"[SCENE] Warning: Failed to convert to PIL Image: {e}")
                return None

        if HAS_CV and hasattr(frame, "shape") and len(frame.shape) == 3:
            # OpenCV frame (BGR ndarray).
            return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        if hasattr(frame, "convert"):
            # PIL image.
            return frame.convert("RGB")

        print(f"[SCENE] Warning: Unknown frame type: {type(frame)}")
        return None

    def _predict_coreml(self, img: Any) -> List[Dict[str, Any]]:
        """Run the Core ML model on ``img`` and return the top predictions."""
        try:
            # NOTE(review): assumes the model exposes an "image" input and a
            # "probs" output mapping label -> probability; confirm against
            # the exported model.
            output = self.coreml_model.predict({"image": img})
            probs = output.get("probs", {})
            ranked = sorted(probs.items(), key=lambda item: item[1], reverse=True)
            return [
                {"scene_type": label, "confidence": float(conf)}
                for label, conf in ranked[: self.TOP_K]
            ]
        except Exception as e:
            print(f"[SCENE] Core ML prediction error: {e}")
            return []

    def _predict_pytorch(self, img: Any) -> List[Dict[str, Any]]:
        """Run the PyTorch model on ``img`` and return the top predictions."""
        try:
            with torch.no_grad():
                input_tensor = self.transform(img).unsqueeze(0).to(DEVICE)
                outputs = self.model(input_tensor)
                probs = torch.nn.functional.softmax(outputs, dim=1)
                k = min(self.TOP_K, probs.shape[1])
                top_probs, top_indices = torch.topk(probs, k)

            # ImageNet class indices must not be translated through the
            # Places365 table; they get the generic "scene_<idx>" label.
            use_places_names = self.model_type != "imagenet"

            results = []
            for prob, idx in zip(top_probs[0].tolist(), top_indices[0].tolist()):
                generic = f"scene_{idx}"
                if use_places_names:
                    scene_type = PLACES365_CATEGORIES.get(str(idx), generic)
                else:
                    scene_type = generic
                results.append({"scene_type": scene_type, "confidence": prob})
            return results
        except Exception as e:
            print(f"[SCENE] PyTorch prediction error: {e}")
            traceback.print_exc()
            return []

    def classify_video(
        self,
        video_path: str,
        output_path: str,
        sample_interval: float = 2.0,
        min_scene_duration: float = 3.0,
    ) -> Dict[str, Any]:
        """
        Classify a whole video and write the result as JSON.

        Args:
            video_path: Path of the input video.
            output_path: Path of the JSON file to write.
            sample_interval: Sampling interval in seconds.
            min_scene_duration: Minimum scene duration in seconds.

        Returns:
            Dict: The classification result. When OpenCV is missing or the
            video cannot be opened, a stub without ``metadata`` is returned
            and no file is written.
        """
        if not HAS_CV:
            print("[SCENE] Error: OpenCV not available")
            return {"frame_count": 0, "fps": 0.0, "scenes": []}

        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                print(f"[SCENE] Error: Cannot open video: {video_path}")
                return {"frame_count": 0, "fps": 0.0, "scenes": []}

            fps = cap.get(cv2.CAP_PROP_FPS)
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            duration = total_frames / fps if fps > 0 else 0

            print(f"[SCENE] Video: {video_path}")
            print(
                f"[SCENE] FPS: {fps}, Frames: {total_frames}, Duration: {duration:.1f}s"
            )

            # Classify one frame every ``sample_interval`` seconds.
            sample_interval_frames = max(1, int(fps * sample_interval))
            predictions = []
            frame_count = 0

            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                frame_count += 1
                if frame_count % sample_interval_frames != 0:
                    continue

                # Containers may report fps == 0; avoid dividing by it.
                timestamp = frame_count / fps if fps > 0 else 0.0
                pred = self.predict_frame(frame)
                if pred:
                    predictions.append({"timestamp": timestamp, "predictions": pred})

                if len(predictions) % 10 == 0:
                    # The reported frame count can be 0 for some streams.
                    progress = (
                        (frame_count / total_frames) * 100 if total_frames > 0 else 0.0
                    )
                    print(
                        f"[SCENE] Progress: {progress:.1f}% ({len(predictions)} samples)"
                    )
        finally:
            # Release the capture even when decoding or inference raises.
            cap.release()

        print(f"[SCENE] Collected {len(predictions)} predictions")

        scenes = self._merge_scenes(predictions, min_scene_duration, duration)

        if self.coreml_model is not None:
            backend = "coreml"
        elif self.model is not None:
            backend = "pytorch"
        else:
            backend = "none"

        result = {
            "frame_count": total_frames,
            "fps": fps,
            "scenes": scenes,
            "metadata": {
                "video_path": video_path,
                "duration": duration,
                "sample_interval": sample_interval,
                "min_scene_duration": min_scene_duration,
                "processed_at": datetime.now().isoformat(),
                "model_type": backend,
            },
        }

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)

        print(f"[SCENE] Result saved to: {output_path}")
        print(f"[SCENE] Detected {len(scenes)} scenes")

        return result

    def _merge_scenes(
        self, predictions: List[Dict], min_duration: float, total_duration: float
    ) -> List[Dict[str, Any]]:
        """
        Summarise all sampled predictions as a single scene.

        The scene is labelled with the most frequent top-1 scene type, spans
        the first to the last sample timestamp, and carries the mean top-1
        confidence of the samples that produced a prediction.

        Args:
            predictions: Entries of ``{"timestamp", "predictions"}``.
            min_duration: Currently unused; kept for interface compatibility.
            total_duration: Currently unused; kept for interface compatibility.

        Returns:
            List[Dict]: A one-element list, or an empty list when no sample
            carries a prediction.
        """
        top1 = [p["predictions"][0] for p in predictions if p["predictions"]]
        if not top1:
            return []

        scene_counts: Dict[str, int] = {}
        for entry in top1:
            label = entry["scene_type"]
            scene_counts[label] = scene_counts.get(label, 0) + 1

        # Ties resolve to the label seen first (dicts keep insertion order).
        most_common_scene = max(scene_counts, key=scene_counts.get)

        # Average only over samples that contributed a confidence value.
        avg_confidence = sum(entry["confidence"] for entry in top1) / len(top1)

        first_pred = predictions[0]
        last_pred = predictions[-1]

        return [
            {
                "start_time": first_pred["timestamp"],
                "end_time": last_pred["timestamp"],
                "scene_type": most_common_scene,
                "scene_type_zh": SCENE_TYPE_ZH.get(most_common_scene),
                "confidence": avg_confidence,
                "top_5": first_pred["predictions"][:5],
            }
        ]


def main():
    """Command-line entry point: parse arguments, classify, print a summary.

    Exits with status 0 after ``--check-health`` and after writing an empty
    result because no model could be loaded; exits with status 1 when a
    positional path is missing or when PIL/OpenCV is unavailable.
    """
    cli = argparse.ArgumentParser(
        description="場景識別處理器 - 使用 Core ML + Places365"
    )
    cli.add_argument("video_path", nargs="?", help="輸入影片路徑")
    cli.add_argument("output_path", nargs="?", help="輸出 JSON 路徑")
    cli.add_argument("--uuid", help="影片 UUID (用於日誌)", default=None)
    cli.add_argument("--model", help="Core ML 模型路徑", default=None)
    cli.add_argument(
        "--sample-interval", type=float, default=2.0, help="取樣間隔 (秒),預設 2.0"
    )
    cli.add_argument(
        "--min-scene-duration",
        type=float,
        default=3.0,
        help="最小場景持續時間 (秒),預設 3.0",
    )
    cli.add_argument("--check-health", action="store_true", help="檢查環境並退出")
    options = cli.parse_args()

    # Health check: report which optional dependencies imported, then stop.
    if options.check_health:

        def availability(flag):
            return "✓ Available" if flag else "✗ Not available"

        print("=== 場景識別處理器健康檢查 ===")
        print(f"Core ML: {availability(HAS_COREML)}")
        print(f"PyTorch: {availability(HAS_TORCH)}")
        print(f"PIL: {availability(HAS_PIL)}")
        print(f"OpenCV: {availability(HAS_CV)}")
        if HAS_TORCH:
            print(f"Device: {DEVICE}")
        sys.exit(0)

    # Both positional paths are required for a real run.
    if not (options.video_path and options.output_path):
        cli.print_help()
        sys.exit(1)

    # Pillow and OpenCV are mandatory for video classification.
    if not (HAS_PIL and HAS_CV):
        print("[SCENE] Error: Missing required dependencies (PIL/OpenCV)")
        sys.exit(1)

    scene_classifier = SceneClassifier(model_path=options.model)

    if not scene_classifier.load_model():
        # Without a model, still emit a well-formed (empty) result file.
        print("[SCENE] Warning: No model loaded, will return empty results")
        placeholder = {
            "frame_count": 0,
            "fps": 0.0,
            "scenes": [],
            "metadata": {
                "video_path": options.video_path,
                "error": "No model available",
                "processed_at": datetime.now().isoformat(),
            },
        }
        with open(options.output_path, "w", encoding="utf-8") as out_file:
            json.dump(placeholder, out_file, ensure_ascii=False, indent=2)
        sys.exit(0)

    started_at = time.time()
    outcome = scene_classifier.classify_video(
        video_path=options.video_path,
        output_path=options.output_path,
        sample_interval=options.sample_interval,
        min_scene_duration=options.min_scene_duration,
    )
    elapsed = time.time() - started_at
    print(f"[SCENE] Completed in {elapsed:.1f}s")

    # Per-scene summary; nothing is printed when no scene was detected.
    detected = outcome["scenes"]
    if not detected:
        return

    print("\n[SCENE] 場景統計:")
    for scene in detected:
        scene_name = scene.get("scene_type_zh") or scene.get("scene_type")
        begin = scene["start_time"]
        end = scene["end_time"]
        span = end - begin
        percent = scene.get("confidence", 0) * 100
        print(
            f" - {scene_name}: {begin:.1f}s - {end:.1f}s ({span:.1f}s, {percent:.0f}%)"
        )


if __name__ == "__main__":
    main()