Files
momentry_core/scripts/language_router.py
Warren 8f05a7c188 feat: update Python processors and add utility scripts
- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00

316 lines
11 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
語言路由工具
根據語言檢測結果路由到相應的同義詞庫
"""
import sys
import json
import argparse
from typing import Dict, List, Optional, Any
from pathlib import Path
class LanguageRouter:
    """Route a detected language code to a configured synonym dictionary.

    The router holds a table of supported locales (``language_mappings``),
    each pointing at a synonym file, and resolves an arbitrary detected
    language code to one of those locales through a chain of fallbacks.
    """

    # Regional variants tried, in order, when only the primary language
    # subtag of the detected code is usable.
    _VARIANT_MAPPING = {
        "zh": ["zh-CN", "zh-TW", "zh-HK", "zh-SG", "zh-MO"],
        "en": ["en-US", "en-GB", "en-CA", "en-AU", "en-NZ"],
        "ja": ["ja-JP"],
        "ko": ["ko-KR"],
        "fr": ["fr-FR", "fr-CA", "fr-BE", "fr-CH"],
        "de": ["de-DE", "de-AT", "de-CH"],
        "es": ["es-ES", "es-MX", "es-AR", "es-CO"],
        "pt": ["pt-BR", "pt-PT"],
        "ru": ["ru-RU"],
        "ar": ["ar-SA", "ar-EG", "ar-AE"],
    }

    def __init__(self, config_file: Optional[str] = None):
        """Initialise the router.

        Args:
            config_file: Optional path to a JSON configuration file. When it
                is omitted or cannot be loaded, the built-in defaults apply.
        """
        self.config = self.load_config(config_file)
        self.language_mappings = self.config.get("language_mappings", {})
        self.default_language = self.config.get("default_language", "zh-CN")
        self.fallback_language = self.config.get("fallback_language", "en-US")

    @staticmethod
    def _default_config() -> Dict[str, Any]:
        """Return a fresh copy of the built-in configuration."""
        return {
            "default_language": "zh-CN",
            "fallback_language": "en-US",
            "language_mappings": {
                "zh-CN": {
                    "synonym_file": "synonyms_zh_CN.json",
                    "description": "簡體中文同義詞庫",
                },
                "zh-TW": {
                    "synonym_file": "synonyms_zh_TW.json",
                    "description": "繁體中文同義詞庫",
                },
                "en-US": {
                    "synonym_file": "synonyms_en_US.json",
                    "description": "美式英文同義詞庫",
                },
                "ja-JP": {
                    "synonym_file": "synonyms_ja_JP.json",
                    "description": "日文同義詞庫",
                },
                "ko-KR": {
                    "synonym_file": "synonyms_ko_KR.json",
                    "description": "韓文同義詞庫",
                },
            },
            "cross_language_fallback": {
                "enabled": True,
                "fallback_order": ["zh-CN", "zh-TW", "en-US", "ja-JP", "ko-KR"],
            },
        }

    def load_config(self, config_file: Optional[str]) -> Dict[str, Any]:
        """Load the configuration, layering a user file over the defaults.

        If the JSON document has a top-level ``language_routing`` key, only
        that sub-object is used. Any failure while reading or merging the
        file is reported on stderr and the defaults are returned instead.

        Args:
            config_file: Path to a JSON configuration file, or ``None``.

        Returns:
            The effective configuration dictionary.
        """
        default_config = self._default_config()
        if not config_file:
            return default_config

        try:
            with open(config_file, "r", encoding="utf-8") as f:
                user_config = json.load(f)
            if isinstance(user_config, dict) and "language_routing" in user_config:
                user_config = user_config["language_routing"]
            if not isinstance(user_config, dict):
                raise ValueError("configuration root must be a JSON object")
            return self.deep_merge(default_config, user_config)
        except Exception as e:
            # Deliberately best-effort: a broken config file must not stop
            # routing, so warn and continue with the defaults.
            print(f"警告: 無法加載配置文件 {config_file}: {e}", file=sys.stderr)
            print("使用默認配置", file=sys.stderr)
            return default_config

    def deep_merge(self, base: Dict, update: Dict) -> Dict:
        """Recursively merge ``update`` into a copy of ``base``.

        Where both sides hold a dict under the same key the two are merged
        recursively; in every other case the value from ``update`` wins.
        Neither argument is modified.

        Args:
            base: Dictionary providing the initial values.
            update: Dictionary whose values take precedence.

        Returns:
            The merged dictionary.
        """
        merged = base.copy()
        for key, value in update.items():
            current = merged.get(key)
            if isinstance(current, dict) and isinstance(value, dict):
                merged[key] = self.deep_merge(current, value)
            else:
                merged[key] = value
        return merged

    @staticmethod
    def _normalize_code(lang_code: Any) -> Optional[str]:
        """Return ``lang_code`` lower-cased with ``_`` mapped to ``-``.

        Returns ``None`` for non-string or blank input.
        """
        if not isinstance(lang_code, str):
            return None
        normalized = lang_code.strip().replace("_", "-").lower()
        return normalized or None

    def _match_configured(self, detected_lang: Any) -> Optional[str]:
        """Find the configured locale key corresponding to ``detected_lang``.

        An exact key match is preferred; otherwise keys are compared
        ignoring case and the ``_``/``-`` separator difference, so that
        ``zh-tw`` or ``zh_TW`` resolve to a configured ``zh-TW``.
        """
        if detected_lang in self.language_mappings:
            return detected_lang
        wanted = self._normalize_code(detected_lang)
        if wanted is None:
            return None
        for configured in self.language_mappings:
            if self._normalize_code(configured) == wanted:
                return configured
        return None

    def _apply_route(
        self, result: Dict[str, Any], lang: str, reason: Optional[str] = None
    ) -> Dict[str, Any]:
        """Record ``lang`` as the routing target in ``result`` and return it.

        A non-``None`` ``reason`` marks the route as a fallback.
        """
        entry = self.language_mappings.get(lang)
        result["routed_language"] = lang
        # Tolerate malformed mapping entries instead of raising KeyError.
        result["synonym_file"] = (
            entry.get("synonym_file") if isinstance(entry, dict) else None
        )
        if reason is not None:
            result["fallback_used"] = True
            result["fallback_reason"] = reason
        return result

    def route_language(
        self, detected_lang: str, confidence: float = 0.0
    ) -> Dict[str, Any]:
        """Resolve a detected language code to a configured locale.

        Resolution order:

        1. The detected code itself (exact, then case/separator-insensitive).
        2. A regional variant sharing the same primary language subtag.
        3. The first configured entry of the cross-language fallback order,
           when that feature is enabled.
        4. The default language.
        5. The fallback language.

        Args:
            detected_lang: Language code reported by the detector.
            confidence: Detection confidence; copied into the result as-is.

        Returns:
            A dictionary with ``detected_language``, ``confidence``,
            ``routed_language``, ``synonym_file``, ``fallback_used`` and
            ``available_languages``. ``fallback_reason`` is added when a
            fallback was taken, and ``error`` when nothing could be routed.
        """
        result = {
            "detected_language": detected_lang,
            "confidence": confidence,
            "routed_language": None,
            "synonym_file": None,
            "fallback_used": False,
            "available_languages": list(self.language_mappings.keys()),
        }

        # 1. Same locale, possibly written with different case or separator.
        direct = self._match_configured(detected_lang)
        if direct is not None:
            return self._apply_route(result, direct)

        # 2. Regional variants of the same primary language.
        for variant in self.get_language_variants(detected_lang):
            if variant in self.language_mappings:
                return self._apply_route(
                    result, variant, f"使用變體 {variant} 替代 {detected_lang}"
                )

        # 3. Cross-language fallback. Read the section once and tolerate it
        #    being absent or malformed rather than indexing the config again.
        fallback_cfg = self.config.get("cross_language_fallback", {})
        if not isinstance(fallback_cfg, dict):
            fallback_cfg = {}
        if fallback_cfg.get("enabled", True):
            for fallback_lang in fallback_cfg.get("fallback_order", []):
                if fallback_lang in self.language_mappings:
                    return self._apply_route(
                        result, fallback_lang, f"使用跨語言回退到 {fallback_lang}"
                    )

        # 4./5. Only reached when the cross-language fallback is disabled or
        #       its order list names no configured locale.
        if self.default_language in self.language_mappings:
            return self._apply_route(
                result,
                self.default_language,
                f"使用默認語言 {self.default_language}",
            )
        if self.fallback_language in self.language_mappings:
            return self._apply_route(
                result,
                self.fallback_language,
                f"使用回退語言 {self.fallback_language}",
            )

        result["error"] = "沒有可用的語言映射"
        return result

    def get_language_variants(self, lang_code: str) -> List[str]:
        """Return the known regional variants for a language code.

        Only the primary subtag (the part before the first ``-`` or ``_``)
        is considered, compared case-insensitively.

        Args:
            lang_code: Language code such as ``zh``, ``en-GB`` or ``pt_BR``.

        Returns:
            Candidate locale codes in preference order; empty when the
            language is unknown or ``lang_code`` is not a usable string.
        """
        normalized = self._normalize_code(lang_code)
        if normalized is None:
            return []
        lang_part = normalized.split("-")[0]
        # Return a copy so callers cannot mutate the shared table.
        return list(self._VARIANT_MAPPING.get(lang_part, []))

    def get_synonym_file_path(
        self, routed_result: Dict[str, Any], base_dir: str = "."
    ) -> Optional[Path]:
        """Locate the synonym file named in a routing result.

        The file is looked up directly under ``base_dir`` first and then in
        ``synonyms/``, ``data/synonyms/``, ``config/synonyms/`` and
        ``../synonyms/`` relative to it.

        Args:
            routed_result: Result dictionary from :meth:`route_language`.
            base_dir: Directory the search is rooted at.

        Returns:
            The first existing path, or ``None`` when the result names no
            synonym file or none of the candidates exists.
        """
        file_name = routed_result.get("synonym_file")
        if not file_name:
            return None

        root = Path(base_dir)
        candidates = [
            root / file_name,
            root / "synonyms" / file_name,
            root / "data" / "synonyms" / file_name,
            root / "config" / "synonyms" / file_name,
            root / ".." / "synonyms" / file_name,
        ]
        for candidate in candidates:
            if candidate.exists():
                return candidate
        return None
def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point: route a language code and print the result.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, preserving the original behaviour.

    Output modes:
        * ``--json``: the full result dictionary as indented JSON.
        * ``--verbose``: a human-readable multi-line report.
        * default: a single ``language:synonym_file`` line, suffixed with a
          marker when the synonym file was not found on disk.

    A routing error reported by the router is additionally written to
    stderr in the two text modes; stdout content and the exit status are
    not affected by it.
    """
    parser = argparse.ArgumentParser(description="語言路由工具")
    parser.add_argument("language", help="檢測到的語言代碼")
    parser.add_argument(
        "-c", "--confidence", type=float, default=0.0, help="檢測置信度"
    )
    parser.add_argument("-j", "--json", action="store_true", help="輸出 JSON 格式")
    parser.add_argument("-v", "--verbose", action="store_true", help="詳細輸出")
    parser.add_argument("--config", help="配置文件路徑")
    parser.add_argument("--base-dir", default=".", help="基礎目錄路徑")
    args = parser.parse_args(argv)

    # Route the language, then resolve the synonym file on disk.
    router = LanguageRouter(args.config)
    result = router.route_language(args.language, args.confidence)
    file_path = router.get_synonym_file_path(result, args.base_dir)
    result["file_path"] = str(file_path) if file_path else None
    result["file_exists"] = file_path is not None and file_path.exists()

    if args.json:
        # The JSON document already carries any "error" field.
        print(json.dumps(result, ensure_ascii=False, indent=2))
        return

    if result.get("error"):
        # Surface routing failures that the text formats would otherwise hide.
        print(f"錯誤: {result['error']}", file=sys.stderr)

    if args.verbose:
        print("語言路由結果:")
        print(f" 檢測到的語言: {result['detected_language']}")
        print(f" 置信度: {result['confidence']:.2%}")
        print(f" 路由到的語言: {result['routed_language']}")
        print(f" 同義詞檔案: {result['synonym_file']}")
        print(f" 檔案路徑: {result['file_path']}")
        print(f" 檔案存在: {result['file_exists']}")
        if result.get("fallback_used"):
            print(" 使用了回退: 是")
            print(f" 回退原因: {result.get('fallback_reason', '未知')}")
        else:
            print(" 使用了回退: 否")
        print(f" 可用語言: {', '.join(result['available_languages'])}")
        return

    summary = f"{result['routed_language']}:{result['synonym_file']}"
    if not result["file_exists"]:
        summary += " (檔案不存在)"
    print(summary)
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()