- Update ASR, face, OCR, pose processors - Add release pre-flight check script - Add synonym generation, chunk processing scripts - Add face recognition, stamp search utilities
452 lines
16 KiB
Python
452 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
統一格式多語系同義詞處理器
|
|
處理統一格式的多語系同義詞庫
|
|
"""
|
|
|
|
import sys
|
|
import json
|
|
import argparse
|
|
from typing import Dict, List, Optional, Any, Set
|
|
from pathlib import Path
|
|
import re
|
|
|
|
|
|
class UnifiedSynonymProcessor:
    """Query a synonym library stored in the unified multilingual JSON format.

    The file must be a JSON object with the keys ``version``, ``format``
    (equal to ``"unified_multilingual"``) and ``synonym_groups``.  Every
    group is an object with ``id``, ``primary_term``, ``language`` and
    ``synonyms``; an optional ``translations`` object maps a language code
    to a list of terms.
    """

    def __init__(self, unified_file: str):
        """Load and validate the synonym library.

        Args:
            unified_file: Path of the unified-format synonym file.

        Note:
            Loading terminates the process with exit status 1 when the file
            cannot be read, parsed or validated (see ``load_unified_data``).
        """
        self.unified_file = Path(unified_file)
        self.data = self.load_unified_data()

    def load_unified_data(self) -> Dict[str, Any]:
        """Read ``self.unified_file`` and return its validated content.

        Returns:
            The parsed JSON document.

        Note:
            On any read, decode, parse or validation failure an error is
            printed to stderr and the process exits with status 1.
        """
        try:
            with open(self.unified_file, "r", encoding="utf-8") as f:
                data = json.load(f)

            if not self.validate_unified_format(data):
                raise ValueError("無效的統一格式數據")

            return data
        except (OSError, ValueError) as e:
            # OSError: missing/unreadable file.  ValueError: invalid JSON,
            # invalid UTF-8, or the validation failure raised above.
            print(
                f"錯誤: 無法加載統一格式檔案 {self.unified_file}: {e}", file=sys.stderr
            )
            sys.exit(1)

    def validate_unified_format(self, data: Dict[str, Any]) -> bool:
        """Check that ``data`` has the structure the other methods rely on.

        The first problem found is reported on stderr.

        Args:
            data: Parsed document to check.

        Returns:
            ``True`` when the document is usable, ``False`` otherwise.
        """
        # A JSON file may hold a list or scalar at the top level; reject it
        # here instead of failing on ``.get`` further down.
        if not isinstance(data, dict):
            print("錯誤: 統一格式數據必須是字典", file=sys.stderr)
            return False

        for field in ("version", "format", "synonym_groups"):
            if field not in data:
                print(f"錯誤: 缺少必要字段 {field}", file=sys.stderr)
                return False

        if data["format"] != "unified_multilingual":
            print("錯誤: 格式必須為 'unified_multilingual'", file=sys.stderr)
            return False

        groups = data["synonym_groups"]
        if not isinstance(groups, list):
            print("錯誤: synonym_groups 必須是列表", file=sys.stderr)
            return False

        required_group_fields = ("id", "primary_term", "language", "synonyms")
        for i, group in enumerate(groups):
            if not isinstance(group, dict):
                print(f"錯誤: 同義詞組 {i} 必須是字典", file=sys.stderr)
                return False

            for field in required_group_fields:
                if field not in group:
                    print(f"錯誤: 同義詞組 {i} 缺少字段 {field}", file=sys.stderr)
                    return False

            # Every query method iterates or copies ``synonyms`` as a list.
            if not isinstance(group["synonyms"], list):
                print(f"錯誤: 同義詞組 {i} 的 synonyms 必須是列表", file=sys.stderr)
                return False

        return True

    def extract_language_specific(self, target_language: str) -> Dict[str, List[str]]:
        """Build the synonym mapping for the groups written in one language.

        For every group whose ``language`` equals ``target_language`` the
        result maps its primary term to its synonyms followed by the terms
        listed under ``translations[target_language]`` (if any).  Duplicates
        are dropped and first-seen order is kept, so output is reproducible.
        If several groups share a primary term, the last one wins.

        Args:
            target_language: Language code to extract.

        Returns:
            Mapping of primary term to de-duplicated synonym list.
        """
        result: Dict[str, List[str]] = {}

        for group in self.data["synonym_groups"]:
            if group["language"] != target_language:
                continue

            synonyms = list(group["synonyms"])
            synonyms.extend(group.get("translations", {}).get(target_language, []))

            # dict.fromkeys de-duplicates while keeping insertion order.
            result[group["primary_term"]] = list(dict.fromkeys(synonyms))

        return result

    def create_cross_language_mapping(self) -> Dict[str, List[str]]:
        """Map every primary term to its related terms across all languages.

        A term's related set contains its group's synonyms, all of the
        group's translations, and, for every *other* group (different
        ``id``) that lists the term among its translations, that group's
        primary term and synonyms.  Terms appear in first-seen order.
        If several groups share a primary term, the last one wins.

        Returns:
            Mapping of primary term to de-duplicated list of related terms.
        """
        groups = self.data["synonym_groups"]

        # Reverse index: translation term -> groups mentioning it, in file
        # order.  Replaces a full rescan of all groups per primary term.
        mentioned_by: Dict[str, List[Dict[str, Any]]] = {}
        for group in groups:
            indexed = set()
            for terms in group.get("translations", {}).values():
                for term in terms:
                    if term not in indexed:
                        indexed.add(term)
                        mentioned_by.setdefault(term, []).append(group)

        result: Dict[str, List[str]] = {}
        for group in groups:
            primary_term = group["primary_term"]

            # A dict is used as an insertion-ordered set.
            related = dict.fromkeys(group["synonyms"])
            for terms in group.get("translations", {}).values():
                related.update(dict.fromkeys(terms))

            for other_group in mentioned_by.get(primary_term, ()):
                if other_group["id"] == group["id"]:
                    continue
                related[other_group["primary_term"]] = None
                related.update(dict.fromkeys(other_group["synonyms"]))

            result[primary_term] = list(related)

        return result

    def get_language_support(self) -> List[str]:
        """Return the sorted language codes that occur in the library.

        A code counts when it is a group's ``language`` or a key of a
        group's ``translations``.
        """
        languages = set()

        for group in self.data["synonym_groups"]:
            languages.add(group["language"])
            languages.update(group.get("translations", {}))

        return sorted(languages)

    def search_term(
        self, term: str, target_language: Optional[str] = None
    ) -> Dict[str, Any]:
        """Find every group in which ``term`` occurs, ignoring letter case.

        A group yields one entry when the term is its primary term, one
        entry per matching synonym, and one entry per matching translation.

        Args:
            term: Term to look for.
            target_language: When given, keep only entries whose group
                language or matched translation language equals it.

        Returns:
            Dict with ``term``, ``found``, ``groups`` (the match entries) and
            ``languages`` (sorted codes seen in the kept entries).  ``found``
            is ``True`` exactly when ``groups`` is non-empty.
        """
        term_lower = term.lower()
        matches: List[Dict[str, Any]] = []

        for group in self.data["synonym_groups"]:
            # Fields shared by every kind of match entry for this group.
            base = {
                "id": group["id"],
                "primary_term": group["primary_term"],
                "language": group["language"],
            }

            if group["primary_term"].lower() == term_lower:
                matches.append(
                    {**base, "synonyms": group["synonyms"], "is_primary": True}
                )

            for synonym in group["synonyms"]:
                if synonym.lower() == term_lower:
                    matches.append(
                        {
                            **base,
                            "synonyms": group["synonyms"],
                            "is_primary": False,
                            "matched_synonym": synonym,
                        }
                    )

            for lang, terms in group.get("translations", {}).items():
                for translation in terms:
                    if translation.lower() == term_lower:
                        matches.append(
                            {
                                **base,
                                "translations": {lang: terms},
                                "is_primary": False,
                                "matched_translation": translation,
                                "translation_language": lang,
                            }
                        )

        if target_language:
            matches = [
                entry
                for entry in matches
                if entry["language"] == target_language
                or entry.get("translation_language") == target_language
            ]

        languages = set()
        for entry in matches:
            languages.add(entry["language"])
            languages.update(entry.get("translations", {}))

        return {
            "term": term,
            # Derived after filtering so it cannot disagree with ``groups``.
            "found": bool(matches),
            "groups": matches,
            "languages": sorted(languages),
        }

    def export_to_standard_format(
        self, target_language: str, output_file: Optional[str] = None
    ) -> Dict[str, Any]:
        """Build the single-language standard-format document.

        Args:
            target_language: Language code to export.
            output_file: When given, the document is also written there as
                UTF-8 JSON.  A write failure is reported on stderr and does
                not raise.

        Returns:
            The standard-format document (returned even if writing failed).
        """
        # Local import: the module's import block does not provide datetime.
        from datetime import date

        source_metadata = self.data.get("metadata") or {}

        standard_data = {
            "version": self.data.get("version", "1.0.0"),
            "description": f"{target_language} 同義詞庫 - 從統一格式提取",
            "language": target_language,
            "synonyms": self.extract_language_specific(target_language),
            "metadata": {
                "created_date": source_metadata.get("created_date", ""),
                "author": source_metadata.get("author", ""),
                "license": source_metadata.get("license", ""),
                "source": f"從 {self.unified_file.name} 提取",
                # Stamp the real extraction day rather than a fixed literal.
                "extracted_date": date.today().isoformat(),
                "character_encoding": "UTF-8",
            },
        }

        if output_file:
            try:
                with open(output_file, "w", encoding="utf-8") as f:
                    json.dump(standard_data, f, ensure_ascii=False, indent=2)
            except OSError as e:
                print(f"錯誤: 無法寫入檔案 {output_file}: {e}", file=sys.stderr)
            else:
                print(f"已導出到 {output_file}", file=sys.stderr)

        return standard_data
|
|
|
|
|
|
def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser with one sub-command per operation."""
    parser = argparse.ArgumentParser(description="統一格式多語系同義詞處理器")
    parser.add_argument("unified_file", help="統一格式同義詞庫檔案")

    subparsers = parser.add_subparsers(dest="command", help="命令")

    # extract: synonym mapping of a single language
    extract_parser = subparsers.add_parser("extract", help="提取特定語言")
    extract_parser.add_argument("language", help="目標語言代碼")
    extract_parser.add_argument("-o", "--output", help="輸出檔案路徑")
    extract_parser.add_argument(
        "-j", "--json", action="store_true", help="輸出 JSON 格式"
    )

    # cross: cross-language mapping
    cross_parser = subparsers.add_parser("cross", help="創建跨語言映射")
    cross_parser.add_argument("-o", "--output", help="輸出檔案路徑")
    cross_parser.add_argument(
        "-j", "--json", action="store_true", help="輸出 JSON 格式"
    )

    # search: look a term up
    search_parser = subparsers.add_parser("search", help="搜索術語")
    search_parser.add_argument("term", help="要搜索的術語")
    search_parser.add_argument("-l", "--language", help="目標語言代碼")
    search_parser.add_argument(
        "-j", "--json", action="store_true", help="輸出 JSON 格式"
    )

    # languages: list the language codes present in the file
    languages_parser = subparsers.add_parser("languages", help="列出支援的語言")
    languages_parser.add_argument(
        "-j", "--json", action="store_true", help="輸出 JSON 格式"
    )

    # export: write the standard single-language format
    export_parser = subparsers.add_parser("export", help="導出為標準格式")
    export_parser.add_argument("language", help="目標語言代碼")
    export_parser.add_argument("-o", "--output", required=True, help="輸出檔案路徑")

    return parser


def _abbreviate(items: List[str], limit: int = 5) -> str:
    """Join the first ``limit`` items with ', ' and add '...' if any were cut."""
    return ", ".join(items[:limit]) + ("..." if len(items) > limit else "")


def _export_standard(
    processor: "UnifiedSynonymProcessor", language: str, output: str
) -> bool:
    """Write the standard-format document for ``language`` to ``output``.

    The document is built by the processor, but the file is written here:
    the processor's own writer reports failures only on stderr and returns
    normally, so the CLI could not tell whether the export succeeded.

    Returns:
        ``True`` when the file was written, ``False`` after reporting the
        failure on stderr.
    """
    standard_data = processor.export_to_standard_format(language)
    try:
        with open(output, "w", encoding="utf-8") as f:
            json.dump(standard_data, f, ensure_ascii=False, indent=2)
    except OSError as e:
        print(f"錯誤: 無法寫入檔案 {output}: {e}", file=sys.stderr)
        return False
    print(f"已導出到 {output}", file=sys.stderr)
    return True


def _cmd_extract(processor: "UnifiedSynonymProcessor", args: argparse.Namespace) -> None:
    """Print one language's synonym mapping; optionally export it to a file."""
    synonym_mapping = processor.extract_language_specific(args.language)

    if args.json:
        result = {
            "language": args.language,
            "synonym_count": len(synonym_mapping),
            "synonyms": synonym_mapping,
        }
        print(json.dumps(result, ensure_ascii=False, indent=2))
    else:
        print(f"語言: {args.language}")
        print(f"同義詞數量: {len(synonym_mapping)}")
        print("\n同義詞映射:")
        for term, synonyms in synonym_mapping.items():
            print(f"  {term}: {', '.join(synonyms)}")

    if args.output and not _export_standard(processor, args.language, args.output):
        sys.exit(1)


def _cmd_cross(processor: "UnifiedSynonymProcessor", args: argparse.Namespace) -> None:
    """Print the cross-language mapping; optionally save it as JSON."""
    cross_mapping = processor.create_cross_language_mapping()

    if args.json:
        result = {
            "cross_language_mapping": cross_mapping,
            "term_count": len(cross_mapping),
        }
        print(json.dumps(result, ensure_ascii=False, indent=2))
    else:
        print("跨語言同義詞映射")
        print(f"術語數量: {len(cross_mapping)}")
        print("\n映射:")
        # The human-readable view is a preview: first 10 terms only.
        for term, synonyms in list(cross_mapping.items())[:10]:
            print(f"  {term}: {_abbreviate(synonyms)}")

        if len(cross_mapping) > 10:
            print(f"\n... 還有 {len(cross_mapping) - 10} 個術語未顯示")

    if args.output:
        try:
            with open(args.output, "w", encoding="utf-8") as f:
                json.dump(cross_mapping, f, ensure_ascii=False, indent=2)
        except OSError as e:
            print(f"錯誤: 無法保存到 {args.output}: {e}", file=sys.stderr)
            sys.exit(1)
        print(f"\n已保存到 {args.output}", file=sys.stderr)


def _cmd_search(processor: "UnifiedSynonymProcessor", args: argparse.Namespace) -> None:
    """Print the groups in which the requested term occurs."""
    search_result = processor.search_term(args.term, args.language)

    if args.json:
        print(json.dumps(search_result, ensure_ascii=False, indent=2))
        return

    print(f"搜索術語: {args.term}")
    print(f"找到: {'是' if search_result['found'] else '否'}")

    if not search_result["found"]:
        print("未找到匹配的術語")
        return

    print(f"相關語言: {', '.join(search_result['languages'])}")
    print("\n相關同義詞組:")

    for i, group in enumerate(search_result["groups"], 1):
        print(f"\n{i}. 組 ID: {group['id']}")
        print(f"   主要術語: {group['primary_term']}")
        print(f"   語言: {group['language']}")

        if group.get("is_primary"):
            print("   匹配類型: 主要術語")
        elif "matched_synonym" in group:
            print(f"   匹配類型: 同義詞 ({group['matched_synonym']})")
        elif "matched_translation" in group:
            print(
                f"   匹配類型: 翻譯 ({group['matched_translation']} -> {group['translation_language']})"
            )

        if group.get("synonyms"):
            print(f"   同義詞: {_abbreviate(group['synonyms'])}")


def _cmd_languages(processor: "UnifiedSynonymProcessor", args: argparse.Namespace) -> None:
    """Print the language codes present in the library."""
    languages = processor.get_language_support()

    if args.json:
        result = {"supported_languages": languages, "count": len(languages)}
        print(json.dumps(result, ensure_ascii=False, indent=2))
    else:
        print(f"支援的語言: {len(languages)} 種")
        print("\n語言列表:")
        for lang in languages:
            print(f"  {lang}")


def _cmd_export(processor: "UnifiedSynonymProcessor", args: argparse.Namespace) -> None:
    """Export one language to a standard-format file."""
    if not _export_standard(processor, args.language, args.output):
        # Do not announce success (or exit 0) when nothing was written.
        sys.exit(1)
    print(f"已導出 {args.language} 同義詞庫到 {args.output}")


def main():
    """Command-line entry point.

    Parses ``sys.argv``, loads the unified synonym file and runs the chosen
    sub-command.  Exits with status 1 when no sub-command is given, when the
    file cannot be loaded, or when a requested output file cannot be
    written.
    """
    parser = _build_parser()
    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        sys.exit(1)

    processor = UnifiedSynonymProcessor(args.unified_file)

    handlers = {
        "extract": _cmd_extract,
        "cross": _cmd_cross,
        "search": _cmd_search,
        "languages": _cmd_languages,
        "export": _cmd_export,
    }
    handlers[args.command](processor, args)


if __name__ == "__main__":
    main()
|