Files
momentry_core/scripts/unified_synonym_processor.py
Warren 8f05a7c188 feat: update Python processors and add utility scripts
- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00

452 lines
16 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
統一格式多語系同義詞處理器
處理統一格式的多語系同義詞庫
"""
import sys
import json
import argparse
from datetime import date
from typing import Dict, List, Optional, Any, Set
from pathlib import Path
import re
class UnifiedSynonymProcessor:
    """Load a unified multilingual synonym library and answer queries on it.

    The library is a JSON object with ``version``, ``format`` (which must be
    ``"unified_multilingual"``) and ``synonym_groups``. Every group carries an
    ``id``, a ``primary_term``, its ``language``, a ``synonyms`` list and an
    optional ``translations`` mapping of language code to a list of terms.
    """

    def __init__(self, unified_file: str):
        """Remember the library path and load it immediately.

        Args:
            unified_file: Path of the unified-format synonym library.
        """
        self.unified_file = Path(unified_file)
        self.data = self.load_unified_data()

    def load_unified_data(self) -> Dict[str, Any]:
        """Read and validate the library file.

        Returns:
            The parsed library.

        Note:
            On any read, decode, parse or validation failure an error is
            printed to stderr and the process exits with status 1.
        """
        try:
            with open(self.unified_file, "r", encoding="utf-8") as f:
                data = json.load(f)
            if not self.validate_unified_format(data):
                raise ValueError("無效的統一格式數據")
        except (OSError, ValueError) as e:
            # JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
            print(
                f"錯誤: 無法加載統一格式檔案 {self.unified_file}: {e}", file=sys.stderr
            )
            sys.exit(1)
        return data

    def validate_unified_format(self, data: Dict[str, Any]) -> bool:
        """Check that ``data`` has the shape the other methods rely on.

        Args:
            data: Parsed library content to validate.

        Returns:
            True when valid; otherwise False, after printing the first
            problem found to stderr.
        """
        if not isinstance(data, dict):
            print("錯誤: 統一格式數據必須是字典", file=sys.stderr)
            return False

        for field in ("version", "format", "synonym_groups"):
            if field not in data:
                print(f"錯誤: 缺少必要字段 {field}", file=sys.stderr)
                return False

        if data["format"] != "unified_multilingual":
            print("錯誤: 格式必須為 'unified_multilingual'", file=sys.stderr)
            return False

        if not isinstance(data["synonym_groups"], list):
            print("錯誤: synonym_groups 必須是列表", file=sys.stderr)
            return False

        for i, group in enumerate(data["synonym_groups"]):
            if not isinstance(group, dict):
                print(f"錯誤: 同義詞組 {i} 必須是字典", file=sys.stderr)
                return False

            for field in ("id", "primary_term", "language", "synonyms"):
                if field not in group:
                    print(f"錯誤: 同義詞組 {i} 缺少字段 {field}", file=sys.stderr)
                    return False

            # A bare string here would later be split into single characters
            # (or crash), so container types are checked up front.
            if not isinstance(group["synonyms"], list):
                print(f"錯誤: 同義詞組 {i} 的 synonyms 必須是列表", file=sys.stderr)
                return False

            translations = group.get("translations", {})
            if not isinstance(translations, dict) or not all(
                isinstance(terms, list) for terms in translations.values()
            ):
                print(
                    f"錯誤: 同義詞組 {i} 的 translations 必須是語言到列表的字典",
                    file=sys.stderr,
                )
                return False

        return True

    def extract_language_specific(self, target_language: str) -> Dict[str, List[str]]:
        """Build the synonym mapping for groups written in one language.

        Args:
            target_language: Language code to extract.

        Returns:
            Primary term -> its synonyms followed by the group's translations
            into the same language, de-duplicated in first-seen order. A later
            group with the same primary term replaces the earlier entry.
        """
        result: Dict[str, List[str]] = {}
        for group in self.data["synonym_groups"]:
            if group["language"] != target_language:
                continue
            terms = list(group["synonyms"])
            terms.extend(group.get("translations", {}).get(target_language, []))
            # dict.fromkeys de-duplicates while keeping a reproducible order.
            result[group["primary_term"]] = list(dict.fromkeys(terms))
        return result

    def create_cross_language_mapping(self) -> Dict[str, List[str]]:
        """Build a mapping from each primary term to related terms in any language.

        A term's list holds, in order: the group's own synonyms, all of its
        translations, then the primary term and synonyms of every other group
        (different ``id``) that lists this primary term among its translations.

        Returns:
            Primary term -> de-duplicated related terms in first-seen order.
        """
        groups = self.data["synonym_groups"]

        # Index translated term -> groups that list it, so each primary term
        # is resolved with one lookup instead of rescanning every group.
        translated_by: Dict[str, List[Dict[str, Any]]] = {}
        for group in groups:
            for terms in group.get("translations", {}).values():
                for term in terms:
                    translated_by.setdefault(term, []).append(group)

        result: Dict[str, List[str]] = {}
        for group in groups:
            primary_term = group["primary_term"]
            related = dict.fromkeys(group["synonyms"])
            for terms in group.get("translations", {}).values():
                related.update(dict.fromkeys(terms))
            for other_group in translated_by.get(primary_term, ()):
                if other_group["id"] == group["id"]:
                    continue
                related[other_group["primary_term"]] = None
                related.update(dict.fromkeys(other_group["synonyms"]))
            result[primary_term] = list(related)
        return result

    def get_language_support(self) -> List[str]:
        """List every language code used by a group or by its translations.

        Returns:
            Sorted language codes.
        """
        languages = set()
        for group in self.data["synonym_groups"]:
            languages.add(group["language"])
            languages.update(group.get("translations", {}))
        return sorted(languages)

    def search_term(
        self, term: str, target_language: Optional[str] = None
    ) -> Dict[str, Any]:
        """Find groups where ``term`` is the primary term, a synonym or a translation.

        Matching is caseless (``str.casefold``). One entry is produced per
        match, so a group can appear more than once.

        Args:
            term: Term to look for.
            target_language: When given, keep only matches whose group
                language or matched translation language equals it.

        Returns:
            A dict with ``term``, ``found`` (True only if at least one match
            remains after filtering), ``groups`` (the match entries) and
            ``languages`` (sorted codes seen in the remaining matches).
        """
        needle = term.casefold()
        matches: List[Dict[str, Any]] = []

        for group in self.data["synonym_groups"]:
            base = {
                "id": group["id"],
                "primary_term": group["primary_term"],
                "language": group["language"],
            }

            if group["primary_term"].casefold() == needle:
                matches.append(
                    {**base, "synonyms": group["synonyms"], "is_primary": True}
                )

            for synonym in group["synonyms"]:
                if synonym.casefold() == needle:
                    matches.append(
                        {
                            **base,
                            "synonyms": group["synonyms"],
                            "is_primary": False,
                            "matched_synonym": synonym,
                        }
                    )

            for lang, terms in group.get("translations", {}).items():
                for translation in terms:
                    if translation.casefold() == needle:
                        matches.append(
                            {
                                **base,
                                "translations": {lang: terms},
                                "is_primary": False,
                                "matched_translation": translation,
                                "translation_language": lang,
                            }
                        )

        if target_language:
            matches = [
                match
                for match in matches
                if match["language"] == target_language
                or match.get("translation_language") == target_language
            ]

        languages = set()
        for match in matches:
            languages.add(match["language"])
            languages.update(match.get("translations", {}))

        return {
            "term": term,
            # Derived after filtering so "found" never contradicts "groups".
            "found": bool(matches),
            "groups": matches,
            "languages": sorted(languages),
        }

    def export_to_standard_format(
        self, target_language: str, output_file: Optional[str] = None
    ) -> Dict[str, Any]:
        """Build the single-language standard format and optionally write it.

        Args:
            target_language: Language code to export.
            output_file: Destination path; nothing is written when omitted.

        Returns:
            The standard-format data. It is returned even if writing fails;
            the failure is reported on stderr.
        """
        source_metadata = self.data.get("metadata", {})
        if not isinstance(source_metadata, dict):
            source_metadata = {}

        standard_data = {
            "version": self.data.get("version", "1.0.0"),
            "description": f"{target_language} 同義詞庫 - 從統一格式提取",
            "language": target_language,
            "synonyms": self.extract_language_specific(target_language),
            "metadata": {
                "created_date": source_metadata.get("created_date", ""),
                "author": source_metadata.get("author", ""),
                "license": source_metadata.get("license", ""),
                "source": f"{self.unified_file.name} 提取",
                # Stamp the actual extraction day instead of a fixed date.
                "extracted_date": date.today().isoformat(),
                "character_encoding": "UTF-8",
            },
        }

        if output_file:
            try:
                with open(output_file, "w", encoding="utf-8") as f:
                    json.dump(standard_data, f, ensure_ascii=False, indent=2)
            except (OSError, ValueError) as e:
                print(f"錯誤: 無法寫入檔案 {output_file}: {e}", file=sys.stderr)
            else:
                print(f"已導出到 {output_file}", file=sys.stderr)

        return standard_data
def _add_json_flag(subparser: argparse.ArgumentParser) -> None:
    """Register the shared ``-j/--json`` switch on a sub-command parser."""
    subparser.add_argument(
        "-j", "--json", action="store_true", help="輸出 JSON 格式"
    )


def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser with the five sub-commands."""
    parser = argparse.ArgumentParser(description="統一格式多語系同義詞處理器")
    parser.add_argument("unified_file", help="統一格式同義詞庫檔案")
    subparsers = parser.add_subparsers(dest="command", help="命令")

    # extract: mapping for one language
    extract_parser = subparsers.add_parser("extract", help="提取特定語言")
    extract_parser.add_argument("language", help="目標語言代碼")
    extract_parser.add_argument("-o", "--output", help="輸出檔案路徑")
    _add_json_flag(extract_parser)

    # cross: cross-language mapping
    cross_parser = subparsers.add_parser("cross", help="創建跨語言映射")
    cross_parser.add_argument("-o", "--output", help="輸出檔案路徑")
    _add_json_flag(cross_parser)

    # search: look up one term
    search_parser = subparsers.add_parser("search", help="搜索術語")
    search_parser.add_argument("term", help="要搜索的術語")
    search_parser.add_argument("-l", "--language", help="目標語言代碼")
    _add_json_flag(search_parser)

    # languages: list language codes
    languages_parser = subparsers.add_parser("languages", help="列出支援的語言")
    _add_json_flag(languages_parser)

    # export: write the standard single-language format
    export_parser = subparsers.add_parser("export", help="導出為標準格式")
    export_parser.add_argument("language", help="目標語言代碼")
    export_parser.add_argument("-o", "--output", required=True, help="輸出檔案路徑")

    return parser


def _print_json(payload: Any) -> None:
    """Print ``payload`` to stdout as indented, non-ASCII-escaped JSON."""
    print(json.dumps(payload, ensure_ascii=False, indent=2))


def _preview(terms: List[str], limit: int = 5) -> str:
    """Join the first ``limit`` terms, appending ``...`` when some are omitted."""
    return f"{', '.join(terms[:limit])}{'...' if len(terms) > limit else ''}"


def _run_extract(processor: "UnifiedSynonymProcessor", args: argparse.Namespace) -> None:
    """Handle ``extract``: show one language's mapping, optionally export it."""
    synonym_mapping = processor.extract_language_specific(args.language)
    if args.json:
        _print_json(
            {
                "language": args.language,
                "synonym_count": len(synonym_mapping),
                "synonyms": synonym_mapping,
            }
        )
    else:
        print(f"語言: {args.language}")
        print(f"同義詞數量: {len(synonym_mapping)}")
        print("\n同義詞映射:")
        for term, synonyms in synonym_mapping.items():
            print(f" {term}: {', '.join(synonyms)}")

    if args.output:
        processor.export_to_standard_format(args.language, args.output)


def _run_cross(processor: "UnifiedSynonymProcessor", args: argparse.Namespace) -> None:
    """Handle ``cross``: show the cross-language mapping, optionally save it."""
    cross_mapping = processor.create_cross_language_mapping()
    if args.json:
        _print_json(
            {
                "cross_language_mapping": cross_mapping,
                "term_count": len(cross_mapping),
            }
        )
    else:
        print("跨語言同義詞映射")
        print(f"術語數量: {len(cross_mapping)}")
        print("\n映射:")
        # Only the first 10 terms are shown in text mode.
        for term, synonyms in list(cross_mapping.items())[:10]:
            print(f" {term}: {_preview(synonyms)}")
        if len(cross_mapping) > 10:
            print(f"\n... 還有 {len(cross_mapping) - 10} 個術語未顯示")

    if args.output:
        try:
            with open(args.output, "w", encoding="utf-8") as f:
                json.dump(cross_mapping, f, ensure_ascii=False, indent=2)
        except (OSError, ValueError) as e:
            print(f"錯誤: 無法保存到 {args.output}: {e}", file=sys.stderr)
            # A failed save must be visible to calling scripts.
            sys.exit(1)
        print(f"\n已保存到 {args.output}", file=sys.stderr)


def _run_search(processor: "UnifiedSynonymProcessor", args: argparse.Namespace) -> None:
    """Handle ``search``: print the matches for one term."""
    search_result = processor.search_term(args.term, args.language)
    if args.json:
        _print_json(search_result)
        return

    print(f"搜索術語: {args.term}")
    # Both branches used to be empty strings, so this line said nothing.
    print(f"找到: {'是' if search_result['found'] else '否'}")

    if not search_result["found"]:
        print("未找到匹配的術語")
        return

    print(f"相關語言: {', '.join(search_result['languages'])}")
    print("\n相關同義詞組:")
    for i, group in enumerate(search_result["groups"], 1):
        print(f"\n{i}. 組 ID: {group['id']}")
        print(f" 主要術語: {group['primary_term']}")
        print(f" 語言: {group['language']}")

        if group.get("is_primary"):
            print(" 匹配類型: 主要術語")
        elif "matched_synonym" in group:
            print(f" 匹配類型: 同義詞 ({group['matched_synonym']})")
        elif "matched_translation" in group:
            print(
                f" 匹配類型: 翻譯 ({group['matched_translation']} -> {group['translation_language']})"
            )

        # Translation matches carry no "synonyms" key.
        if group.get("synonyms"):
            print(f" 同義詞: {_preview(group['synonyms'])}")


def _run_languages(processor: "UnifiedSynonymProcessor", args: argparse.Namespace) -> None:
    """Handle ``languages``: list every language code in the library."""
    languages = processor.get_language_support()
    if args.json:
        _print_json({"supported_languages": languages, "count": len(languages)})
        return

    print(f"支援的語言: {len(languages)}")
    print("\n語言列表:")
    for lang in languages:
        print(f" {lang}")


def _run_export(processor: "UnifiedSynonymProcessor", args: argparse.Namespace) -> None:
    """Handle ``export``: write one language in the standard format."""
    processor.export_to_standard_format(args.language, args.output)
    # NOTE(review): export_to_standard_format reports write failures on stderr
    # but does not signal them to the caller, so this line prints regardless.
    print(f"已導出 {args.language} 同義詞庫到 {args.output}")


def main():
    """Command-line entry point.

    Parses ``sys.argv``, loads the unified synonym library and runs the chosen
    sub-command. Prints the help text and exits with status 1 when no
    sub-command is given.
    """
    parser = _build_parser()
    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        sys.exit(1)

    processor = UnifiedSynonymProcessor(args.unified_file)

    handlers = {
        "extract": _run_extract,
        "cross": _run_cross,
        "search": _run_search,
        "languages": _run_languages,
        "export": _run_export,
    }
    handlers[args.command](processor, args)


if __name__ == "__main__":
    main()