#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Unified-format multilingual synonym processor.

Loads a synonym library stored in the "unified_multilingual" JSON format and
offers per-language extraction, cross-language mapping, term search, language
listing and export to a single-language standard format.
"""

import sys
import json
import argparse
from datetime import date
from typing import Dict, List, Optional, Any
from pathlib import Path

# Display limits used by the human-readable (non-JSON) CLI output.
_MAX_TERMS_SHOWN = 10
_MAX_SYNONYMS_SHOWN = 5


class UnifiedSynonymProcessor:
    """Query and convert a unified-format multilingual synonym library."""

    def __init__(self, unified_file: str):
        """
        Initialise the processor and load the library immediately.

        Args:
            unified_file: Path of the unified-format synonym library file.
        """
        self.unified_file = Path(unified_file)
        self.data = self.load_unified_data()

    def load_unified_data(self) -> Dict[str, Any]:
        """
        Read and validate the unified-format JSON file.

        Any failure (unreadable file, invalid JSON, failed validation) is
        reported on stderr and terminates the process with exit status 1.

        Returns:
            The parsed unified-format data.
        """
        try:
            with open(self.unified_file, "r", encoding="utf-8") as f:
                data = json.load(f)

            if not self.validate_unified_format(data):
                raise ValueError("無效的統一格式數據")

            return data
        except Exception as e:
            print(
                f"錯誤: 無法加載統一格式檔案 {self.unified_file}: {e}",
                file=sys.stderr,
            )
            sys.exit(1)

    def validate_unified_format(self, data: Dict[str, Any]) -> bool:
        """
        Check that ``data`` has the structure the other methods rely on.

        Besides the presence of the required fields, the container types of
        ``synonyms`` (list) and ``translations`` (dict of lists) are checked:
        a string in their place would otherwise be silently iterated
        character by character by the extraction methods.

        Args:
            data: The parsed JSON value to validate.

        Returns:
            True when the data is usable, False otherwise (the reason is
            printed on stderr).
        """
        if not isinstance(data, dict):
            print("錯誤: 頂層數據必須是字典", file=sys.stderr)
            return False

        for field in ("version", "format", "synonym_groups"):
            if field not in data:
                print(f"錯誤: 缺少必要字段 {field}", file=sys.stderr)
                return False

        if data.get("format") != "unified_multilingual":
            print("錯誤: 格式必須為 'unified_multilingual'", file=sys.stderr)
            return False

        if not isinstance(data["synonym_groups"], list):
            print("錯誤: synonym_groups 必須是列表", file=sys.stderr)
            return False

        for i, group in enumerate(data["synonym_groups"]):
            if not isinstance(group, dict):
                print(f"錯誤: 同義詞組 {i} 必須是字典", file=sys.stderr)
                return False

            for field in ("id", "primary_term", "language", "synonyms"):
                if field not in group:
                    print(
                        f"錯誤: 同義詞組 {i} 缺少字段 {field}", file=sys.stderr
                    )
                    return False

            if not isinstance(group["synonyms"], list):
                print(
                    f"錯誤: 同義詞組 {i} 的 synonyms 必須是列表",
                    file=sys.stderr,
                )
                return False

            # "translations" is optional, but when present it must map a
            # language code to a list of terms.
            translations = group.get("translations", {})
            if not isinstance(translations, dict):
                print(
                    f"錯誤: 同義詞組 {i} 的 translations 必須是字典",
                    file=sys.stderr,
                )
                return False
            for lang, terms in translations.items():
                if not isinstance(terms, list):
                    print(
                        f"錯誤: 同義詞組 {i} 的 translations[{lang}] 必須是列表",
                        file=sys.stderr,
                    )
                    return False

        return True

    def extract_language_specific(self, target_language: str) -> Dict[str, List[str]]:
        """
        Build the synonym mapping for one language.

        Only groups whose own ``language`` equals ``target_language`` are
        used. Each group's synonyms are followed by the terms listed under
        ``translations[target_language]``; duplicates are dropped while the
        first-seen order is kept, so the output is stable between runs.

        Args:
            target_language: Language code to extract.

        Returns:
            Mapping of primary term to its de-duplicated synonym list. When
            several groups share a primary term, the last one wins.
        """
        result: Dict[str, List[str]] = {}

        for group in self.data["synonym_groups"]:
            if group["language"] != target_language:
                continue

            synonyms = list(group["synonyms"])
            synonyms.extend(group.get("translations", {}).get(target_language, []))

            # dict.fromkeys de-duplicates and preserves insertion order.
            result[group["primary_term"]] = list(dict.fromkeys(synonyms))

        return result

    def create_cross_language_mapping(self) -> Dict[str, List[str]]:
        """
        Build a mapping of every primary term to all related terms.

        For each group the related terms are, in this order: its synonyms,
        all of its translations, and then the primary term and synonyms of
        every other group (different ``id``) that lists this group's primary
        term among its translations. Duplicates are dropped, keeping the
        first occurrence.

        Returns:
            Mapping of primary term to its list of related terms.
        """
        groups = self.data["synonym_groups"]

        # Index built once: translation term -> groups (in file order) that
        # list it. This replaces rescanning every group for every group.
        listed_by: Dict[str, List[Dict[str, Any]]] = {}
        for group in groups:
            already_indexed = set()
            for terms in group.get("translations", {}).values():
                for term in terms:
                    if term not in already_indexed:
                        already_indexed.add(term)
                        listed_by.setdefault(term, []).append(group)

        result: Dict[str, List[str]] = {}
        for group in groups:
            primary_term = group["primary_term"]

            # A dict is used as an insertion-ordered set.
            related: Dict[str, None] = dict.fromkeys(group["synonyms"])
            for terms in group.get("translations", {}).values():
                related.update(dict.fromkeys(terms))

            for other_group in listed_by.get(primary_term, ()):
                if other_group["id"] == group["id"]:
                    continue
                related[other_group["primary_term"]] = None
                related.update(dict.fromkeys(other_group["synonyms"]))

            result[primary_term] = list(related)

        return result

    def get_language_support(self) -> List[str]:
        """
        List every language code that appears in the library.

        Returns:
            Sorted language codes, collected from each group's ``language``
            and from the keys of its ``translations``.
        """
        languages = set()

        for group in self.data["synonym_groups"]:
            languages.add(group["language"])
            languages.update(group.get("translations", {}))

        return sorted(languages)

    def search_term(
        self, term: str, target_language: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Find every group in which ``term`` occurs (case-insensitively).

        A group can contribute several entries: one when the term is its
        primary term, one per matching synonym and one per matching
        translation.

        Args:
            term: The term to look for.
            target_language: Optional language code. When given, only
                entries whose group language or matched translation language
                equals it are kept.

        Returns:
            Dict with ``term``, ``found``, ``groups`` and ``languages``.
            ``found`` is True only when ``groups`` is non-empty after the
            optional language filter has been applied.
        """
        needle = term.lower()
        matches: List[Dict[str, Any]] = []

        for group in self.data["synonym_groups"]:
            header = {
                "id": group["id"],
                "primary_term": group["primary_term"],
                "language": group["language"],
            }

            # Match on the primary term.
            if group["primary_term"].lower() == needle:
                matches.append(
                    {**header, "synonyms": group["synonyms"], "is_primary": True}
                )

            # Match on a synonym.
            for synonym in group["synonyms"]:
                if synonym.lower() == needle:
                    matches.append(
                        {
                            **header,
                            "synonyms": group["synonyms"],
                            "is_primary": False,
                            "matched_synonym": synonym,
                        }
                    )

            # Match on a translation.
            for lang, terms in group.get("translations", {}).items():
                for translation in terms:
                    if translation.lower() == needle:
                        matches.append(
                            {
                                **header,
                                "translations": {lang: terms},
                                "is_primary": False,
                                "matched_translation": translation,
                                "translation_language": lang,
                            }
                        )

        if target_language:
            matches = [
                entry
                for entry in matches
                if entry.get("language") == target_language
                or entry.get("translation_language") == target_language
            ]

        languages = set()
        for entry in matches:
            languages.add(entry["language"])
            languages.update(entry.get("translations", {}))

        return {
            "term": term,
            # Derived from the filtered list so that a language filter which
            # removes every match is not reported as a hit.
            "found": bool(matches),
            "groups": matches,
            "languages": sorted(languages),
        }

    def export_to_standard_format(
        self,
        target_language: str,
        output_file: Optional[str] = None,
        *,
        extracted_date: Optional[str] = None,
        raise_on_error: bool = False,
    ) -> Dict[str, Any]:
        """
        Build the single-language standard-format document and optionally
        write it to disk.

        Args:
            target_language: Language code to export.
            output_file: Destination path; nothing is written when omitted.
            extracted_date: Value stored in ``metadata.extracted_date``.
                Defaults to today's date in ISO format (YYYY-MM-DD).
            raise_on_error: When False (default) a write failure is only
                reported on stderr. When True the exception is re-raised
                after being reported, so the caller can react to it.

        Returns:
            The standard-format data (also returned when writing failed and
            ``raise_on_error`` is False).

        Raises:
            Exception: Whatever the write raised, only if ``raise_on_error``.
        """
        synonym_mapping = self.extract_language_specific(target_language)
        source_metadata = self.data.get("metadata", {})

        if extracted_date is None:
            extracted_date = date.today().isoformat()

        standard_data = {
            "version": self.data.get("version", "1.0.0"),
            "description": f"{target_language} 同義詞庫 - 從統一格式提取",
            "language": target_language,
            "synonyms": synonym_mapping,
            "metadata": {
                "created_date": source_metadata.get("created_date", ""),
                "author": source_metadata.get("author", ""),
                "license": source_metadata.get("license", ""),
                "source": f"從 {self.unified_file.name} 提取",
                "extracted_date": extracted_date,
                "character_encoding": "UTF-8",
            },
        }

        if output_file:
            try:
                with open(output_file, "w", encoding="utf-8") as f:
                    json.dump(standard_data, f, ensure_ascii=False, indent=2)
                print(f"已導出到 {output_file}", file=sys.stderr)
            except Exception as e:
                print(f"錯誤: 無法寫入檔案 {output_file}: {e}", file=sys.stderr)
                if raise_on_error:
                    raise

        return standard_data


def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser with one sub-command per operation."""
    parser = argparse.ArgumentParser(description="統一格式多語系同義詞處理器")
    parser.add_argument("unified_file", help="統一格式同義詞庫檔案")

    subparsers = parser.add_subparsers(dest="command", help="命令")

    # extract: per-language synonym mapping
    extract_parser = subparsers.add_parser("extract", help="提取特定語言")
    extract_parser.add_argument("language", help="目標語言代碼")
    extract_parser.add_argument("-o", "--output", help="輸出檔案路徑")
    extract_parser.add_argument(
        "-j", "--json", action="store_true", help="輸出 JSON 格式"
    )

    # cross: cross-language mapping
    cross_parser = subparsers.add_parser("cross", help="創建跨語言映射")
    cross_parser.add_argument("-o", "--output", help="輸出檔案路徑")
    cross_parser.add_argument(
        "-j", "--json", action="store_true", help="輸出 JSON 格式"
    )

    # search: look up a term
    search_parser = subparsers.add_parser("search", help="搜索術語")
    search_parser.add_argument("term", help="要搜索的術語")
    search_parser.add_argument("-l", "--language", help="目標語言代碼")
    search_parser.add_argument(
        "-j", "--json", action="store_true", help="輸出 JSON 格式"
    )

    # languages: list supported languages
    languages_parser = subparsers.add_parser("languages", help="列出支援的語言")
    languages_parser.add_argument(
        "-j", "--json", action="store_true", help="輸出 JSON 格式"
    )

    # export: write the standard-format file
    export_parser = subparsers.add_parser("export", help="導出為標準格式")
    export_parser.add_argument("language", help="目標語言代碼")
    export_parser.add_argument("-o", "--output", required=True, help="輸出檔案路徑")

    return parser


def _preview(terms: List[str]) -> str:
    """Join the first few terms, appending '...' when some were left out."""
    suffix = "..." if len(terms) > _MAX_SYNONYMS_SHOWN else ""
    return f"{', '.join(terms[:_MAX_SYNONYMS_SHOWN])}{suffix}"


def _export_or_exit(
    processor: UnifiedSynonymProcessor, language: str, output: str
) -> None:
    """Export to ``output``; exit with status 1 when the file can't be written."""
    try:
        processor.export_to_standard_format(language, output, raise_on_error=True)
    except OSError:
        # The error itself has already been reported on stderr.
        sys.exit(1)


def _run_extract(processor: UnifiedSynonymProcessor, args: argparse.Namespace) -> None:
    """Handle the ``extract`` sub-command."""
    synonym_mapping = processor.extract_language_specific(args.language)

    if args.json:
        result = {
            "language": args.language,
            "synonym_count": len(synonym_mapping),
            "synonyms": synonym_mapping,
        }
        print(json.dumps(result, ensure_ascii=False, indent=2))
    else:
        print(f"語言: {args.language}")
        print(f"同義詞數量: {len(synonym_mapping)}")
        print("\n同義詞映射:")
        for term, synonyms in synonym_mapping.items():
            print(f" {term}: {', '.join(synonyms)}")

    if args.output:
        _export_or_exit(processor, args.language, args.output)


def _run_cross(processor: UnifiedSynonymProcessor, args: argparse.Namespace) -> None:
    """Handle the ``cross`` sub-command."""
    cross_mapping = processor.create_cross_language_mapping()

    if args.json:
        result = {
            "cross_language_mapping": cross_mapping,
            "term_count": len(cross_mapping),
        }
        print(json.dumps(result, ensure_ascii=False, indent=2))
    else:
        print("跨語言同義詞映射")
        print(f"術語數量: {len(cross_mapping)}")
        print("\n映射:")
        # Only the first few terms are shown in the human-readable output.
        for term, synonyms in list(cross_mapping.items())[:_MAX_TERMS_SHOWN]:
            print(f" {term}: {_preview(synonyms)}")
        if len(cross_mapping) > _MAX_TERMS_SHOWN:
            hidden = len(cross_mapping) - _MAX_TERMS_SHOWN
            print(f"\n... 還有 {hidden} 個術語未顯示")

    if args.output:
        try:
            with open(args.output, "w", encoding="utf-8") as f:
                json.dump(cross_mapping, f, ensure_ascii=False, indent=2)
            print(f"\n已保存到 {args.output}", file=sys.stderr)
        except Exception as e:
            print(f"錯誤: 無法保存到 {args.output}: {e}", file=sys.stderr)
            # A requested save that failed must not look like success.
            sys.exit(1)


def _run_search(processor: UnifiedSynonymProcessor, args: argparse.Namespace) -> None:
    """Handle the ``search`` sub-command."""
    search_result = processor.search_term(args.term, args.language)

    if args.json:
        print(json.dumps(search_result, ensure_ascii=False, indent=2))
        return

    print(f"搜索術語: {args.term}")
    print(f"找到: {'是' if search_result['found'] else '否'}")

    if not search_result["found"]:
        print("未找到匹配的術語")
        return

    print(f"相關語言: {', '.join(search_result['languages'])}")
    print("\n相關同義詞組:")

    for i, group in enumerate(search_result["groups"], 1):
        print(f"\n{i}. 組 ID: {group['id']}")
        print(f" 主要術語: {group['primary_term']}")
        print(f" 語言: {group['language']}")

        if group.get("is_primary"):
            print(" 匹配類型: 主要術語")
        elif "matched_synonym" in group:
            print(f" 匹配類型: 同義詞 ({group['matched_synonym']})")
        elif "matched_translation" in group:
            print(
                f" 匹配類型: 翻譯 ({group['matched_translation']} -> {group['translation_language']})"
            )

        # Translation matches carry no "synonyms" key.
        if group.get("synonyms"):
            print(f" 同義詞: {_preview(group['synonyms'])}")


def _run_languages(
    processor: UnifiedSynonymProcessor, args: argparse.Namespace
) -> None:
    """Handle the ``languages`` sub-command."""
    languages = processor.get_language_support()

    if args.json:
        result = {"supported_languages": languages, "count": len(languages)}
        print(json.dumps(result, ensure_ascii=False, indent=2))
    else:
        print(f"支援的語言: {len(languages)} 種")
        print("\n語言列表:")
        for lang in languages:
            print(f" {lang}")


def _run_export(processor: UnifiedSynonymProcessor, args: argparse.Namespace) -> None:
    """Handle the ``export`` sub-command."""
    _export_or_exit(processor, args.language, args.output)
    # Reached only when the file was actually written.
    print(f"已導出 {args.language} 同義詞庫到 {args.output}")


# Sub-command name -> handler.
_COMMANDS = {
    "extract": _run_extract,
    "cross": _run_cross,
    "search": _run_search,
    "languages": _run_languages,
    "export": _run_export,
}


def main():
    """Parse the command line, load the library and run the chosen command."""
    parser = _build_parser()
    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        sys.exit(1)

    processor = UnifiedSynonymProcessor(args.unified_file)
    _COMMANDS[args.command](processor, args)


if __name__ == "__main__":
    main()