Files
momentry_core/scripts/terminology_manager.py
Warren 8f05a7c188 feat: update Python processors and add utility scripts
- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00

240 lines
8.3 KiB
Python

#!/usr/bin/env python3
"""
術語管理器 - 用於統一管理和更新架構文檔中的術語
"""
import json
import re
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
@dataclass
class TerminologyEntry:
    """A single terminology record pairing a design term with its implementation.

    Attributes:
        design_concept: Human-readable name of the design concept.
        design_value: Term used in the design documents.
        actual_value: Name actually used by the implementation.
        status: Status marker describing how far the term is implemented.
        description: Free-text description of the term.
        last_updated: Timestamp string of the last update.
        source_files: Names of the files that use this term.
    """

    design_concept: str
    design_value: str
    actual_value: str
    status: str
    description: str
    last_updated: str
    source_files: List[str]
@dataclass
class TerminologyMapping:
    """Versioned container for the full terminology table.

    Attributes:
        mapping: Terminology entries keyed by term.
        version: Version string of the mapping.
        created_at: Timestamp string recording when the mapping was created.
        updated_at: Timestamp string recording when the mapping was last updated.
    """

    mapping: Dict[str, TerminologyEntry]
    version: str
    created_at: str
    updated_at: str
class TerminologyManager:
    """Manage the mapping between design-time terms and their implemented names.

    On construction the manager creates ``data_dir`` if needed, builds the
    built-in table of standard terms, and writes that table to
    ``terminology_mapping.json`` when the file does not exist yet.

    Attributes:
        data_dir: Directory holding the manager's JSON files.
        mapping_file: Path of the persisted terminology mapping.
        usage_file: Path reserved for usage data; this class never writes it.
        standard_terminology: Built-in entries keyed by design value.
    """

    def __init__(self, data_dir: Path = Path("data/terminology")):
        """Create the data directory, build the standard table and persist it.

        Args:
            data_dir: Directory in which the JSON files are stored.
        """
        self.data_dir = data_dir
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.mapping_file = data_dir / "terminology_mapping.json"
        self.usage_file = data_dir / "terminology_usage.json"

        # One timestamp for the whole table, taken once instead of per entry.
        now = datetime.now().isoformat()

        # Standard table: design term -> implemented counterpart.
        self.standard_terminology = {
            "sentence": TerminologyEntry(
                design_concept="句子級分片",
                design_value="sentence",
                actual_value="ChunkType::Sentence",
                status="✅ 完整實現",
                description="基於 ASR 轉錄結果的單句級別分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md", "CHUNK_RULE_1_SENTENCE.md"],
            ),
            "visual": TerminologyEntry(
                design_concept="視覺物件級分片",
                design_value="visual",
                actual_value="未實現",
                status="❌ 未實現",
                description="基於 YOLO 物件檢測的視覺分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md"],
            ),
            "scene": TerminologyEntry(
                design_concept="場景級分片",
                design_value="scene",
                actual_value="ChunkType::Cut",
                status="⚠️ 部分實現",
                description="基於 CUT 場景檢測算法的分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md", "CHUNK_RULE_3_SCENE.md"],
            ),
            "summary": TerminologyEntry(
                design_concept="摘要級分片",
                design_value="summary",
                actual_value="ChunkType::Story",
                status="⚠️ 概念調整",
                description="基於分片聚合的敘事總結分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md", "CHUNK_RULE_4_SUMMARY.md"],
            ),
            "time": TerminologyEntry(
                design_concept="時間基準分片",
                design_value="time",
                actual_value="ChunkType::TimeBased",
                status="✅ 完整實現",
                description="固定時間間隔的分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md"],
            ),
            "trace": TerminologyEntry(
                design_concept="軌跡追蹤分片",
                design_value="trace",
                actual_value="ChunkType::Trace",
                status="✅ 完整實現",
                description="物件或人物的時空軌跡分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md"],
            ),
        }
        self.initialize()

    def initialize(self):
        """Write the mapping file if it does not exist yet."""
        if not self.mapping_file.exists():
            self.save_mapping()

    def save_mapping(self):
        """Serialize ``standard_terminology`` to ``mapping_file`` as UTF-8 JSON."""
        now = datetime.now().isoformat()
        mapping_data = TerminologyMapping(
            mapping=self.standard_terminology,
            version="1.0",
            created_at=now,
            updated_at=now,
        )
        with open(self.mapping_file, "w", encoding="utf-8") as f:
            json.dump(asdict(mapping_data), f, ensure_ascii=False, indent=2)
        print(f"✓ 術語映射表已保存: {self.mapping_file}")

    def load_mapping(self) -> TerminologyMapping:
        """Load the terminology mapping from ``mapping_file``.

        Returns:
            The stored mapping, with every entry rebuilt as a
            ``TerminologyEntry`` instance.

        Raises:
            OSError: If the mapping file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            TypeError: If the stored fields do not match the dataclasses.
        """
        with open(self.mapping_file, "r", encoding="utf-8") as f:
            data = json.load(f)
        # JSON only yields plain dicts; rebuild the entry dataclasses so that
        # callers can use attribute access (entry.design_value, ...).
        if "mapping" in data:
            data["mapping"] = {
                term: TerminologyEntry(**fields)
                for term, fields in data["mapping"].items()
            }
        return TerminologyMapping(**data)

    @staticmethod
    def _iter_markdown_files(directory: Path) -> List[Path]:
        """Return every ``*.md`` file below *directory*, sorted by path."""
        root = Path(directory)
        if not root.is_dir():
            return []
        return sorted(p for p in root.rglob("*.md") if p.is_file())

    def find_terminology_in_files(
        self, pattern: str, directory: Path
    ) -> Dict[str, List[Tuple[str, int]]]:
        """Search all Markdown files below *directory* for *pattern*.

        Args:
            pattern: Regular expression, matched case-insensitively.
            directory: Root directory that is searched recursively.

        Returns:
            A dict mapping each file path (as a string) that contains at
            least one match to a list of ``(matched_text, start_offset)``
            tuples. Files without matches are omitted; a missing directory
            yields an empty dict.
        """
        # Compile once; the same pattern is applied to every file.
        regex = re.compile(pattern, re.IGNORECASE)
        results: Dict[str, List[Tuple[str, int]]] = {}
        for file_path in self._iter_markdown_files(directory):
            content = file_path.read_text(encoding="utf-8")
            matches = [(m.group(), m.start()) for m in regex.finditer(content)]
            if matches:
                results[str(file_path)] = matches
        return results

    def generate_report(self, arch_dir: Optional[Path] = None) -> Dict[str, Any]:
        """Build a usage report for every term in the stored mapping.

        Args:
            arch_dir: Directory whose Markdown files are scanned. Defaults to
                ``docs_v1.0/ARCHITECTURE`` relative to the working directory.

        Returns:
            A dict with three keys: ``metadata`` (generation time, mapping
            version, number of terms), ``terminology_usage`` (per-term result
            of ``find_terminology_in_files``) and ``summary`` (number of
            Markdown files scanned, number of terms found at least once, and
            the consistency score).
        """
        mapping = self.load_mapping()
        if arch_dir is None:
            arch_dir = Path("docs_v1.0/ARCHITECTURE")

        usage = {
            design_term: self.find_terminology_in_files(
                re.escape(entry.design_value), arch_dir
            )
            for design_term, entry in mapping.mapping.items()
        }

        return {
            "metadata": {
                "generated_at": datetime.now().isoformat(),
                "version": mapping.version,
                "total_terms": len(mapping.mapping),
            },
            "terminology_usage": usage,
            "summary": {
                # Count the files actually visited, not (term, file) pairs.
                "total_files_scanned": len(self._iter_markdown_files(arch_dir)),
                # Only terms that occur in at least one file count as used.
                "unique_terms_used": sum(1 for per_file in usage.values() if per_file),
                "consistency_score": self.calculate_consistency_score(usage),
            },
        }

    def calculate_consistency_score(
        self, usage: Dict[str, Dict[str, List[Tuple[str, int]]]]
    ) -> float:
        """Compute the average per-match consistency score.

        Each match of a term present in ``standard_terminology`` scores 1.0
        when it equals the term's design value (ignoring case) and 0.5
        otherwise. Matches of unknown terms score nothing but still count in
        the denominator.

        Args:
            usage: Per-term results as produced by
                ``find_terminology_in_files`` (term -> file -> matches).

        Returns:
            The summed score divided by the total number of matches, or 1.0
            when there are no matches at all.
        """
        # Normalize by the number of matches, not by the number of files;
        # otherwise the score can exceed 1.0.
        total_occurrences = sum(
            len(matches)
            for per_file in usage.values()
            for matches in per_file.values()
        )
        if total_occurrences == 0:
            return 1.0

        consistency_score = 0.0
        for design_term, occurrences in usage.items():
            entry = self.standard_terminology.get(design_term)
            if not entry:
                continue
            expected = entry.design_value.lower()
            for matches in occurrences.values():
                for text, _ in matches:
                    # Full credit for the expected term, half for a variant.
                    consistency_score += 1.0 if text.lower() == expected else 0.5

        return consistency_score / total_occurrences
def main():
    """Command-line entry point: build the report and print it to stdout.

    Prints a banner, the report metadata and summary, the table of standard
    terms with their implemented value and status, and a fixed list of
    recommendations.
    """
    print("術語管理器 - 統一管理架構文檔術語")
    print("=" * 60)

    manager = TerminologyManager()
    report = manager.generate_report()

    # Report header followed by the metadata and summary figures.
    print("\n術語使用報告:")
    metadata = report["metadata"]
    print(f"版本: {metadata['version']}")
    print(f"生成時間: {metadata['generated_at']}")
    summary = report["summary"]
    print(f"一致性分數: {summary['consistency_score']:.2f}")
    print(f"使用術語總數: {summary['unique_terms_used']}")

    # One fixed-width row per standard term.
    print("\n術語對照表:")
    for term, entry in manager.standard_terminology.items():
        row = "{:10}{:30} [{}]".format(term, entry.actual_value, entry.status)
        print(row)

    # Static recommendations, printed one per line.
    print("\n建議:")
    suggestions = (
        "1. 在設計文檔中保留設計值說明",
        "2. 在實現文檔中使用實際值",
        "3. 定期檢查術語一致性",
        "4. 更新代碼註釋中的術語",
    )
    for suggestion in suggestions:
        print(suggestion)


if __name__ == "__main__":
    main()