- Update ASR, face, OCR, pose processors - Add release pre-flight check script - Add synonym generation, chunk processing scripts - Add face recognition, stamp search utilities
240 lines
8.3 KiB
Python
240 lines
8.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
術語管理器 - 用於統一管理和更新架構文檔中的術語
|
|
"""
|
|
|
|
import json
import os
import re
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
|
|
@dataclass
class TerminologyEntry:
    """A single terminology record pairing a design term with its implementation."""

    # Human-readable name of the design concept.
    design_concept: str
    # Term as written in the design documents.
    design_value: str
    # Term as it appears in the actual implementation.
    actual_value: str
    # Implementation status marker.
    status: str
    # Free-text description of the term.
    description: str
    # Timestamp string of the last update.
    last_updated: str
    # Names of the files that use this term.
    source_files: List[str]
|
|
|
|
|
|
@dataclass
class TerminologyMapping:
    """The full terminology table together with its version and timestamps."""

    # Entries keyed by design term.
    mapping: Dict[str, TerminologyEntry]
    # Version label of the table.
    version: str
    # Timestamp string recorded when the table was created.
    created_at: str
    # Timestamp string recorded when the table was last updated.
    updated_at: str
|
|
|
|
|
|
class TerminologyManager:
    """Maintain the design-term to implementation-term table for the docs.

    On construction the manager creates its data directory, builds the
    built-in table (``standard_terminology``) and writes it to
    ``terminology_mapping.json`` if that file does not exist yet.  It can
    then scan Markdown files for the design terms and build a usage report.
    """

    def __init__(self, data_dir: Path = Path("data/terminology")):
        """Create the data directory and the built-in terminology table.

        Args:
            data_dir: Directory holding the JSON files.  A plain string is
                accepted as well and converted to ``Path``.
        """
        self.data_dir = Path(data_dir)
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.mapping_file = self.data_dir / "terminology_mapping.json"
        self.usage_file = self.data_dir / "terminology_usage.json"

        # One timestamp for the whole table so every entry agrees.
        now = datetime.now().isoformat()

        # Standard design-term -> implementation-term table.
        self.standard_terminology = {
            "sentence": TerminologyEntry(
                design_concept="句子級分片",
                design_value="sentence",
                actual_value="ChunkType::Sentence",
                status="✅ 完整實現",
                description="基於 ASR 轉錄結果的單句級別分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md", "CHUNK_RULE_1_SENTENCE.md"],
            ),
            "visual": TerminologyEntry(
                design_concept="視覺物件級分片",
                design_value="visual",
                actual_value="未實現",
                status="❌ 未實現",
                description="基於 YOLO 物件檢測的視覺分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md"],
            ),
            "scene": TerminologyEntry(
                design_concept="場景級分片",
                design_value="scene",
                actual_value="ChunkType::Cut",
                status="⚠️ 部分實現",
                description="基於 CUT 場景檢測算法的分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md", "CHUNK_RULE_3_SCENE.md"],
            ),
            "summary": TerminologyEntry(
                design_concept="摘要級分片",
                design_value="summary",
                actual_value="ChunkType::Story",
                status="⚠️ 概念調整",
                description="基於分片聚合的敘事總結分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md", "CHUNK_RULE_4_SUMMARY.md"],
            ),
            "time": TerminologyEntry(
                design_concept="時間基準分片",
                design_value="time",
                actual_value="ChunkType::TimeBased",
                status="✅ 完整實現",
                description="固定時間間隔的分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md"],
            ),
            "trace": TerminologyEntry(
                design_concept="軌跡追蹤分片",
                design_value="trace",
                actual_value="ChunkType::Trace",
                status="✅ 完整實現",
                description="物件或人物的時空軌跡分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md"],
            ),
        }

        self.initialize()

    def initialize(self):
        """Write the mapping file if it does not exist yet."""
        if not self.mapping_file.exists():
            self.save_mapping()

    def save_mapping(self):
        """Serialize ``standard_terminology`` to the mapping file as JSON."""
        now = datetime.now().isoformat()
        mapping_data = TerminologyMapping(
            mapping=self.standard_terminology,
            version="1.0",
            created_at=now,
            updated_at=now,
        )

        with open(self.mapping_file, "w", encoding="utf-8") as f:
            json.dump(asdict(mapping_data), f, ensure_ascii=False, indent=2)

        print(f"✓ 術語映射表已保存: {self.mapping_file}")

    def load_mapping(self) -> "TerminologyMapping":
        """Load the mapping file and return it as a ``TerminologyMapping``.

        ``save_mapping`` stores each entry as a plain JSON object, so the
        entries are rebuilt into ``TerminologyEntry`` instances here;
        callers rely on attribute access such as ``entry.design_value``.
        """
        with open(self.mapping_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        data["mapping"] = {
            term: TerminologyEntry(**entry_fields)
            for term, entry_fields in data["mapping"].items()
        }
        return TerminologyMapping(**data)

    def find_terminology_in_files(
        self, pattern: str, directory: Path
    ) -> Dict[str, List[Tuple[str, int]]]:
        """Search every ``.md`` file under ``directory`` for ``pattern``.

        Args:
            pattern: Regular expression, matched case-insensitively.
            directory: Root of the tree to walk.

        Returns:
            A dict mapping file path (as a string) to a list of
            ``(matched_text, start_offset)`` tuples.  Files without a match
            are omitted.
        """
        # Compile once instead of once per file.
        compiled = re.compile(pattern, re.IGNORECASE)
        results: Dict[str, List[Tuple[str, int]]] = {}

        for root, _dirs, files in os.walk(directory):
            for file_name in files:
                if not file_name.endswith(".md"):
                    continue

                file_path = Path(root) / file_name
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()

                hits = [(m.group(), m.start()) for m in compiled.finditer(content)]
                if hits:
                    results[str(file_path)] = hits

        return results

    def generate_report(self, arch_dir: Optional[Path] = None) -> Dict[str, Any]:
        """Build a usage report for every term in the stored mapping.

        Args:
            arch_dir: Directory to scan.  Defaults to
                ``docs_v1.0/ARCHITECTURE`` when omitted.

        Returns:
            A dict with three keys: ``metadata`` (generation time, version,
            term count), ``terminology_usage`` (per-term search results) and
            ``summary``.  In the summary, ``total_files_scanned`` is the
            number of distinct files containing at least one match, and
            ``unique_terms_used`` is the number of terms matched at least
            once.
        """
        mapping = self.load_mapping()
        scan_dir = Path("docs_v1.0/ARCHITECTURE") if arch_dir is None else Path(arch_dir)

        # The design value is searched literally, hence re.escape.
        usage = {
            design_term: self.find_terminology_in_files(
                re.escape(entry.design_value), scan_dir
            )
            for design_term, entry in mapping.mapping.items()
        }

        # A file that contains several terms must be counted only once.
        matched_files = {path for files in usage.values() for path in files}

        return {
            "metadata": {
                "generated_at": datetime.now().isoformat(),
                "version": mapping.version,
                "total_terms": len(mapping.mapping),
            },
            "terminology_usage": usage,
            "summary": {
                "total_files_scanned": len(matched_files),
                "unique_terms_used": sum(1 for files in usage.values() if files),
                "consistency_score": self.calculate_consistency_score(usage),
            },
        }

    def calculate_consistency_score(self, usage: Dict[str, Any]) -> float:
        """Return the average per-match consistency score.

        A match scores 1.0 when it equals the term's design value
        (ignoring case) and 0.5 otherwise.  Matches for a term missing from
        ``standard_terminology`` score 0 but still count in the denominator.

        Args:
            usage: ``{term: {file_path: [(matched_text, offset), ...]}}`` as
                produced by ``generate_report``.

        Returns:
            The summed score divided by the number of matches, or 1.0 when
            there are no matches.
        """
        total_matches = 0
        score = 0.0

        for design_term, occurrences in usage.items():
            entry = self.standard_terminology.get(design_term)
            for matches in occurrences.values():
                # Normalize by matches, since the score is summed per match.
                total_matches += len(matches)
                if entry is None:
                    continue
                expected = entry.design_value.lower()
                for matched_text, _offset in matches:
                    score += 1.0 if matched_text.lower() == expected else 0.5

        if total_matches == 0:
            return 1.0
        return score / total_matches
|
|
|
|
|
|
def main():
    """Print a terminology usage report and the built-in term table."""
    print("術語管理器 - 統一管理架構文檔術語")
    print("=" * 60)

    manager = TerminologyManager()

    # Build the report once and pull out the sections that get printed.
    report = manager.generate_report()
    metadata = report["metadata"]
    summary = report["summary"]

    print("\n術語使用報告:")
    print(f"版本: {metadata['version']}")
    print(f"生成時間: {metadata['generated_at']}")
    print(f"一致性分數: {summary['consistency_score']:.2f}")
    print(f"使用術語總數: {summary['unique_terms_used']}")

    print("\n術語對照表:")
    for term, entry in manager.standard_terminology.items():
        print(f"{term:10} → {entry.actual_value:30} [{entry.status}]")

    print("\n建議:")
    recommendations = (
        "1. 在設計文檔中保留設計值說明",
        "2. 在實現文檔中使用實際值",
        "3. 定期檢查術語一致性",
        "4. 更新代碼註釋中的術語",
    )
    for line in recommendations:
        print(line)


if __name__ == "__main__":
    main()
|