# momentry_core/scripts/terminology_manager.py

#!/usr/bin/env python3
"""
術語管理器 - 用於統一管理和更新架構文檔中的術語
"""

import json
import re
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple


@dataclass
class TerminologyEntry:
    """One terminology record pairing a design term with its implementation.

    Attributes:
        design_concept: Name of the design concept.
        design_value: Value (term) used in the design.
        actual_value: Value used by the actual implementation.
        status: Status marker.
        description: Description of the term.
        last_updated: Time of the last update, stored as a string.
        source_files: Files that use this term.
    """

    design_concept: str
    design_value: str
    actual_value: str
    status: str
    description: str
    last_updated: str
    source_files: List[str]


@dataclass
class TerminologyMapping:
    """Terminology mapping table: the entries plus version and timestamp metadata."""

    mapping: Dict[str, TerminologyEntry]  # terminology entries keyed by term name
    version: str  # version label of the mapping table
    created_at: str  # creation timestamp, stored as a string
    updated_at: str  # last-update timestamp, stored as a string


class TerminologyManager:
    """Manage the mapping between design terms and their implemented names.

    The manager owns a built-in table of standard terminology, persists it as
    JSON inside ``data_dir`` and can scan Markdown documents for occurrences
    of each design term.
    """

    def __init__(self, data_dir: Path = Path("data/terminology")):
        """Create the data directory, build the standard table and persist it.

        Args:
            data_dir: Directory that holds ``terminology_mapping.json``. It is
                created (including parents) when missing. A plain string is
                accepted as well.
        """
        # Coerce so a str argument works with mkdir() and the "/" operator.
        self.data_dir = Path(data_dir)
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.mapping_file = self.data_dir / "terminology_mapping.json"
        self.usage_file = self.data_dir / "terminology_usage.json"

        # A single timestamp shared by every built-in entry.
        now = datetime.now().isoformat()

        # Standard terminology table: design term -> implementation details.
        self.standard_terminology = {
            "sentence": TerminologyEntry(
                design_concept="句子級分片",
                design_value="sentence",
                actual_value="ChunkType::Sentence",
                status="✅ 完整實現",
                description="基於 ASR 轉錄結果的單句級別分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md", "CHUNK_RULE_1_SENTENCE.md"],
            ),
            "visual": TerminologyEntry(
                design_concept="視覺物件級分片",
                design_value="visual",
                actual_value="未實現",
                status="❌ 未實現",
                description="基於 YOLO 物件檢測的視覺分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md"],
            ),
            "scene": TerminologyEntry(
                design_concept="場景級分片",
                design_value="scene",
                actual_value="ChunkType::Cut",
                status="⚠️ 部分實現",
                description="基於 CUT 場景檢測算法的分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md", "CHUNK_RULE_3_SCENE.md"],
            ),
            "summary": TerminologyEntry(
                design_concept="摘要級分片",
                design_value="summary",
                actual_value="ChunkType::Story",
                status="⚠️ 概念調整",
                description="基於分片聚合的敘事總結分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md", "CHUNK_RULE_4_SUMMARY.md"],
            ),
            "time": TerminologyEntry(
                design_concept="時間基準分片",
                design_value="time",
                actual_value="ChunkType::TimeBased",
                status="✅ 完整實現",
                description="固定時間間隔的分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md"],
            ),
            "trace": TerminologyEntry(
                design_concept="軌跡追蹤分片",
                design_value="trace",
                actual_value="ChunkType::Trace",
                status="✅ 完整實現",
                description="物件或人物的時空軌跡分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md"],
            ),
        }

        self.initialize()

    def initialize(self):
        """Write the mapping file if it does not exist yet."""
        if not self.mapping_file.exists():
            self.save_mapping()

    def save_mapping(self):
        """Serialize the standard terminology table to ``mapping_file`` as JSON."""
        now = datetime.now().isoformat()
        mapping_data = TerminologyMapping(
            mapping=self.standard_terminology,
            version="1.0",
            created_at=now,
            updated_at=now,
        )

        with open(self.mapping_file, "w", encoding="utf-8") as f:
            # asdict() recursively converts the nested TerminologyEntry objects.
            json.dump(asdict(mapping_data), f, ensure_ascii=False, indent=2)

        print(f"✓ 術語映射表已保存: {self.mapping_file}")

    def load_mapping(self) -> "TerminologyMapping":
        """Load the mapping file and rebuild the dataclass objects.

        Returns:
            A ``TerminologyMapping`` whose ``mapping`` values are
            ``TerminologyEntry`` instances.
        """
        with open(self.mapping_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        # The JSON stores each entry as a plain dict (see save_mapping);
        # convert them back so callers can use attribute access.
        data["mapping"] = {
            term: TerminologyEntry(**fields)
            for term, fields in data["mapping"].items()
        }
        return TerminologyMapping(**data)

    @staticmethod
    def _markdown_files(directory: Path) -> List[Path]:
        """Return every ``.md`` file below *directory*, sorted; empty if it is not a directory."""
        root = Path(directory)
        if not root.is_dir():
            return []
        return sorted(
            path
            for path in root.rglob("*")
            if path.is_file() and path.name.endswith(".md")
        )

    def find_terminology_in_files(
        self, pattern: str, directory: Path
    ) -> Dict[str, List[Tuple[str, int]]]:
        """Search all Markdown files below *directory* for *pattern*.

        Args:
            pattern: Regular expression, matched case-insensitively.
            directory: Root directory that is searched recursively.

        Returns:
            A dict mapping each file path (as ``str``) that has at least one
            match to a list of ``(matched_text, start_offset)`` tuples.
        """
        regex = re.compile(pattern, re.IGNORECASE)
        results = {}

        for file_path in self._markdown_files(directory):
            content = file_path.read_text(encoding="utf-8")
            hits = [(match.group(), match.start()) for match in regex.finditer(content)]
            if hits:
                results[str(file_path)] = hits

        return results

    def generate_report(self, arch_dir: Optional[Path] = None) -> Dict[str, Any]:
        """Build a usage report for every term in the saved mapping.

        Args:
            arch_dir: Directory with the Markdown documents to scan. Defaults
                to ``docs_v1.0/ARCHITECTURE``.

        Returns:
            A dict with ``metadata``, ``terminology_usage`` (term -> result of
            ``find_terminology_in_files``) and ``summary``.
        """
        mapping = self.load_mapping()
        if arch_dir is None:
            arch_dir = Path("docs_v1.0/ARCHITECTURE")

        # NOTE: the escaped design value has no word boundaries, so it also
        # matches inside longer words (e.g. "time" in "runtime").
        usage = {
            design_term: self.find_terminology_in_files(
                re.escape(entry.design_value), arch_dir
            )
            for design_term, entry in mapping.mapping.items()
        }

        report = {
            "metadata": {
                "generated_at": datetime.now().isoformat(),
                "version": mapping.version,
                "total_terms": len(mapping.mapping),
            },
            "terminology_usage": usage,
            "summary": {
                # Number of Markdown files actually scanned, each counted once.
                "total_files_scanned": len(self._markdown_files(arch_dir)),
                # Only terms that occur in at least one file count as used.
                "unique_terms_used": sum(1 for files in usage.values() if files),
                "consistency_score": self.calculate_consistency_score(usage),
            },
        }

        return report

    def calculate_consistency_score(self, usage: Dict[str, Any]) -> float:
        """Compute the average per-match consistency score.

        Each match scores 1.0 when it equals the term's design value
        (ignoring case) and 0.5 otherwise. Terms missing from
        ``standard_terminology`` are skipped.

        Args:
            usage: Mapping of term -> {file path -> [(matched_text, offset)]}.

        Returns:
            The mean score in the range 0.5..1.0, or 1.0 when there is no
            match to score.
        """
        scored_matches = 0
        consistency_score = 0.0

        for design_term, occurrences in usage.items():
            entry = self.standard_terminology.get(design_term)
            if not entry:
                continue

            expected = entry.design_value.lower()
            for matches in occurrences.values():
                for text, _ in matches:
                    scored_matches += 1
                    consistency_score += 1.0 if text.lower() == expected else 0.5

        if scored_matches == 0:
            return 1.0

        # Normalize by the number of matches so the result never exceeds 1.0.
        return consistency_score / scored_matches


def main():
    """Entry point: print the usage report, the term table and recommendations."""
    banner_rule = "=" * 60
    print("術語管理器 - 統一管理架構文檔術語")
    print(banner_rule)

    manager = TerminologyManager()

    # Build the report before printing any of its sections.
    report = manager.generate_report()

    print("\n術語使用報告:")
    metadata = report["metadata"]
    print(f"版本: {metadata['version']}")
    print(f"生成時間: {metadata['generated_at']}")
    summary = report["summary"]
    print(f"一致性分數: {summary['consistency_score']:.2f}")
    print(f"使用術語總數: {summary['unique_terms_used']}")

    print("\n術語對照表:")
    for term in manager.standard_terminology:
        entry = manager.standard_terminology[term]
        print(f"{term:10} → {entry.actual_value:30} [{entry.status}]")

    print("\n建議：")
    recommendations = (
        "1. 在設計文檔中保留設計值說明",
        "2. 在實現文檔中使用實際值",
        "3. 定期檢查術語一致性",
        "4. 更新代碼註釋中的術語",
    )
    for line in recommendations:
        print(line)


# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()