Files
momentry_core/scripts/check_architecture_docs.py
Warren 8f05a7c188 feat: update Python processors and add utility scripts
- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00

483 lines
17 KiB
Python

#!/usr/bin/env python3
"""
Architecture documentation consistency checker.

What it does:
1. Checks that links between the architecture documents are valid
2. Verifies terminology consistency
3. Checks that design-vs-implementation differences are marked
4. Generates a documentation quality report

Usage:
python3 scripts/check_architecture_docs.py [--report FILE] [--verbose] [--check-only]
"""
import os
import re
import sys
import glob
import json
import argparse
from pathlib import Path
from typing import Dict, List, Set, Tuple, Optional
from collections import defaultdict
# Configuration
# Directory scanned for documents, located relative to this script's parent
# directory.
ARCHITECTURE_DIR = Path(__file__).parent.parent / "docs_v1.0" / "ARCHITECTURE"
# File suffixes that are treated as documents.
DOC_EXTENSIONS = [".md"]
# File names that are skipped when collecting documents.
IGNORE_FILES = ["README.md", "index.md"]

# Terminology consistency configuration: canonical term -> regex patterns for
# the spellings of that term.
# The patterns are raw strings, so the character class must be written
# ``[_\s]`` (underscore or whitespace). ``[_\\s]`` would instead mean
# "underscore, backslash or the letter s" and never match "chunk type".
TERMINOLOGY_PATTERNS = {
    "chunk_type": [
        r"chunk[_\s]?type",
        r"分片類型",
        r"ChunkType",
    ],
    "sentence": [
        r"sentence",
        r"句子",
        r"Rule 1",
    ],
    "visual": [
        r"visual",
        r"視覺",
        r"Rule 2",
    ],
    "scene": [
        r"scene",
        r"場景",
        r"Rule 3",
    ],
    "summary": [
        r"summary",
        r"摘要",
        r"Rule 4",
    ],
    "time_based": [
        r"time[_\s]?based",
        r"時間基準",
        r"TimeBased",
    ],
    "cut": [
        r"cut",
        r"CUT",
        r"場景分割",
    ],
    "trace": [
        r"trace",
        r"軌跡",
        r"Trace",
    ],
    "story": [
        r"story",
        r"故事",
        r"Story",
    ],
}
class DocumentIssue:
    """One problem found in a document.

    Attributes:
        file_path: Document in which the problem was found.
        line_number: Line the problem is reported against.
        issue_type: Category of the problem; the checker uses
            "broken_link", "terminology", "format" and "consistency".
        description: Explanation of the problem.
        severity: One of "error", "warning" or "info".
        suggested_fix: Optional hint on how to resolve the problem.
    """

    def __init__(
        self,
        file_path: Path,
        line_number: int,
        issue_type: str,
        description: str,
        severity: str,
        suggested_fix: Optional[str] = None,
    ):
        # Location of the problem.
        self.file_path, self.line_number = file_path, line_number
        # Classification and explanation.
        self.issue_type, self.description = issue_type, description
        # Importance and optional remedy.
        self.severity, self.suggested_fix = severity, suggested_fix
class DocumentStats:
    """Per-document counters plus the list of issues found in that document."""

    # Counters that start at zero for every document, in attribute order.
    _COUNTER_NAMES = (
        "total_lines",
        "total_links",
        "broken_links",
        "terminology_issues",
        "format_issues",
        "consistency_issues",
    )

    def __init__(self, file_path: Path):
        self.file_path = file_path
        for counter_name in self._COUNTER_NAMES:
            setattr(self, counter_name, 0)
        # Every issue recorded against this document, in discovery order.
        self.issues: List[DocumentIssue] = []
class ArchitectureDocChecker:
"""架構文檔檢查器"""
def __init__(self, architecture_dir: Path):
self.architecture_dir = architecture_dir
self.all_md_files: List[Path] = []
self.file_contents: Dict[Path, List[str]] = {}
self.document_stats: Dict[Path, DocumentStats] = {}
def load_all_documents(self) -> None:
"""加載所有文檔"""
print(f"📁 掃描架構文檔目錄: {self.architecture_dir}")
# 掃描所有 Markdown 文件
for ext in DOC_EXTENSIONS:
pattern = self.architecture_dir / "**" / f"*{ext}"
for file_path in glob.glob(str(pattern), recursive=True):
file_path = Path(file_path)
if file_path.name in IGNORE_FILES:
continue
self.all_md_files.append(file_path)
# 加載文件內容
for file_path in self.all_md_files:
try:
with open(file_path, "r", encoding="utf-8") as f:
content = f.readlines()
self.file_contents[file_path] = content
# 初始化統計信息
self.document_stats[file_path] = DocumentStats(file_path=file_path)
self.document_stats[file_path].total_lines = len(content)
except Exception as e:
print(f"❌ 無法讀取文件 {file_path}: {e}")
print(f"✅ 加載了 {len(self.all_md_files)} 個文檔文件")
def check_links(self) -> None:
"""檢查文檔鏈接有效性"""
print("\n🔗 檢查文檔鏈接...")
# 收集所有可用的文件路徑(相對路徑)
available_files = set()
for file_path in self.all_md_files:
# 相對於架構目錄的路徑
rel_path = file_path.relative_to(self.architecture_dir)
available_files.add(str(rel_path))
available_files.add(str(rel_path).lower())
link_pattern = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
for file_path, content_lines in self.file_contents.items():
stats = self.document_stats[file_path]
for line_num, line in enumerate(content_lines, 1):
matches = link_pattern.findall(line)
stats.total_links += len(matches)
for link_text, link_url in matches:
# 檢查鏈接有效性
issue = self._check_single_link(
file_path, line_num, link_text, link_url, available_files
)
if issue:
stats.issues.append(issue)
stats.broken_links += 1
def _check_single_link(
self,
file_path: Path,
line_num: int,
link_text: str,
link_url: str,
available_files: Set[str],
) -> Optional[DocumentIssue]:
"""檢查單個鏈接"""
# 忽略外部鏈接
if link_url.startswith(("http://", "https://", "mailto:", "#")):
return None
# 清理鏈接(移除查詢參數和錨點)
clean_url = link_url.split("#")[0].split("?")[0]
# 檢查相對路徑鏈接
if clean_url.startswith("./"):
# 相對於當前文件的鏈接
current_dir = file_path.parent
target_path = (current_dir / clean_url[2:]).resolve()
# 轉換為相對於架構目錄的路徑
try:
rel_path = target_path.relative_to(self.architecture_dir)
if str(rel_path) not in available_files:
return DocumentIssue(
file_path=file_path,
line_number=line_num,
issue_type="broken_link",
description=f"鏈接目標不存在: {link_url} (解析為: {rel_path})",
severity="error",
suggested_fix=f"檢查文件是否存在: {target_path}",
)
except ValueError:
# 目標不在架構目錄內
if not target_path.exists():
return DocumentIssue(
file_path=file_path,
line_number=line_num,
issue_type="broken_link",
description=f"鏈接目標不存在: {link_url}",
severity="error",
suggested_fix=f"創建文件或修正鏈接: {target_path}",
)
# 檢查絕對路徑鏈接(相對於架構目錄)
elif not clean_url.startswith("/"):
if clean_url not in available_files:
return DocumentIssue(
file_path=file_path,
line_number=line_num,
issue_type="broken_link",
description=f"鏈接目標不存在: {link_url}",
severity="error",
suggested_fix=f"檢查文件是否存在: {clean_url}",
)
return None
def check_terminology(self) -> None:
"""檢查術語一致性"""
print("\n📝 檢查術語一致性...")
for file_path, content_lines in self.file_contents.items():
stats = self.document_stats[file_path]
for line_num, line in enumerate(content_lines, 1):
# 檢查設計與實現不一致的術語
design_terms = ["visual", "scene", "summary"]
impl_terms = ["TimeBased", "Cut", "Trace", "Story"]
# 如果文件提到設計術語,檢查是否有對應的實現說明
if any(term in line.lower() for term in design_terms):
# 檢查是否在 DESIGN_IMPLEMENTATION_GAP.md 中有說明
if file_path.name != "DESIGN_IMPLEMENTATION_GAP.md":
# 檢查前後文是否有提到實現差異
context_start = max(0, line_num - 3)
context_end = min(len(content_lines), line_num + 2)
context = content_lines[context_start:context_end]
context_text = "".join(context)
if not any(
impl_term in context_text for impl_term in impl_terms
):
stats.terminology_issues += 1
stats.issues.append(
DocumentIssue(
file_path=file_path,
line_number=line_num,
issue_type="terminology",
description="設計術語缺少實現狀態說明",
severity="warning",
suggested_fix="添加實現狀態說明或參考 DESIGN_IMPLEMENTATION_GAP.md",
)
)
def check_format(self) -> None:
"""檢查文檔格式"""
print("\n📋 檢查文檔格式...")
for file_path, content_lines in self.file_contents.items():
stats = self.document_stats[file_path]
# 檢查文件頭部格式
if content_lines and not content_lines[0].startswith("# "):
stats.format_issues += 1
stats.issues.append(
DocumentIssue(
file_path=file_path,
line_number=1,
issue_type="format",
description="文件缺少 H1 標題",
severity="warning",
suggested_fix="在第一行添加 # 標題",
)
)
# 檢查版本歷史表格
has_version_table = False
for line in content_lines:
if (
"版本歷史" in line
or "版本记录" in line
or "Version History" in line
):
has_version_table = True
break
if not has_version_table:
stats.format_issues += 1
stats.issues.append(
DocumentIssue(
file_path=file_path,
line_number=1,
issue_type="format",
description="文件缺少版本歷史表格",
severity="info",
suggested_fix="添加版本歷史表格",
)
)
def check_consistency(self) -> None:
"""檢查文檔間的一致性"""
print("\n🔄 檢查文檔間一致性...")
# 檢查 ARCHITECTURE_OVERVIEW.md 是否引用所有其他文檔
overview_file = self.architecture_dir / "ARCHITECTURE_OVERVIEW.md"
if overview_file in self.file_contents:
overview_content = "".join(self.file_contents[overview_file])
for other_file in self.all_md_files:
if other_file == overview_file:
continue
other_filename = other_file.name
if other_filename not in overview_content:
stats = self.document_stats[overview_file]
stats.consistency_issues += 1
stats.issues.append(
DocumentIssue(
file_path=overview_file,
line_number=1,
issue_type="consistency",
description=f"總覽文件未引用: {other_filename}",
severity="info",
suggested_fix=f"在相關文件索引中添加對 {other_filename} 的引用",
)
)
def generate_report(self, output_file: Optional[Path] = None) -> Dict:
"""生成檢查報告"""
print("\n📊 生成檢查報告...")
total_issues = 0
total_files = len(self.document_stats)
report = {
"summary": {
"total_files": total_files,
"total_issues": 0,
"issues_by_type": defaultdict(int),
"issues_by_severity": defaultdict(int),
},
"files": [],
}
for file_path, stats in self.document_stats.items():
file_report = {
"file": str(file_path.relative_to(self.architecture_dir.parent.parent)),
"total_lines": stats.total_lines,
"total_links": stats.total_links,
"broken_links": stats.broken_links,
"terminology_issues": stats.terminology_issues,
"format_issues": stats.format_issues,
"consistency_issues": stats.consistency_issues,
"issues": [],
}
for issue in stats.issues:
issue_dict = {
"line": issue.line_number,
"type": issue.issue_type,
"severity": issue.severity,
"description": issue.description,
"suggested_fix": issue.suggested_fix,
}
file_report["issues"].append(issue_dict)
# 更新統計
report["summary"]["total_issues"] += 1
report["summary"]["issues_by_type"][issue.issue_type] += 1
report["summary"]["issues_by_severity"][issue.severity] += 1
report["files"].append(file_report)
total_issues += len(stats.issues)
# 輸出報告
if output_file:
with open(output_file, "w", encoding="utf-8") as f:
json.dump(report, f, ensure_ascii=False, indent=2)
print(f"✅ 報告已保存到: {output_file}")
else:
# 輸出簡要報告到控制台
print(f"\n{'=' * 60}")
print("架構文檔檢查報告")
print(f"{'=' * 60}")
print(f"📁 檢查文件數: {total_files}")
print(f"⚠️ 發現問題數: {total_issues}")
print(f"\n問題分類:")
for issue_type, count in report["summary"]["issues_by_type"].items():
print(f" - {issue_type}: {count}")
print(f"\n嚴重程度:")
for severity, count in report["summary"]["issues_by_severity"].items():
print(f" - {severity}: {count}")
if total_issues > 0:
print(f"\n🔍 詳細問題:")
for file_report in report["files"]:
if file_report["issues"]:
print(f"\n文件: {file_report['file']}")
for issue in file_report["issues"]:
print(
f"{issue['line']} [{issue['severity']}] {issue['type']}: {issue['description']}"
)
return report
def run_all_checks(self) -> Dict:
"""運行所有檢查"""
print("🚀 開始架構文檔一致性檢查")
print(f"檢查目錄: {self.architecture_dir}")
self.load_all_documents()
self.check_links()
self.check_terminology()
self.check_format()
self.check_consistency()
return self.generate_report()
def main():
    """Command-line entry point.

    Options:
        --report FILE   Also save the full report as JSON to FILE.
        --verbose, -v   Accepted but currently has no effect.
        --check-only    Load the documents and run only the link and
                        terminology checks, without producing a report.

    Exits with status 1 when the architecture directory is missing or when
    the full run finds any issue, and with status 0 when the full run finds
    none.
    """
    parser = argparse.ArgumentParser(description="架構文檔一致性檢查工具")
    parser.add_argument("--report", type=str, help="生成 JSON 報告文件")
    parser.add_argument("--verbose", "-v", action="store_true", help="詳細輸出")
    parser.add_argument("--check-only", action="store_true", help="只檢查不生成報告")
    args = parser.parse_args()

    # The directory to scan must exist.
    if not ARCHITECTURE_DIR.exists():
        print(f"❌ 架構目錄不存在: {ARCHITECTURE_DIR}")
        sys.exit(1)

    checker = ArchitectureDocChecker(ARCHITECTURE_DIR)

    if args.check_only:
        # NOTE(review): issues found in this mode are neither printed nor
        # reflected in the exit status - confirm that this is intended.
        checker.load_all_documents()
        checker.check_links()
        checker.check_terminology()
        print("\n✅ 檢查完成(僅檢查模式)")
        return

    report = checker.run_all_checks()

    # Previously the --report path was computed but never used, so no JSON
    # file was ever produced. Write it here from the returned report.
    if args.report:
        output_file = Path(args.report)
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(report, f, ensure_ascii=False, indent=2)
        print(f"✅ 報告已保存到: {output_file}")

    # The exit status reflects whether any issue was found.
    if report["summary"]["total_issues"] > 0:
        print(f"\n❌ 發現 {report['summary']['total_issues']} 個問題,請修復")
        sys.exit(1)
    print("\n✅ 所有檢查通過!")
    sys.exit(0)


if __name__ == "__main__":
    main()