# momentry_core/scripts/test_llm_capabilities.py

#!/opt/homebrew/bin/python3.11
"""
Local LLM (Gemma 4) Capability & Speed Benchmark
"""

import json
import time
import subprocess

# Ollama model tag handed to `ollama run` for every test.
MODEL = "gemma4:latest"

# Identifier used to build the ASR output path below.
UUID = "384b0ff44aaaa1f1"

# Location of the ASR JSON file, relative to the current working directory.
ASR_PATH = "output/{0}/{0}.asr.json".format(UUID)


def load_context(n_segments=20, start=50, path=None):
    """Load a chunk of ASR transcript text to use as LLM context.

    Reads the ASR JSON file, takes up to ``n_segments`` entries of its
    top-level ``"segments"`` list beginning at index ``start``, and joins
    their ``"text"`` fields with single spaces.

    Args:
        n_segments: Maximum number of segments to include.
        start: Index of the first segment to use. Defaults to 50 so the
            sample is taken from past the start of the transcript.
        path: ASR JSON file to read. Defaults to the module-level
            ``ASR_PATH``.

    Returns:
        The joined segment text. An empty string is returned (and the
        error is printed) when the file cannot be read or parsed or does
        not have the expected layout, so that an error message is never
        mistaken for transcript text by the caller.
    """
    if path is None:
        path = ASR_PATH
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        segments = data.get("segments", [])[start : start + n_segments]
        # A missing or null "text" field contributes an empty string.
        return " ".join(s.get("text") or "" for s in segments)
    except (OSError, ValueError, TypeError, AttributeError, KeyError) as e:
        # OSError: unreadable file; ValueError: bad JSON / bad encoding;
        # the rest: JSON that is valid but not shaped as expected.
        print(f"❌ Error loading context: {e}")
        return ""


def run_test(name, prompt_template, context_text):
    """Run one prompt against the local model via ``ollama run`` and time it.

    Args:
        name: Label printed in the test header.
        prompt_template: ``str.format`` template holding the instruction.
            Literal braces must be doubled; ``{context}`` is substituted
            with ``context_text`` if present.
        context_text: Text appended to the prompt under a ``Context:``
            heading.

    Returns:
        A ``(duration, output)`` tuple: elapsed seconds and the model's
        stripped stdout. ``output`` is ``None`` when the command could not
        be started, timed out, or exited with a non-zero status.
    """
    print(f"\n🧪 Testing: {name}")
    print("-" * 50)

    prompt = prompt_template.format(context=context_text)
    full_input = f"{prompt}\n\nContext:\n{context_text}"

    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments while the model is running.
    start = time.perf_counter()
    try:
        result = subprocess.run(
            ["ollama", "run", MODEL, full_input],
            capture_output=True,
            text=True,
            timeout=120,
        )
    except (subprocess.SubprocessError, OSError, UnicodeError) as e:
        # Timeout, missing `ollama` binary, or undecodable output.
        duration = time.perf_counter() - start
        print(f"❌ Failed ({duration:.2f}s): {e}")
        return duration, None
    duration = time.perf_counter() - start

    # A non-zero exit means the model did not answer; do not report the
    # (usually empty) stdout as a successful result.
    if result.returncode != 0:
        detail = result.stderr.strip() or f"exit code {result.returncode}"
        print(f"❌ Failed ({duration:.2f}s): {detail}")
        return duration, None

    output = result.stdout.strip()

    # Report "JSON" only when the whole output parses as a JSON object.
    try:
        is_json = isinstance(json.loads(output), dict)
    except ValueError:
        is_json = False
    tag = "JSON ✅" if is_json else "Text ⚠️"

    print(f"⏱️ Duration: {duration:.2f}s | Format: {tag}")
    print(f"🤖 Output: {output[:300]}...")
    return duration, output


def main():
    """Run the capability tests against the loaded context and print a summary.

    Loads the transcript context, aborts if it is shorter than 50
    characters, runs each prompt through ``run_test`` and then prints the
    total and average duration of the tests that produced output.
    """
    print(f"🚀 Starting Gemma 4 Capability Test on Context ({MODEL})")
    context = load_context()
    print(f"📂 Loaded Context: {len(context)} chars")
    if len(context) < 50:
        print("⚠️ Context too short, aborting.")
        return

    print(f"👀 Preview: {context[:100]}...")

    # (label, prompt template) pairs. Templates go through str.format in
    # run_test, hence the doubled braces around the JSON examples.
    tests = [
        # Test 1: Summarization
        (
            "1. Plot Summarization (摘要)",
            "Summarize the following movie dialogue into ONE sentence. Do not explain, just give the summary.",
        ),
        # Test 2: 5W1H Extraction
        (
            "2. 5W1H Entity Extraction (資訊提取)",
            "Extract the following information from the text and output valid JSON only:\n{{'who': '...', 'what': '...', 'where': '...', 'when': '...'}}.",
        ),
        # Test 3: Sentiment Analysis
        (
            "3. Sentiment & Mood Detection (情緒分析)",
            "Analyze the emotional tone of the dialogue. Output JSON: {{'mood': ['...'], 'tension_level': 'high/medium/low'}}.",
        ),
        # Test 4: Logical Reasoning (Plot Deduction)
        (
            "4. Logical Reasoning (邏輯推理)",
            "Based on the text, answer: What are the characters discussing or investigating? Be specific.",
        ),
    ]
    results = [run_test(name, template, context) for name, template in tests]

    # Summary: a failed run is signalled by output None (the duration is
    # always a number), so filter on the output to keep failures and
    # timeouts out of the timing statistics.
    durations = [duration for duration, output in results if output is not None]
    if not durations:
        print("\n❌ No test completed successfully; nothing to summarize.")
        return

    total = sum(durations)
    avg = total / len(durations)
    print("\n📊 Benchmark Summary:")
    print(f"Total Time for {len(durations)} tasks: {total:.2f}s")
    print(f"Average Time: {avg:.2f}s per task")

    if avg > 20:
        print(
            "\n⚠️ Note: Gemma 4 is accurate but slow. Consider asynchronous processing or smaller models for speed."
        )
    else:
        print("\n✅ Note: Performance is acceptable for background tasks.")

if __name__ == "__main__":
    main()