Files
momentry_core/scripts/test_llm_capabilities.py
Warren 8f05a7c188 feat: update Python processors and add utility scripts
- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00

125 lines
3.6 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
Local LLM (Gemma 4) Capability & Speed Benchmark
"""
import json
import time
import subprocess
# Identifier that names both the output directory and the ASR file read below.
UUID = "384b0ff44aaaa1f1"
# Path (relative to the current working directory) of the ASR JSON transcript
# that load_context() reads.
ASR_PATH = f"output/{UUID}/{UUID}.asr.json"
# Model tag passed to `ollama run` for every test.
MODEL = "gemma4:latest"
def load_context(n_segments=20, path=None, start=50):
    """Load a window of ASR transcript text to use as LLM context.

    Reads the ASR JSON file, takes ``n_segments`` consecutive entries of its
    top-level ``"segments"`` list beginning at index ``start``, and joins
    their ``"text"`` fields with single spaces.

    Args:
        n_segments: Number of consecutive segments to include.
        path: ASR JSON file to read. Defaults to the module-level
            ``ASR_PATH`` when ``None``.
        start: Index of the first segment to include. The default of 50
            picks a chunk from the middle of the transcript rather than
            its opening lines.

    Returns:
        The joined transcript text. An empty string is returned when the
        file cannot be read or does not have the expected structure, so
        that callers checking the context length treat the failure as
        "no context" instead of mistaking an error message for transcript
        text.
    """
    if path is None:
        path = ASR_PATH
    try:
        # Be explicit about the encoding: the transcript may contain
        # non-ASCII text and the platform default is not guaranteed.
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        segments = data.get("segments", [])[start : start + n_segments]
        # `or ""` also covers segments whose "text" is present but null.
        return " ".join(s.get("text") or "" for s in segments)
    except (OSError, ValueError, AttributeError, TypeError) as e:
        # OSError: missing/unreadable file; ValueError: invalid JSON or
        # bad encoding; AttributeError/TypeError: unexpected JSON shape.
        print(f"⚠️ Error loading context: {e}")
        return ""
def run_test(name, prompt_template, context_text, model=None, timeout=120):
    """Run one prompt through ``ollama run`` and report timing and format.

    The prompt is built by formatting ``prompt_template`` (``{context}`` is
    available as a placeholder; literal braces must be doubled) and then
    appending the context under a ``Context:`` heading.

    Args:
        name: Human-readable label printed in the test header.
        prompt_template: ``str.format`` template holding the instruction.
        context_text: Transcript text the model should work on.
        model: Ollama model tag. Defaults to the module-level ``MODEL``
            when ``None``.
        timeout: Maximum number of seconds to wait for the model.

    Returns:
        A ``(duration, output)`` tuple. ``duration`` is the elapsed
        wall-clock time in seconds. ``output`` is the model's stripped
        stdout, or ``None`` when the command could not be started, timed
        out, or exited with a non-zero status.
    """
    if model is None:
        model = MODEL
    print(f"\n🧪 Testing: {name}")
    print("-" * 50)
    prompt = prompt_template.format(context=context_text)
    full_input = f"{prompt}\n\nContext:\n{context_text}"

    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments while the model is running.
    start = time.perf_counter()
    try:
        result = subprocess.run(
            ["ollama", "run", model, full_input],
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            timeout=timeout,
        )
    except (OSError, subprocess.SubprocessError) as e:
        # OSError: the ollama binary is missing or not executable;
        # SubprocessError covers TimeoutExpired.
        duration = time.perf_counter() - start
        print(f"❌ Failed ({duration:.2f}s): {e}")
        return duration, None
    duration = time.perf_counter() - start

    if result.returncode != 0:
        # A failed run must not be reported as an (empty) model answer.
        detail = result.stderr.strip() or f"exit code {result.returncode}"
        print(f"❌ Failed ({duration:.2f}s): {detail}")
        return duration, None

    output = result.stdout.strip()
    # Actually parse the output instead of only looking at the outer
    # braces, so "{not json}" is not reported as valid JSON.
    try:
        is_json = isinstance(json.loads(output), dict)
    except ValueError:
        is_json = False
    tag = "JSON ✅" if is_json else "Text ⚠️"
    print(f"⏱️ Duration: {duration:.2f}s | Format: {tag}")
    print(f"🤖 Output: {output[:300]}...")
    return duration, output
def main():
    """Run the capability prompts against the model and print a timing summary.

    Loads the transcript context, aborts if it is shorter than 50
    characters, runs each prompt through ``run_test`` and then reports the
    total and average duration of the tasks that produced output.
    """
    print(f"🚀 Starting Gemma 4 Capability Test on Context ({MODEL})")
    context = load_context()
    print(f"📂 Loaded Context: {len(context)} chars")
    if len(context) < 50:
        print("⚠️ Context too short, aborting.")
        return
    print(f"👀 Preview: {context[:100]}...")

    # (label, prompt template) pairs. Braces are doubled because run_test
    # passes the template through str.format.
    tests = [
        (
            "1. Plot Summarization (摘要)",
            "Summarize the following movie dialogue into ONE sentence. Do not explain, just give the summary.",
        ),
        (
            "2. 5W1H Entity Extraction (資訊提取)",
            "Extract the following information from the text and output valid JSON only:\n{{'who': '...', 'what': '...', 'where': '...', 'when': '...'}}.",
        ),
        (
            "3. Sentiment & Mood Detection (情緒分析)",
            "Analyze the emotional tone of the dialogue. Output JSON: {{'mood': ['...'], 'tension_level': 'high/medium/low'}}.",
        ),
        (
            "4. Logical Reasoning (邏輯推理)",
            "Based on the text, answer: What are the characters discussing or investigating? Be specific.",
        ),
    ]
    results = [run_test(name, template, context) for name, template in tests]

    # Only tasks that produced output count towards the timing. The
    # duration itself is never None, so it cannot be used to detect failures.
    durations = [duration for duration, output in results if output]
    failed = len(results) - len(durations)
    if failed:
        print(f"\n⚠️ {failed} of {len(results)} tasks failed and were excluded from timing.")
    if not durations:
        print("\n❌ No task completed successfully; no benchmark summary available.")
        return

    total = sum(durations)
    avg = total / len(durations)
    print("\n📊 Benchmark Summary:")
    print(f"Total Time for {len(durations)} tasks: {total:.2f}s")
    print(f"Average Time: {avg:.2f}s per task")
    if avg > 20:
        print(
            "\n⚠️ Note: Gemma 4 is accurate but slow. Consider asynchronous processing or smaller models for speed."
        )
    else:
        print("\n✅ Note: Performance is acceptable for background tasks.")
# Run the benchmark only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()