- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
125 lines
3.6 KiB
Python
125 lines
3.6 KiB
Python
#!/opt/homebrew/bin/python3.11
|
|
"""
|
|
Local LLM (Gemma 4) Capability & Speed Benchmark
|
|
"""
|
|
|
|
import json
|
|
import time
|
|
import subprocess
|
|
|
|
# Ollama model tag passed to `ollama run` for every benchmark prompt.
MODEL = "gemma4:latest"

# Identifier of the processed item; its artifacts live under output/<UUID>/.
UUID = "384b0ff44aaaa1f1"

# ASR transcript (JSON) that supplies the benchmark context text.
# The path is relative, so it resolves against the current working directory.
ASR_PATH = f"output/{UUID}/{UUID}.asr.json"
|
|
|
|
|
|
def load_context(n_segments=20, start=50, path=None):
    """Load a chunk of ASR transcript text to use as benchmark context.

    Reads the ASR JSON file, takes ``n_segments`` consecutive entries of its
    top-level ``"segments"`` list beginning at index ``start``, and joins
    their ``"text"`` fields with single spaces.

    Args:
        n_segments: Number of consecutive segments to include.
        start: Index of the first segment to include. The default of 50
            skips the opening of the transcript and picks a middle chunk.
        path: ASR JSON file to read. Defaults to the module-level
            ``ASR_PATH``.

    Returns:
        The joined segment text. An empty string is returned when the file
        cannot be read or does not have the expected structure, so that a
        length check on the result rejects it; the error itself is printed
        rather than returned, because an error message returned as the
        context could be mistaken for transcript content.
    """
    asr_path = ASR_PATH if path is None else path
    try:
        # JSON text is UTF-8; do not depend on the locale's default encoding.
        with open(asr_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        segments = data.get("segments", [])[start : start + n_segments]
        # `or ""` tolerates segments whose "text" is missing or null.
        return " ".join(s.get("text") or "" for s in segments)
    except (OSError, ValueError, AttributeError, TypeError) as e:
        # OSError: unreadable file; ValueError: bad JSON / bad encoding;
        # AttributeError/TypeError: JSON that is not shaped as expected.
        print(f"❌ Error loading context: {e}")
        return ""
|
|
|
|
|
|
def run_test(name, prompt_template, context_text):
    """Run one prompt against the local model via ``ollama run`` and time it.

    Args:
        name: Label printed in the test header.
        prompt_template: ``str.format`` template for the instruction. A
            ``{context}`` placeholder, if present, is filled with
            ``context_text``; literal braces must be doubled (``{{``/``}}``).
        context_text: Transcript text; always appended after the prompt
            under a ``Context:`` heading.

    Returns:
        A ``(duration, output)`` tuple. ``duration`` is the elapsed time in
        seconds. ``output`` is the model's stripped stdout, or ``None`` when
        the command could not be started, timed out, or exited with a
        non-zero status.
    """
    print(f"\n🧪 Testing: {name}")
    print("-" * 50)

    prompt = prompt_template.format(context=context_text)
    full_input = f"{prompt}\n\nContext:\n{context_text}"

    # perf_counter is monotonic, so the measurement is immune to wall-clock
    # adjustments that can skew time.time() differences.
    start = time.perf_counter()
    try:
        result = subprocess.run(
            ["ollama", "run", MODEL, full_input],
            capture_output=True,
            text=True,
            # Decode model output as UTF-8 regardless of the locale, and
            # never let a stray byte abort the benchmark.
            encoding="utf-8",
            errors="replace",
            timeout=120,
        )
    except (OSError, subprocess.SubprocessError) as e:
        # OSError: `ollama` missing or not executable;
        # SubprocessError: includes TimeoutExpired after 120 s.
        duration = time.perf_counter() - start
        print(f"❌ Failed ({duration:.2f}s): {e}")
        return duration, None

    duration = time.perf_counter() - start

    # A non-zero exit status means the model did not answer; do not report
    # the (usually empty) stdout as a successful result.
    if result.returncode != 0:
        detail = result.stderr.strip() or f"exit code {result.returncode}"
        print(f"❌ Failed ({duration:.2f}s): {detail}")
        return duration, None

    output = result.stdout.strip()

    # Check if it's JSON (basic check: looks like a single object literal).
    is_json = output.startswith("{") and output.endswith("}")
    tag = "JSON ✅" if is_json else "Text ⚠️"

    # Only show an ellipsis when the preview was actually truncated.
    preview = output[:300] + ("..." if len(output) > 300 else "")

    print(f"⏱️ Duration: {duration:.2f}s | Format: {tag}")
    print(f"🤖 Output: {preview}")
    return duration, output
|
|
|
|
|
|
def main():
    """Run the four capability prompts on the loaded context and summarize.

    Loads the context with ``load_context()``, aborts when it is shorter
    than 50 characters, runs each benchmark prompt through ``run_test``,
    and prints the total and average duration of the tasks that produced
    output, followed by a speed note based on the average.
    """
    print(f"🚀 Starting Gemma 4 Capability Test on Context ({MODEL})")
    context = load_context()
    print(f"📂 Loaded Context: {len(context)} chars")
    if len(context) < 50:
        print("⚠️ Context too short, aborting.")
        return

    print(f"👀 Preview: {context[:100]}...")

    # (label, prompt template) pairs. Templates go through str.format in
    # run_test, so literal braces in the JSON examples are doubled.
    tests = [
        # Test 1: Summarization
        (
            "1. Plot Summarization (摘要)",
            "Summarize the following movie dialogue into ONE sentence. Do not explain, just give the summary.",
        ),
        # Test 2: 5W1H Extraction
        (
            "2. 5W1H Entity Extraction (資訊提取)",
            "Extract the following information from the text and output valid JSON only:\n{{'who': '...', 'what': '...', 'where': '...', 'when': '...'}}.",
        ),
        # Test 3: Sentiment Analysis
        (
            "3. Sentiment & Mood Detection (情緒分析)",
            "Analyze the emotional tone of the dialogue. Output JSON: {{'mood': ['...'], 'tension_level': 'high/medium/low'}}.",
        ),
        # Test 4: Logical Reasoning (Plot Deduction)
        (
            "4. Logical Reasoning (邏輯推理)",
            "Based on the text, answer: What are the characters discussing or investigating? Be specific.",
        ),
    ]

    results = [run_test(name, prompt, context) for name, prompt in tests]

    # Summary. run_test signals failure through a None *output*; the
    # duration is always a number, so filtering on it would let failed
    # runs skew the totals.
    durations = [duration for duration, output in results if output is not None]
    failed = len(results) - len(durations)

    if not durations:
        print("\n❌ No tasks completed successfully; nothing to summarize.")
        return

    total = sum(durations)
    avg = total / len(durations)
    print("\n📊 Benchmark Summary:")
    print(f"Total Time for {len(durations)} tasks: {total:.2f}s")
    print(f"Average Time: {avg:.2f}s per task")
    if failed:
        print(f"Failed tasks (excluded from timing): {failed}")

    if avg > 20:
        print(
            "\n⚠️ Note: Gemma 4 is accurate but slow. Consider asynchronous processing or smaller models for speed."
        )
    else:
        print("\n✅ Note: Performance is acceptable for background tasks.")
|
|
|
|
|
|
# Script entry point: run the benchmark only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|