- Update ASR, face, OCR, pose processors - Add release pre-flight check script - Add synonym generation, chunk processing scripts - Add face recognition, stamp search utilities
312 lines
11 KiB
Python
312 lines
11 KiB
Python
#!/usr/bin/env python3
"""
LLM-Based Chinese-English Synonym Generator for Momentry

Generates a synonym database by querying Gemma4 through a llama.cpp server.
Output format: a JSON object mapping each word to a list of its synonyms.

Usage:
    python scripts/generate_synonyms_llamacpp.py                              # Use the default llama.cpp server
    python scripts/generate_synonyms_llamacpp.py --url http://127.0.0.1:8081  # Use a specific server URL
    python scripts/generate_synonyms_llamacpp.py --test                       # Quick test with three sample words
    python scripts/generate_synonyms_llamacpp.py --help                       # Show help

Requires:
    - A running llama.cpp server (default: http://127.0.0.1:8081)
    - pip install requests
"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
import time
|
|
import argparse
|
|
from typing import Dict, List, Optional
|
|
import requests
|
|
|
|
# ======================== Configuration ========================
|
|
|
|
# Fallback values used when the matching CLI flag or keyword argument is omitted.
DEFAULT_API_URL: str = "http://127.0.0.1:8081"  # base URL of the llama.cpp server
DEFAULT_MODEL: str = "gemma4"  # sent as the "model" field of each request
DEFAULT_TIMEOUT: int = 60  # timeout handed to requests.post for each query
|
|
|
|
# ======================== Seed Words for Video Search Context ========================
|
|
|
|
# Category name -> words to generate synonyms for.
# Categories are processed in insertion order, and a word that appears in more
# than one category is only queried once because results are keyed by word.
SEED_WORDS: Dict[str, List[str]] = {
    # --- Actions, feelings, speech, and scene vocabulary ---
    "action": ["run", "walk", "move", "chase", "escape", "fight", "attack"],
    "emotion": ["happy", "sad", "angry", "afraid", "surprised", "calm"],
    "speech": ["talk", "say", "tell", "ask", "answer", "shout", "whisper"],
    "scene": ["scene", "moment", "part", "clip", "sequence", "segment"],
    # --- People and how they relate to each other ---
    "person": ["man", "woman", "boy", "girl", "child", "person"],
    "relationship": ["friend", "enemy", "lover", "partner", "colleague"],
    "authority": ["police", "detective", "officer", "guard", "agent"],
    # --- Things and places ---
    "vehicle": ["car", "truck", "bus", "van", "vehicle", "automobile"],
    "location": ["house", "office", "street", "city", "country", "place"],
    "food": ["eat", "dinner", "lunch", "breakfast", "meal", "snack"],
    "weapon": ["gun", "knife", "sword", "bomb", "weapon"],
    # --- Events and activities ---
    "event": ["party", "meeting", "gathering", "celebration", "festival"],
    "crime": ["theft", "murder", "robbery", "assault", "kidnapping"],
    "travel": ["travel", "trip", "journey", "flight", "drive", "ride"],
    # --- Time of day and units of duration ---
    "time": ["morning", "noon", "evening", "night", "afternoon"],
    "duration": ["second", "minute", "hour", "day", "week", "month", "year"],
    # --- Emotional and mental states ---
    "positive": ["love", "joy", "peace", "hope", "trust", "success"],
    "negative": ["fear", "anger", "pain", "death", "loss", "failure"],
    "mental": ["think", "know", "believe", "understand", "remember", "forget"],
    # --- Senses ---
    "sight": ["see", "look", "watch", "observe", "notice", "find"],
    "sound": ["hear", "listen", "noise", "music", "voice", "speak"],
    # --- Money and transactions ---
    "money": ["cash", "dollar", "coin", "payment", "price", "wealth"],
    "transaction": ["buy", "sell", "pay", "spend", "cost", "price"],
    # --- Chinese-language seeds ---
    "chinese_emotion": ["愛", "恨", "喜", "怒", "哀", "樂", "愁", "驚"],
    "chinese_action": ["走", "跑", "說", "看", "聽", "想", "做", "吃"],
    "chinese_object": ["房子", "車子", "書", "電話", "電腦", "手機"],
    "chinese_person": ["男人", "女人", "小孩", "老人", "朋友", "敵人"],
}
|
|
|
|
# ======================== LLM Query Functions ========================
|
|
|
|
# System message sent with every chat-completion request made by query_llm.
# It asks the model for a bare JSON array so the reply can be parsed directly.
SYSTEM_PROMPT = "\n".join(
    (
        "You are a synonym generation assistant. For each given word, provide 8-15 synonyms in the same language.",
        "Rules:",
        "1. Return ONLY a JSON array of strings, nothing else",
        "2. Synonyms should be contextually relevant for video content search",
        "3. Include common words, informal terms, and related concepts",
        "4. Do NOT include the input word in the output",
        "5. All synonyms must be in the SAME language as the input word",
        "6. No explanations, no markdown, just the JSON array",
        "",
        'Example input: "money"',
        'Example output: ["cash", "dollar", "currency", "funds", "bucks", "greenbacks", "coins", "wealth", "payment"]',
        "",
        'Example input: "快樂"',
        'Example output: ["開心", "高興", "愉快", "歡喜", "歡樂", "喜悅", "愉悅", "幸福"]',
    )
)
|
|
|
|
|
|
def check_server_health(api_url: str) -> bool:
    """Report whether the llama.cpp server at ``api_url`` passes its health check.

    Sends ``GET {api_url}/health`` with a 5-second timeout and prints a
    one-line status message describing the outcome.

    Args:
        api_url: Base URL of the llama.cpp server; a trailing slash is ignored.

    Returns:
        True only when the endpoint answers with HTTP 200. False for any other
        status code and for every request failure (connection refused,
        timeout, malformed URL, ...); this function never raises for those.
    """
    health_url = f"{api_url.rstrip('/')}/health"

    try:
        resp = requests.get(health_url, timeout=5)
    except requests.exceptions.ConnectionError:
        print(f"❌ Cannot connect to llama.cpp server at {api_url}")
        return False
    except requests.exceptions.Timeout:
        print("❌ Connection to llama.cpp server timed out")
        return False
    except requests.exceptions.RequestException as exc:
        # Covers the remaining requests failures (e.g. a URL without a scheme)
        # so the caller gets a clean False instead of a traceback.
        print(f"❌ Health check request to {api_url} failed: {exc}")
        return False

    if resp.status_code == 200:
        print(f"✅ llama.cpp server is running at {api_url}")
        return True

    # The server answered but is not healthy; say so rather than failing silently.
    print(f"❌ llama.cpp server at {api_url} is not ready (HTTP {resp.status_code})")
    return False
|
|
|
|
|
|
def _extract_json_array(content: str) -> str:
    """Return the part of ``content`` most likely to be the JSON array."""
    text = content.strip()

    # Prefer an array inside a markdown code fence (```json ... ``` or ``` ... ```).
    if "```" in text:
        for part in text.split("```"):
            part = part.strip()
            if part.startswith("json"):
                part = part[4:].strip()
            if part.startswith("[") and part.endswith("]"):
                return part

    # Otherwise take the outermost [...] span, which tolerates a reply that
    # wraps the array in a sentence of prose.
    start, end = text.find("["), text.rfind("]")
    if 0 <= start < end:
        return text[start : end + 1]
    return text


def _clean_synonyms(raw: list, word: str) -> List[str]:
    """Keep non-empty strings from ``raw``: stripped, lower-cased, de-duplicated, without ``word``."""
    target = word.strip().lower()
    seen = set()
    cleaned: List[str] = []
    for item in raw:
        if not isinstance(item, str):
            # Models occasionally emit numbers/null/nested lists; skip them
            # instead of failing the whole response.
            continue
        normalized = item.strip().lower()
        if not normalized or normalized == target or normalized in seen:
            continue
        seen.add(normalized)
        cleaned.append(normalized)
    return cleaned


def query_llm(
    word: str,
    api_url: str = DEFAULT_API_URL,
    model: str = DEFAULT_MODEL,
    timeout: int = DEFAULT_TIMEOUT,
    retries: int = 3,
) -> Optional[List[str]]:
    """Query Gemma4 via the llama.cpp OpenAI-compatible endpoint for synonyms of ``word``.

    Posts a chat-completion request to ``{api_url}/v1/chat/completions`` and
    parses the reply as a JSON array of strings.

    Args:
        word: The word to find synonyms for.
        api_url: Base URL of the llama.cpp server; a trailing slash is ignored.
        model: Value sent in the request's ``model`` field.
        timeout: Timeout passed to ``requests.post`` for each attempt.
        retries: Maximum number of attempts. HTTP errors, timeouts, JSON parse
            errors and other exceptions are retried; a reply that parses but
            contains no usable synonyms is not.

    Returns:
        A non-empty list of stripped, lower-cased, de-duplicated synonyms that
        excludes ``word`` itself, or None when no usable answer was obtained.
    """
    endpoint = f"{api_url.rstrip('/')}/v1/chat/completions"
    # The request body does not change between attempts, so build it once.
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f'Give synonyms for: "{word}"'},
        ],
        "temperature": 0.3,
        "stream": False,
        "max_tokens": 256,
    }

    for attempt in range(retries):
        try:
            response = requests.post(
                endpoint,
                json=payload,
                headers={"Content-Type": "application/json"},
                timeout=timeout,
            )

            if response.status_code == 200:
                content = response.json()["choices"][0]["message"]["content"]
                parsed = json.loads(_extract_json_array(content))

                synonyms = _clean_synonyms(parsed, word) if isinstance(parsed, list) else []
                if synonyms:
                    return synonyms

                # Well-formed JSON but nothing usable: a retry with the same
                # prompt is unlikely to help, so give up on this word.
                print(f" ⚠ Invalid format for '{word}'")
                return None

            print(f" ⚠ HTTP {response.status_code} for '{word}'")
            print(f" Response: {response.text[:200]}")

        except json.JSONDecodeError:
            print(f" ⚠ JSON parse error for '{word}' (attempt {attempt + 1})")
        except requests.exceptions.Timeout:
            print(f" ⚠ Timeout for '{word}' (attempt {attempt + 1})")
        except Exception as e:
            # Best-effort boundary: one bad word must not abort a whole batch.
            print(f" ⚠ Error for '{word}': {e} (attempt {attempt + 1})")

        # Back off before the next attempt, but not after the final one.
        if attempt < retries - 1:
            time.sleep(2)

    return None
|
|
|
|
|
|
# ======================== Batch Generation ========================
|
|
|
|
|
|
def _load_resume_db(output_file: str) -> Dict[str, List[str]]:
    """Load a previous run's output for auto-resume; return {} when absent or unusable."""
    if not os.path.exists(output_file):
        return {}

    try:
        with open(output_file, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers both invalid JSON and undecodable bytes. Warn,
        # because the file is about to be overwritten by this run.
        print(f"⚠ Could not resume from {output_file} ({exc}); starting fresh")
        return {}

    if not isinstance(loaded, dict):
        print(f"⚠ Ignoring {output_file}: expected a JSON object, got {type(loaded).__name__}")
        return {}

    print(f"📥 Resumed from {output_file} ({len(loaded)} entries)")
    return loaded


def _write_db_atomically(synonym_db: Dict[str, List[str]], output_file: str) -> None:
    """Write ``synonym_db`` to a temp file, then rename it over ``output_file``."""
    tmp_file = f"{output_file}.tmp"
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump(synonym_db, f, ensure_ascii=False, indent=2)
    # os.replace swaps the file in one step, so an interrupted write cannot
    # leave a truncated resume file behind.
    os.replace(tmp_file, output_file)


def generate_synonyms_batch(
    seed_words: Dict[str, List[str]],
    api_url: str = DEFAULT_API_URL,
    model: str = DEFAULT_MODEL,
    output_file: str = "data/llm_synonyms.json",
    rate_limit: float = 1.0,
) -> Dict[str, List[str]]:
    """Generate synonyms for all seed words and persist them as JSON.

    Words already present in ``output_file`` are skipped, so an interrupted
    run resumes where it stopped. Progress is written after every category,
    including when a category is cut short by an exception or Ctrl-C.

    Args:
        seed_words: Mapping of category name to the words in that category.
        api_url: Base URL of the llama.cpp server, forwarded to ``query_llm``.
        model: Model name, forwarded to ``query_llm``.
        output_file: Path of the JSON database to resume from and write to.
            Its parent directory is created if it does not exist.
        rate_limit: Seconds to pause after each queried word; negative values
            are treated as 0.

    Returns:
        The word -> synonyms mapping, including entries loaded from a
        previous run.
    """
    # Create the output directory up front so a missing folder fails here
    # rather than after a whole category of LLM queries.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    synonym_db = _load_resume_db(output_file)

    total_words = sum(len(words) for words in seed_words.values())
    processed = 0  # words actually sent to the LLM in this run (cached ones excluded)
    pause = max(rate_limit, 0.0)  # time.sleep rejects negative durations

    print(f"\n📝 Generating synonyms for {total_words} words using {model}...")
    print(f"🔗 Server: {api_url}")
    print("=" * 60)

    for category, words in seed_words.items():
        print(f"\n📂 Category: {category}")
        try:
            for word in words:
                # Flush so the word is visible while the query is in flight.
                print(f" 🔍 {word}...", end=" ", flush=True)

                if word in synonym_db:
                    print(f"⏭ cached ({len(synonym_db[word])} synonyms)")
                    continue

                synonyms = query_llm(word, api_url=api_url, model=model)
                if synonyms:
                    synonym_db[word] = synonyms
                    print(f"✅ {len(synonyms)} synonyms")
                else:
                    print("❌ failed")

                processed += 1
                time.sleep(pause)
        finally:
            # Save after each category, and also when one is interrupted, so
            # completed words are never lost.
            _write_db_atomically(synonym_db, output_file)

    print("\n" + "=" * 60)
    print(f"✅ Done! Saved {len(synonym_db)} entries to {output_file}")
    print(f" Total words processed: {processed}/{total_words}")

    return synonym_db
|
|
|
|
|
|
# ======================== Main ========================
|
|
|
|
|
|
def _non_negative_float(value: str) -> float:
    """argparse ``type`` callback: parse ``value`` as a float and reject negatives/NaN."""
    try:
        number = float(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid float value: {value!r}") from None
    # "not >= 0" also rejects NaN, which time.sleep would refuse later on.
    if not number >= 0:
        raise argparse.ArgumentTypeError(f"must be a non-negative number, got {value!r}")
    return number


def main():
    """Command-line entry point: parse arguments, check the server, run the batch.

    Exits with status 1 when the llama.cpp server fails its health check or
    when ``--category`` names a category that is not in ``SEED_WORDS``.
    Invalid argument values (for example a negative ``--rate-limit``) are
    rejected by argparse before any request is made.
    """
    parser = argparse.ArgumentParser(
        description="LLM-Based Chinese-English Synonym Generator (llama.cpp / Gemma4)"
    )
    parser.add_argument(
        "--url",
        type=str,
        default=DEFAULT_API_URL,
        help=f"llama.cpp server URL (default: {DEFAULT_API_URL})",
    )
    parser.add_argument(
        "--model",
        type=str,
        default=DEFAULT_MODEL,
        help=f"Model name (default: {DEFAULT_MODEL})",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="data/llm_synonyms.json",
        help="Output file path (default: data/llm_synonyms.json)",
    )
    parser.add_argument(
        "--rate-limit",
        # Validated here because the value ends up in time.sleep, which
        # raises on negative input only after the first query has run.
        type=_non_negative_float,
        default=0.5,
        help="Rate limit in seconds between requests (default: 0.5)",
    )
    parser.add_argument(
        "--category",
        type=str,
        default=None,
        help="Process only this category (e.g., 'action', 'emotion')",
    )
    parser.add_argument(
        "--test", action="store_true", help="Test with a few words only"
    )

    args = parser.parse_args()

    # Check server health before doing any work.
    if not check_server_health(args.url):
        print("\n💡 Start llama.cpp server with:")
        print(" llama-server --model <gemma4.gguf> --port 8081")
        sys.exit(1)

    # Prepare seed words.
    seeds = SEED_WORDS.copy()
    if args.category:
        if args.category not in seeds:
            print(f"Error: category '{args.category}' not found")
            print(f"Available categories: {', '.join(sorted(seeds))}")
            sys.exit(1)
        seeds = {args.category: seeds[args.category]}

    # --test replaces whatever was selected above with a three-word sample.
    if args.test:
        seeds = {"test": ["happy", "money", "愛"]}

    # Generate synonyms.
    generate_synonyms_batch(
        seed_words=seeds,
        api_url=args.url,
        model=args.model,
        output_file=args.output,
        rate_limit=args.rate_limit,
    )
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|