"""
|
|
llava_api.py — LLaVA / Qwen VL query interface
|
|
"""
|
|
import json
|
|
import ollama
|
|
import yaml
|
|
from pathlib import Path
|
|
from Core.config_loader import load_config
|
|
|
|
_cfg = load_config("Brain")
|
|
|
|
# Load prompts from YAML (the authoritative source — bilingual, complete)
|
|
_yaml_path = Path(__file__).resolve().parent.parent / "Config" / "marcus_prompts.yaml"
|
|
with open(_yaml_path, encoding="utf-8") as _f:
|
|
_prompts = yaml.safe_load(_f)

OLLAMA_MODEL = _cfg["ollama_model"]
MAX_HISTORY = _cfg["max_history"]

# Cap batch and context on every request. Without this, llama.cpp on Jetson
# Orin NX allocates a ~7.5 GiB compute graph (defaults: batch 512, ctx 4096)
# that SIGKILLs the runner when Marcus already holds ~2 GiB of unified memory
# for YOLO/camera/audio. Halving batch roughly quarters the compute graph.
NUM_BATCH = _cfg.get("num_batch", 128)
NUM_CTX = _cfg.get("num_ctx", 2048)
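
# Back-of-envelope, carrying over the quadratic scaling claimed above (not
# re-measured): cutting batch 512 -> 128 (4x smaller) should shrink the
# ~7.5 GiB graph by roughly 4^2 = 16x, to ~0.5 GiB, which leaves headroom
# for the YOLO/camera/audio stack in unified memory.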

MAIN_PROMPT = _prompts["main_prompt"]
GOAL_PROMPT = _prompts["goal_prompt"]
PATROL_PROMPT = _prompts["patrol_prompt"]
TALK_PROMPT = _prompts["talk_prompt"]
VERIFY_PROMPT = _prompts["verify_prompt"]

# Conversation state
_conversation_history = []
_facts = []


def remember_fact(fact: str):
    """Store a fact told by the user for injection into LLaVA context."""
    if fact and fact not in _facts:
        _facts.append(fact)
        print(f" [Memory] Fact stored: {fact}")


def add_to_history(user_msg: str, assistant_msg: str):
    """Record one user/assistant exchange, dropping the oldest messages
    once the rolling window exceeds MAX_HISTORY entries."""
    _conversation_history.append({"role": "user", "content": user_msg})
    _conversation_history.append({"role": "assistant", "content": assistant_msg})
    while len(_conversation_history) > MAX_HISTORY:
        _conversation_history.pop(0)


def call_llava(prompt: str, img_b64, num_predict: int = 200, use_history: bool = False) -> str:
    """Send a prompt (plus an optional base64-encoded frame) to the Ollama
    model and return the stripped text reply."""
    messages = []
    if use_history and _conversation_history:
        messages.extend(_conversation_history)
    msg = {"role": "user", "content": prompt}
    if img_b64:
        msg["images"] = [img_b64]
    messages.append(msg)
    r = ollama.chat(model=OLLAMA_MODEL, messages=messages,
                    options={
                        "temperature": 0.0,
                        "num_predict": num_predict,
                        "num_batch": NUM_BATCH,
                        "num_ctx": NUM_CTX,
                    })
    return r["message"]["content"].strip()


def parse_json(raw: str):
    """Extract and parse the first JSON object from a string."""
    raw = raw.replace("```json", "").replace("```", "").strip()
    s = raw.find("{")
    e = raw.rfind("}") + 1
    if s == -1 or e == 0:
        return None
    try:
        return json.loads(raw[s:e])
    except json.JSONDecodeError:
        return None
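
# Salvage behaviour on typical model output (hypothetical examples):
#   parse_json('Sure! ```json\n{"speak": "hi"}\n```')  -> {"speak": "hi"}
#   parse_json("no json here")                         -> None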


def ask(command: str, img_b64) -> dict:
    """Send command + camera frame to LLaVA with conversation history."""
    try:
        facts_str = ("\nKnown facts: " + "; ".join(_facts) + ".") if _facts else ""
        raw = call_llava(MAIN_PROMPT.format(command=command, facts=facts_str), img_b64,
                         num_predict=_cfg["num_predict_main"], use_history=True)
        print(f" Raw: {raw}")
        d = parse_json(raw)
        speak = d.get("speak", raw) if d else raw
        add_to_history(command, speak)
        if d is None:
            return {"actions": [], "arm": None, "speak": raw, "abort": None}
        return d
    except Exception as ex:
        print(f" LLaVA error: {ex}")
        return {"actions": [], "arm": None, "speak": "Error.", "abort": None}


def ask_goal(goal: str, img_b64) -> dict:
    """Ask LLaVA whether the goal has been reached."""
    try:
        raw = call_llava(GOAL_PROMPT.format(goal=goal), img_b64,
                         num_predict=_cfg["num_predict_goal"])
        print(f" LLaVA: {raw}")
        d = parse_json(raw)
        if d is None:
            # Fallback: scan the free-text reply for success keywords.
            text = raw.lower()
            reached = any(w in text for w in
                          ["reached", "found", "i can see", "i see a person", "yes", "arrived"])
            return {"reached": reached, "next_move": "left", "duration": 0.5, "speak": raw[:100]}
        reached = d.get("reached", False)
        if isinstance(reached, str):
            # The model sometimes returns "reached" as a string; coerce to bool.
            reached = reached.lower() in ("true", "yes", "1")
        d["reached"] = reached
        return d
    except Exception:
        return {"reached": False, "next_move": "left", "duration": 0.5, "speak": "Continuing..."}


def ask_talk(command: str, img_b64, facts: str = "") -> dict:
    """Handle talk-only commands using the YAML talk_prompt."""
    try:
        prompt = TALK_PROMPT.format(command=command, facts=facts)
        raw = call_llava(prompt, img_b64, num_predict=_cfg["num_predict_talk"],
                         use_history=True)
        print(f" Raw: {raw}")
        d = parse_json(raw)
        if d is None:
            return {"actions": [], "arm": None, "speak": raw[:100], "abort": None}
        speak = d.get("speak", "")
        add_to_history(command, speak)
        return d
    except Exception as ex:
        print(f" Talk error: {ex}")
        return {"actions": [], "arm": None, "speak": f"Error: {ex}", "abort": None}


def ask_verify(target: str, condition: str, img_b64) -> str:
    """Verify a condition on a detected target. Returns 'yes' or 'no'."""
    try:
        prompt = VERIFY_PROMPT.format(target=target, condition=condition)
        raw = call_llava(prompt, img_b64, num_predict=_cfg["num_predict_verify"])
        cleaned = raw.strip().lower().rstrip(".,!?")
        words = cleaned.split()
        first_word = words[0] if words else "no"
        return first_word if first_word in ("yes", "no") else "no"
    except Exception:
        return "no"


def ask_patrol(img_b64) -> dict:
    """Ask LLaVA to assess scene during patrol."""
    try:
        raw = call_llava(PATROL_PROMPT, img_b64, num_predict=_cfg["num_predict_patrol"])
        d = parse_json(raw)
        return d or {"observation": raw[:80], "alert": None, "next_move": "forward", "duration": 1.0}
    except Exception:
        return {"observation": "Error", "alert": None, "next_move": "stop", "duration": 0}
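

if __name__ == "__main__":
    # Offline smoke test of the JSON salvage path. Exercising the ask_*
    # helpers needs a running Ollama server with OLLAMA_MODEL pulled, so
    # only parse_json is checked here.
    assert parse_json('```json\n{"reached": true}\n```') == {"reached": True}
    assert parse_json("no braces at all") is None
    print("parse_json OK")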