"""
|
|
llava_api.py — Qwen-VL query interface (via Ollama)
|
|
|
|
Three deployment modes, chosen via config_Brain.json:
|
|
|
|
1. subsystems.vlm = false
|
|
→ every ask*() returns a safe fallback dict. Marcus runs in
|
|
regex-only "safe mode": no LLM load on the Jetson, no GPU/CPU
|
|
contention with Holosoma, robot won't fall from thrashing.
|
|
Vision questions just answer "Scene understanding is disabled
|
|
— running in safe mode." Everything else (movement, places,
|
|
patrol, autonomous) still works.
|
|
|
|
2. ollama_host = "http://127.0.0.1:11434" + subsystems.vlm = true
|
|
→ Ollama runs on the Jetson. Old behavior — competes with
|
|
Holosoma for memory. Unsafe during walking with a 3B VL model.
|
|
|
|
3. ollama_host = "http://192.168.123.222:11434" + subsystems.vlm = true
|
|
→ Ollama runs on the workstation. Jetson stays light, Holosoma
|
|
keeps its 50 Hz real-time deadline, and the brain still gets
|
|
full Qwen-VL. Best mode for demos / walking with conversation.
|
|
"""
import json
from pathlib import Path

import ollama
import yaml

from Core.config_loader import load_config

_cfg = load_config("Brain")

_yaml_path = Path(__file__).resolve().parent.parent / "Config" / "marcus_prompts.yaml"
with open(_yaml_path, encoding="utf-8") as _f:
    _prompts = yaml.safe_load(_f)

OLLAMA_MODEL = _cfg["ollama_model"]
OLLAMA_HOST = _cfg.get("ollama_host", "http://127.0.0.1:11434")
VLM_ENABLED = bool(_cfg.get("subsystems", {}).get("vlm", True))
MAX_HISTORY = _cfg["max_history"]
# Cap batch and context on every request. Without this, llama.cpp on Jetson
# Orin NX allocates a ~7.5 GiB compute graph (defaults: batch 512, ctx 4096)
# that SIGKILLs the runner when Marcus already holds ~2 GiB of unified memory
# for YOLO/camera/audio. Halving batch roughly quarters the compute graph.
NUM_BATCH = _cfg.get("num_batch", 128)
NUM_CTX = _cfg.get("num_ctx", 2048)
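# Rough check of that scaling claim (assumed, not re-measured here): if batch 512
# costs ~7.5 GiB of compute graph and each halving of the batch roughly quarters
# it, batch 256 lands near 1.9 GiB and the default of 128 near 0.5 GiB.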
MAIN_PROMPT = _prompts["main_prompt"]
GOAL_PROMPT = _prompts["goal_prompt"]
PATROL_PROMPT = _prompts["patrol_prompt"]
TALK_PROMPT = _prompts["talk_prompt"]
VERIFY_PROMPT = _prompts["verify_prompt"]

# Explicit Ollama client — lets us route to a remote host (e.g., workstation)
# without relying on the OLLAMA_HOST env var being set in the launch shell.
#
# CRITICAL: timeout=300 (5 min). The Python `ollama` library defaults to
# httpx's short timeout. On the Jetson a cold-load of qwen2.5vl:3b takes
# 60-90 s; with the default timeout the client disconnects mid-load,
# Ollama interprets that as "client cancelled", aborts the in-progress
# load, and starts over on the next request. This caused the repeated
# OOM crashes — the model was never finishing a single load before being
# thrown away and re-started.
_client = ollama.Client(host=OLLAMA_HOST, timeout=300)

# Safe-mode replies used when subsystems.vlm == false
_VLM_OFF_TALK = "Scene understanding is disabled — Sanad is in safe mode."
_VLM_OFF_EMPTY = {"actions": [], "arm": None, "speak": _VLM_OFF_TALK, "abort": None}

# Conversation state
_conversation_history = []
_facts = []

def remember_fact(fact: str):
    """Store a fact told by the user for injection into the VLM context."""
    if fact and fact not in _facts:
        _facts.append(fact)
        print(f" [Memory] Fact stored: {fact}")

def add_to_history(user_msg: str, assistant_msg: str):
    """Append a user/assistant exchange, trimming to the last MAX_HISTORY messages."""
    _conversation_history.append({"role": "user", "content": user_msg})
    _conversation_history.append({"role": "assistant", "content": assistant_msg})
    while len(_conversation_history) > MAX_HISTORY:
        _conversation_history.pop(0)

def call_llava(prompt: str, img_b64, num_predict: int = 200, use_history: bool = False) -> str:
    """Send one chat request to the VLM and return the raw reply text ("" in safe mode)."""
    if not VLM_ENABLED:
        return ""  # safe mode — caller must handle the empty string
    messages = []
    if use_history and _conversation_history:
        messages.extend(_conversation_history)
    msg = {"role": "user", "content": prompt}
    if img_b64:
        msg["images"] = [img_b64]
    messages.append(msg)

    # When an image is attached, pause YOLO to free iGPU memory for the
    # vision-encoder activations (~1.5 GiB). Without this, concurrent YOLO
    # inference + Qwen vision-encode exceeds the 15 GiB Jetson iGPU budget
    # and the llama runner is reaped by the OOM killer (status code: 500).
    # Text-only calls skip the pause — they fit easily and YOLO stays hot.
    _paused = False
    if img_b64:
        try:
            from API.yolo_api import yolo_pause, YOLO_AVAILABLE
            if YOLO_AVAILABLE:
                yolo_pause()
                _paused = True
        except Exception:
            pass

    try:
        r = _client.chat(model=OLLAMA_MODEL, messages=messages,
                         options={
                             "temperature": 0.0,
                             "num_predict": num_predict,
                             "num_batch": NUM_BATCH,
                             "num_ctx": NUM_CTX,
                         })
        return r["message"]["content"].strip()
    finally:
        if _paused:
            try:
                from API.yolo_api import yolo_resume
                yolo_resume()
            except Exception:
                pass

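# Typical use (illustrative; the real call sites are the ask_*() helpers below):
#   raw = call_llava("Describe the scene in one sentence.", frame_b64, num_predict=60)
# where frame_b64 is a base64-encoded camera frame, or None for a text-only call.
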
def parse_json(raw: str):
    """Extract and parse the first JSON object found in a string."""
    raw = raw.replace("```json", "").replace("```", "").strip()
    s = raw.find("{")
    e = raw.rfind("}") + 1
    if s == -1 or e == 0:
        return None
    try:
        return json.loads(raw[s:e])
    except json.JSONDecodeError:
        return None

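# Example (illustrative): a reply such as '```json\n{"speak": "Hello"}\n```' parses
# to {"speak": "Hello"}, while free-form text containing no braces returns None.
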
def ask(command: str, img_b64) -> dict:
    """
    Send command + camera frame to the VLM.

    NOTE: this path does NOT use conversation history, even though other ask_*
    paths do. With temperature=0 (required for reliable JSON action output),
    including the last answer in the prompt makes the model lock onto
    repeating it — `what do you see` then always replies with whatever it saw
    the first time, regardless of the current frame. Vision grounding has to
    be stateless per call. Chitchat (ask_talk) keeps history because there
    the whole point is continuity.
    """
    if not VLM_ENABLED:
        return dict(_VLM_OFF_EMPTY)
    try:
        facts_str = ("\nKnown facts: " + "; ".join(_facts) + ".") if _facts else ""
        raw = call_llava(MAIN_PROMPT.format(command=command, facts=facts_str), img_b64,
                         num_predict=_cfg["num_predict_main"], use_history=False)
        print(f" Raw: {raw}")
        d = parse_json(raw)
        speak = d.get("speak", raw) if d else raw
        # Still write to history so ask_talk() has context — just don't
        # READ from it in this path (would cause lock-on repetition).
        add_to_history(command, speak)
        if d is None:
            return {"actions": [], "arm": None, "speak": raw, "abort": None}
        return d
    except Exception as ex:
        print(f" VLM error: {ex}")
        return {"actions": [], "arm": None, "speak": "VLM error.", "abort": None}

def ask_goal(goal: str, img_b64) -> dict:
    """Ask the VLM if the goal is reached."""
    if not VLM_ENABLED:
        return {"reached": False, "next_move": "left", "duration": 0.5,
                "speak": "VLM disabled — relying on YOLO fast-match only."}
    try:
        raw = call_llava(GOAL_PROMPT.format(goal=goal), img_b64,
                         num_predict=_cfg["num_predict_goal"])
        print(f" VLM: {raw}")
        d = parse_json(raw)
        if d is None:
            text = raw.lower()
            reached = any(w in text for w in
                          ["reached", "found", "i can see", "i see a person", "yes", "arrived"])
            return {"reached": reached, "next_move": "left", "duration": 0.5, "speak": raw[:100]}
        reached = d.get("reached", False)
        if isinstance(reached, str):
            reached = reached.lower() in ("true", "yes", "1")
        d["reached"] = reached
        return d
    except Exception:
        return {"reached": False, "next_move": "left", "duration": 0.5, "speak": "Continuing..."}

def ask_talk(command: str, img_b64, facts: str = "") -> dict:
    """Handle talk-only commands using the YAML talk_prompt."""
    if not VLM_ENABLED:
        return dict(_VLM_OFF_EMPTY)
    try:
        prompt = TALK_PROMPT.format(command=command, facts=facts)
        raw = call_llava(prompt, img_b64, num_predict=_cfg["num_predict_talk"],
                         use_history=True)
        print(f" Raw: {raw}")
        d = parse_json(raw)
        if d is None:
            return {"actions": [], "arm": None, "speak": raw[:100], "abort": None}
        speak = d.get("speak", "")
        add_to_history(command, speak)
        return d
    except Exception as ex:
        print(f" Talk error: {ex}")
        return {"actions": [], "arm": None, "speak": f"Error: {ex}", "abort": None}

def ask_verify(target: str, condition: str, img_b64) -> str:
    """Verify a condition on a detected target. Returns 'yes' or 'no'."""
    if not VLM_ENABLED:
        # Without VLM we can't verify compound conditions; trust the YOLO match.
        return "yes"
    try:
        prompt = VERIFY_PROMPT.format(target=target, condition=condition)
        raw = call_llava(prompt, img_b64, num_predict=_cfg["num_predict_verify"])
        cleaned = raw.strip().lower().rstrip(".,!?")
        first_word = cleaned.split()[0] if cleaned.split() else "no"
        return first_word if first_word in ("yes", "no") else "no"
    except Exception:
        return "no"

def ask_patrol(img_b64) -> dict:
    """Ask the VLM to assess the scene during patrol."""
    if not VLM_ENABLED:
        return {"observation": "VLM off — patrolling without scene analysis.",
                "alert": None, "next_move": "forward", "duration": 1.0}
    try:
        raw = call_llava(PATROL_PROMPT, img_b64, num_predict=_cfg["num_predict_patrol"])
        d = parse_json(raw)
        return d or {"observation": raw[:80], "alert": None, "next_move": "forward", "duration": 1.0}
    except Exception:
        return {"observation": "Error", "alert": None, "next_move": "stop", "duration": 0}