""" llava_api.py — Qwen-VL query interface (via Ollama) Three deployment modes, chosen via config_Brain.json: 1. subsystems.vlm = false → every ask*() returns a safe fallback dict. Marcus runs in regex-only "safe mode": no LLM load on the Jetson, no GPU/CPU contention with Holosoma, robot won't fall from thrashing. Vision questions just answer "Scene understanding is disabled — running in safe mode." Everything else (movement, places, patrol, autonomous) still works. 2. ollama_host = "http://127.0.0.1:11434" + subsystems.vlm = true → Ollama runs on the Jetson. Old behavior — competes with Holosoma for memory. Unsafe during walking with a 3B VL model. 3. ollama_host = "http://192.168.123.222:11434" + subsystems.vlm = true → Ollama runs on the workstation. Jetson stays light, Holosoma keeps its 50 Hz real-time deadline, and the brain still gets full Qwen-VL. Best mode for demos / walking with conversation. """ import json import ollama import yaml from pathlib import Path from Core.config_loader import load_config _cfg = load_config("Brain") _yaml_path = Path(__file__).resolve().parent.parent / "Config" / "marcus_prompts.yaml" with open(_yaml_path, encoding="utf-8") as _f: _prompts = yaml.safe_load(_f) OLLAMA_MODEL = _cfg["ollama_model"] OLLAMA_HOST = _cfg.get("ollama_host", "http://127.0.0.1:11434") VLM_ENABLED = bool(_cfg.get("subsystems", {}).get("vlm", True)) MAX_HISTORY = _cfg["max_history"] # Cap batch and context on every request. Without this, llama.cpp on Jetson # Orin NX allocates a ~7.5 GiB compute graph (defaults: batch 512, ctx 4096) # that SIGKILLs the runner when Marcus already holds ~2 GiB of unified memory # for YOLO/camera/audio. Halving batch roughly quarters the compute graph. NUM_BATCH = _cfg.get("num_batch", 128) NUM_CTX = _cfg.get("num_ctx", 2048) MAIN_PROMPT = _prompts["main_prompt"] GOAL_PROMPT = _prompts["goal_prompt"] PATROL_PROMPT = _prompts["patrol_prompt"] TALK_PROMPT = _prompts["talk_prompt"] VERIFY_PROMPT = _prompts["verify_prompt"] # Explicit Ollama client — lets us route to a remote host (e.g., workstation) # without relying on the OLLAMA_HOST env var being set in the launch shell. # # CRITICAL: timeout=300 (5 min). The Python `ollama` library defaults to # httpx's short timeout. On the Jetson a cold-load of qwen2.5vl:3b takes # 60-90 s; with the default timeout the client disconnects mid-load, # Ollama interprets that as "client cancelled", aborts the in-progress # load, and starts over on the next request. This caused the repeated # OOM crashes — the model was never finishing a single load before being # thrown away and re-started. _client = ollama.Client(host=OLLAMA_HOST, timeout=300) # Safe-mode replies used when subsystems.vlm == false _VLM_OFF_TALK = "Scene understanding is disabled — Sanad is in safe mode." 
# Conversation state
_conversation_history = []
_facts = []


def remember_fact(fact: str):
    """Store a fact told by the user for injection into LLaVA context."""
    if fact and fact not in _facts:
        _facts.append(fact)
        print(f" [Memory] Fact stored: {fact}")


def add_to_history(user_msg: str, assistant_msg: str):
    _conversation_history.append({"role": "user", "content": user_msg})
    _conversation_history.append({"role": "assistant", "content": assistant_msg})
    while len(_conversation_history) > MAX_HISTORY:
        _conversation_history.pop(0)


def call_llava(prompt: str, img_b64, num_predict: int = 200,
               use_history: bool = False) -> str:
    """
    Single synchronous VLM call — same mechanism as Marcus_v1's _call_llava.

    With YOLO on CPU (config_Vision.json::yolo_device="cpu"), there is no
    iGPU contention to guard against, so the v1-style plain call is the
    right shape. num_batch and num_ctx are still passed per-request because
    Ollama's compute-graph pre-allocation pays attention to them (default
    batch=512/ctx=4096 would OOM on the Jetson).
    """
    if not VLM_ENABLED:
        return ""  # safe-mode — caller must handle empty string
    messages = []
    if use_history and _conversation_history:
        messages.extend(_conversation_history)
    msg = {"role": "user", "content": prompt}
    if img_b64:
        msg["images"] = [img_b64]
    messages.append(msg)
    r = _client.chat(model=OLLAMA_MODEL, messages=messages,
                     options={
                         "temperature": 0.0,
                         "num_predict": num_predict,
                         "num_batch": NUM_BATCH,
                         "num_ctx": NUM_CTX,
                     })
    return r["message"]["content"].strip()


def parse_json(raw: str):
    """Extract and parse the first JSON object from a string."""
    raw = raw.replace("```json", "").replace("```", "").strip()
    s = raw.find("{")
    e = raw.rfind("}") + 1
    if s == -1 or e == 0:
        return None
    try:
        return json.loads(raw[s:e])
    except json.JSONDecodeError:
        return None
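# Illustrative parse_json behavior (hypothetical model outputs, not captured
# from a real run):
#   parse_json('```json\n{"speak": "hi"}\n```')          -> {"speak": "hi"}
#   parse_json('Sure! {"reached": true} Hope it helps')  -> {"reached": True}
#   parse_json("I see a red chair.")                     -> None (no braces)
# Note the slice raw[s:e] spans the FIRST "{" to the LAST "}", so a reply
# containing two separate JSON objects fails json.loads and returns None;
# callers then fall back to treating the raw text as speech.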
def ask(command: str, img_b64) -> dict:
    """
    Send command + camera frame to the VLM.

    NOTE: this path does NOT use conversation history, even though other
    ask_* paths do. With temperature=0 (required for reliable JSON action
    output), including the last answer in the prompt makes the model lock
    onto repeating it — `what do you see` then always replies with whatever
    it saw the first time, regardless of the current frame. Vision grounding
    has to be stateless per call. Chitchat (ask_talk) keeps history because
    there the whole point is continuity.
    """
    if not VLM_ENABLED:
        return dict(_VLM_OFF_EMPTY)
    try:
        facts_str = ("\nKnown facts: " + "; ".join(_facts) + ".") if _facts else ""
        raw = call_llava(MAIN_PROMPT.format(command=command, facts=facts_str),
                         img_b64, num_predict=_cfg["num_predict_main"],
                         use_history=False)
        print(f" Raw: {raw}")
        d = parse_json(raw)
        speak = d.get("speak", raw) if d else raw
        # Still write to history so ask_talk() has context — just don't
        # READ from it in this path (would cause lock-on repetition).
        add_to_history(command, speak)
        if d is None:
            return {"actions": [], "arm": None, "speak": raw, "abort": None}
        return d
    except Exception as ex:
        print(f" VLM error: {ex}")
        return {"actions": [], "arm": None, "speak": "VLM error.", "abort": None}


def ask_goal(goal: str, img_b64) -> dict:
    """Ask the VLM if the goal is reached."""
    if not VLM_ENABLED:
        return {"reached": False, "next_move": "left", "duration": 0.5,
                "speak": "VLM disabled — relying on YOLO fast-match only."}
    try:
        raw = call_llava(GOAL_PROMPT.format(goal=goal), img_b64,
                         num_predict=_cfg["num_predict_goal"])
        print(f" VLM: {raw}")
        d = parse_json(raw)
        if d is None:
            text = raw.lower()
            reached = any(w in text for w in ["reached", "found", "i can see",
                                              "i see a person", "yes", "arrived"])
            return {"reached": reached, "next_move": "left", "duration": 0.5,
                    "speak": raw[:100]}
        reached = d.get("reached", False)
        if isinstance(reached, str):
            reached = reached.lower() in ("true", "yes", "1")
        d["reached"] = reached
        return d
    except Exception:
        return {"reached": False, "next_move": "left", "duration": 0.5,
                "speak": "Continuing..."}


def ask_talk(command: str, img_b64, facts: str = "") -> dict:
    """
    Handle talk-only commands using the YAML talk_prompt.

    NOTE: use_history is off. Accumulated turns push the prompt past
    qwen2.5vl's 2048-token KV cache; Ollama then truncates and tries to
    RoPE-shift the cache, which triggers an Ollama/ggml bug
    (GGML_ASSERT(a->ne[2] * 4 == b->ne[0])) — the runner SIGABRTs and the
    request fails with status 500. Keeping each call stateless avoids the
    ceiling entirely.
    """
    if not VLM_ENABLED:
        return dict(_VLM_OFF_EMPTY)
    try:
        prompt = TALK_PROMPT.format(command=command, facts=facts)
        raw = call_llava(prompt, img_b64, num_predict=_cfg["num_predict_talk"],
                         use_history=False)
        print(f" Raw: {raw}")
        d = parse_json(raw)
        if d is None:
            return {"actions": [], "arm": None, "speak": raw[:100], "abort": None}
        speak = d.get("speak", "")
        add_to_history(command, speak)
        return d
    except Exception as ex:
        print(f" Talk error: {ex}")
        return {"actions": [], "arm": None, "speak": f"Error: {ex}", "abort": None}


def ask_verify(target: str, condition: str, img_b64) -> str:
    """Verify a condition on a detected target. Returns 'yes' or 'no'."""
    if not VLM_ENABLED:
        # Without VLM we can't verify compound conditions; trust the YOLO match.
        return "yes"
    try:
        prompt = VERIFY_PROMPT.format(target=target, condition=condition)
        raw = call_llava(prompt, img_b64, num_predict=_cfg["num_predict_verify"])
        cleaned = raw.strip().lower().rstrip(".,!?")
        first_word = cleaned.split()[0] if cleaned.split() else "no"
        return first_word if first_word in ("yes", "no") else "no"
    except Exception:
        return "no"


def ask_patrol(img_b64) -> dict:
    """Ask the VLM to assess the scene during patrol."""
    if not VLM_ENABLED:
        return {"observation": "VLM off — patrolling without scene analysis.",
                "alert": None, "next_move": "forward", "duration": 1.0}
    try:
        raw = call_llava(PATROL_PROMPT, img_b64,
                         num_predict=_cfg["num_predict_patrol"])
        d = parse_json(raw)
        return d or {"observation": raw[:80], "alert": None,
                     "next_move": "forward", "duration": 1.0}
    except Exception:
        return {"observation": "Error", "alert": None, "next_move": "stop",
                "duration": 0}
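
if __name__ == "__main__":
    # Minimal manual smoke test — an illustration, not part of the Marcus
    # runtime. Assumes an Ollama server is reachable at OLLAMA_HOST with
    # OLLAMA_MODEL already pulled; with subsystems.vlm = false it just
    # prints the safe-mode fallback dict.
    import base64
    import sys

    # Optionally pass an image path on the command line to exercise the
    # vision path; otherwise the call goes out text-only.
    img = None
    if len(sys.argv) > 1:
        img = base64.b64encode(Path(sys.argv[1]).read_bytes()).decode("ascii")

    print(ask_talk("Introduce yourself in one sentence.", img))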