diff --git a/API/llava_api.py b/API/llava_api.py
index f431a65..22cd904 100644
--- a/API/llava_api.py
+++ b/API/llava_api.py
@@ -1,5 +1,24 @@
 """
-llava_api.py — LLaVA / Qwen VL query interface
+llava_api.py — Qwen-VL query interface (via Ollama)
+
+Three deployment modes, chosen via config_Brain.json:
+
+  1. subsystems.vlm = false
+     → every ask*() returns a safe fallback dict. Marcus runs in
+       regex-only "safe mode": no LLM load on the Jetson, no GPU/CPU
+       contention with Holosoma, and the robot won't fall from thrashing.
+       Vision questions just answer "Scene understanding is disabled
+       — Sanad is in safe mode." Everything else (movement, places,
+       patrol, autonomous) still works.
+
+  2. ollama_host = "http://127.0.0.1:11434"
+     subsystems.vlm = true
+     → Ollama runs on the Jetson. Old behavior — competes with
+       Holosoma for memory. Unsafe during walking with a 3B VL model.
+
+  3. ollama_host = "http://192.168.123.222:11434"
+     subsystems.vlm = true
+     → Ollama runs on the workstation. Jetson stays light, Holosoma
+       keeps its 50 Hz real-time deadline, and the brain still gets
+       full Qwen-VL. Best mode for demos / walking with conversation.
 """
 import json
 import ollama
@@ -9,12 +28,13 @@ from Core.config_loader import load_config
 
 _cfg = load_config("Brain")
-# Load prompts from YAML (the authoritative source — bilingual, complete)
 _yaml_path = Path(__file__).resolve().parent.parent / "Config" / "marcus_prompts.yaml"
 with open(_yaml_path, encoding="utf-8") as _f:
     _prompts = yaml.safe_load(_f)
 
 OLLAMA_MODEL = _cfg["ollama_model"]
+OLLAMA_HOST = _cfg.get("ollama_host", "http://127.0.0.1:11434")
+VLM_ENABLED = bool(_cfg.get("subsystems", {}).get("vlm", True))
 MAX_HISTORY = _cfg["max_history"]
 # Cap batch and context on every request. Without this, llama.cpp on Jetson
 # Orin NX allocates a ~7.5 GiB compute graph (defaults: batch 512, ctx 4096)
@@ -28,6 +48,14 @@
 PATROL_PROMPT = _prompts["patrol_prompt"]
 TALK_PROMPT = _prompts["talk_prompt"]
 VERIFY_PROMPT = _prompts["verify_prompt"]
 
+# Explicit Ollama client — lets us route to a remote host (e.g., workstation)
+# without relying on the OLLAMA_HOST env var being set in the launch shell.
+_client = ollama.Client(host=OLLAMA_HOST)
+
+# Safe-mode replies used when subsystems.vlm == false
+_VLM_OFF_TALK = "Scene understanding is disabled — Sanad is in safe mode."
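+# _VLM_OFF_EMPTY mirrors the action-dict contract that ask()/ask_talk()
+# return on success ("actions"/"arm"/"speak"/"abort"), so callers can hand
+# the safe-mode reply to the normal execution path; callers copy it with
+# dict(...) so a mutated result never leaks back into this template.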
+_VLM_OFF_EMPTY = {"actions": [], "arm": None, "speak": _VLM_OFF_TALK, "abort": None}
+
 # Conversation state
 _conversation_history = []
 _facts = []
@@ -48,6 +76,8 @@ def add_to_history(user_msg: str, assistant_msg: str):
 
 
 def call_llava(prompt: str, img_b64, num_predict: int = 200, use_history: bool = False) -> str:
+    if not VLM_ENABLED:
+        return ""  # safe-mode — caller must handle empty string
     messages = []
     if use_history and _conversation_history:
         messages.extend(_conversation_history)
@@ -55,13 +85,13 @@
     if img_b64:
         msg["images"] = [img_b64]
     messages.append(msg)
-    r = ollama.chat(model=OLLAMA_MODEL, messages=messages,
-                    options={
-                        "temperature": 0.0,
-                        "num_predict": num_predict,
-                        "num_batch": NUM_BATCH,
-                        "num_ctx": NUM_CTX,
-                    })
+    r = _client.chat(model=OLLAMA_MODEL, messages=messages,
+                     options={
+                         "temperature": 0.0,
+                         "num_predict": num_predict,
+                         "num_batch": NUM_BATCH,
+                         "num_ctx": NUM_CTX,
+                     })
     return r["message"]["content"].strip()
@@ -79,7 +109,9 @@
 
 
 def ask(command: str, img_b64) -> dict:
-    """Send command + camera frame to LLaVA with conversation history."""
+    """Send command + camera frame to the VLM with conversation history."""
+    if not VLM_ENABLED:
+        return dict(_VLM_OFF_EMPTY)
     try:
         facts_str = ("\nKnown facts: " + "; ".join(_facts) + ".") if _facts else ""
         raw = call_llava(MAIN_PROMPT.format(command=command, facts=facts_str), img_b64,
@@ -92,16 +124,19 @@
             return {"actions": [], "arm": None, "speak": raw, "abort": None}
         return d
     except Exception as ex:
-        print(f" LLaVA error: {ex}")
-        return {"actions": [], "arm": None, "speak": "Error.", "abort": None}
+        print(f" VLM error: {ex}")
+        return {"actions": [], "arm": None, "speak": "VLM error.", "abort": None}
 
 
 def ask_goal(goal: str, img_b64) -> dict:
-    """Ask LLaVA if goal is reached."""
+    """Ask the VLM if the goal is reached."""
+    if not VLM_ENABLED:
+        return {"reached": False, "next_move": "left", "duration": 0.5,
+                "speak": "VLM disabled — relying on YOLO fast-match only."}
     try:
         raw = call_llava(GOAL_PROMPT.format(goal=goal), img_b64,
                          num_predict=_cfg["num_predict_goal"])
-        print(f" LLaVA: {raw}")
+        print(f" VLM: {raw}")
         d = parse_json(raw)
         if d is None:
             text = raw.lower()
@@ -119,6 +154,8 @@
 
 def ask_talk(command: str, img_b64, facts: str = "") -> dict:
     """Handle talk-only commands using the YAML talk_prompt."""
+    if not VLM_ENABLED:
+        return dict(_VLM_OFF_EMPTY)
     try:
         prompt = TALK_PROMPT.format(command=command, facts=facts)
         raw = call_llava(prompt, img_b64, num_predict=_cfg["num_predict_talk"],
@@ -137,6 +174,9 @@
 
 def ask_verify(target: str, condition: str, img_b64) -> str:
     """Verify a condition on a detected target. Returns 'yes' or 'no'."""
+    if not VLM_ENABLED:
+        # Without VLM we can't verify compound conditions; trust the YOLO match.
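+        # Returning "no" would be the conservative alternative, but it would
+        # make every compound command fail permanently while the VLM is off.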
+ return "yes" try: prompt = VERIFY_PROMPT.format(target=target, condition=condition) raw = call_llava(prompt, img_b64, num_predict=_cfg["num_predict_verify"]) @@ -148,7 +188,10 @@ def ask_verify(target: str, condition: str, img_b64) -> str: def ask_patrol(img_b64) -> dict: - """Ask LLaVA to assess scene during patrol.""" + """Ask the VLM to assess the scene during patrol.""" + if not VLM_ENABLED: + return {"observation": "VLM off — patrolling without scene analysis.", + "alert": None, "next_move": "forward", "duration": 1.0} try: raw = call_llava(PATROL_PROMPT, img_b64, num_predict=_cfg["num_predict_patrol"]) d = parse_json(raw) diff --git a/Brain/marcus_brain.py b/Brain/marcus_brain.py index 9c47731..bb02a24 100644 --- a/Brain/marcus_brain.py +++ b/Brain/marcus_brain.py @@ -147,11 +147,19 @@ def init_brain(): _log("Brain initialized", "info", "brain") - # Warmup runs in a daemon thread so the dashboard + Command: prompt - # appear immediately. The first real user command will either hit a - # warm model (fast) or pay the cold-load itself (same as before). - import threading as _t - _t.Thread(target=_warmup_llava, daemon=True, name="llava-warmup").start() + # Skip warmup when VLM is off — there's no model to warm, and the + # dashboard should mention that Marcus is in safe mode. + from API.llava_api import VLM_ENABLED, OLLAMA_HOST + if not VLM_ENABLED: + print(" [VLM] disabled by config — safe mode (no Ollama load)") + else: + host_short = OLLAMA_HOST.replace("http://", "") + print(f" [VLM] target: {host_short} ({OLLAMA_MODEL})") + # Warmup runs in a daemon thread so the dashboard + Command: prompt + # appear immediately. The first real user command will either hit a + # warm model (fast) or pay the cold-load itself (same as before). + import threading as _t + _t.Thread(target=_warmup_llava, daemon=True, name="llava-warmup").start() # Global voice references @@ -540,7 +548,9 @@ def run_terminal(): print("\n\n" + "╔" + "═" * (W-2) + "╗") print("║" + _pad(" SANAD — AI BRAIN READY", W-2) + "║") print("╠" + "═" * (W-2) + "╣") + from API.llava_api import VLM_ENABLED left = [("model", status["model"]), + ("vlm", _fmt(VLM_ENABLED)), ("voice", _fmt(status["voice"])), ("camera", status["camera"])] right = [("yolo", _fmt(status["yolo"])), diff --git a/Config/config_Brain.json b/Config/config_Brain.json index de9ed36..cf85c53 100644 --- a/Config/config_Brain.json +++ b/Config/config_Brain.json @@ -1,12 +1,14 @@ { "ollama_model": "qwen2.5vl:3b", + "ollama_host": "http://127.0.0.1:11434", "max_history": 6, "num_batch": 128, "num_ctx": 2048, "subsystems": { - "lidar": true, - "voice": true, - "imgsearch": false, + "vlm": true, + "lidar": true, + "voice": true, + "imgsearch": false, "autonomous": true }, "num_predict_main": 120, diff --git a/Config/config_Voice.json b/Config/config_Voice.json index 99e6ac3..91635fb 100644 --- a/Config/config_Voice.json +++ b/Config/config_Voice.json @@ -1,7 +1,7 @@ { "tts": { "backend": "builtin_ttsmaker", - "builtin_speaker_id": 0, + "builtin_speaker_id": 2, "target_sample_rate": 16000 }, "stt": { diff --git a/Data/Brain/Sessions/session_029_2026-04-22/alerts.json b/Data/Brain/Sessions/session_029_2026-04-22/alerts.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_029_2026-04-22/alerts.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_029_2026-04-22/commands.json b/Data/Brain/Sessions/session_029_2026-04-22/commands.json new file mode 100644 index 0000000..5185d44 --- /dev/null +++ 
@@ -0,0 +1,8 @@
+[
+  {
+    "time": "10:54:15",
+    "cmd": "hi",
+    "response": "Hello! I am Sanad. How can I help you?",
+    "duration_s": 0.0
+  }
+]
\ No newline at end of file
diff --git a/Data/Brain/Sessions/session_029_2026-04-22/detections.json b/Data/Brain/Sessions/session_029_2026-04-22/detections.json
new file mode 100644
index 0000000..0637a08
--- /dev/null
+++ b/Data/Brain/Sessions/session_029_2026-04-22/detections.json
@@ -0,0 +1 @@
+[]
\ No newline at end of file
diff --git a/Data/Brain/Sessions/session_029_2026-04-22/places.json b/Data/Brain/Sessions/session_029_2026-04-22/places.json
new file mode 100644
index 0000000..9e26dfe
--- /dev/null
+++ b/Data/Brain/Sessions/session_029_2026-04-22/places.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/Data/Brain/Sessions/session_030_2026-04-22/alerts.json b/Data/Brain/Sessions/session_030_2026-04-22/alerts.json
new file mode 100644
index 0000000..0637a08
--- /dev/null
+++ b/Data/Brain/Sessions/session_030_2026-04-22/alerts.json
@@ -0,0 +1 @@
+[]
\ No newline at end of file
diff --git a/Data/Brain/Sessions/session_030_2026-04-22/commands.json b/Data/Brain/Sessions/session_030_2026-04-22/commands.json
new file mode 100644
index 0000000..0637a08
--- /dev/null
+++ b/Data/Brain/Sessions/session_030_2026-04-22/commands.json
@@ -0,0 +1 @@
+[]
\ No newline at end of file
diff --git a/Data/Brain/Sessions/session_030_2026-04-22/detections.json b/Data/Brain/Sessions/session_030_2026-04-22/detections.json
new file mode 100644
index 0000000..0637a08
--- /dev/null
+++ b/Data/Brain/Sessions/session_030_2026-04-22/detections.json
@@ -0,0 +1 @@
+[]
\ No newline at end of file
diff --git a/Data/Brain/Sessions/session_030_2026-04-22/places.json b/Data/Brain/Sessions/session_030_2026-04-22/places.json
new file mode 100644
index 0000000..9e26dfe
--- /dev/null
+++ b/Data/Brain/Sessions/session_030_2026-04-22/places.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/Data/Brain/Sessions/session_030_2026-04-22/summary.txt b/Data/Brain/Sessions/session_030_2026-04-22/summary.txt
new file mode 100644
index 0000000..8b9af93
--- /dev/null
+++ b/Data/Brain/Sessions/session_030_2026-04-22/summary.txt
@@ -0,0 +1,9 @@
+Session: session_030_2026-04-22
+Date: 2026-04-22 10:56
+Duration: 0m 37s
+Commands: 0
+YOLO detections: 0
+Alerts: 0
+Known places: none
+
+First commands:
\ No newline at end of file
diff --git a/Vision/marcus_imgsearch.py b/Vision/marcus_imgsearch.py
index 15aea51..5530f63 100644
--- a/Vision/marcus_imgsearch.py
+++ b/Vision/marcus_imgsearch.py
@@ -349,12 +349,14 @@
         try:
             if has_ref:
-                # Pass BOTH images: [reference, current_frame]
-                # num_batch/num_ctx mirror llava_api.py — without these
-                # caps the compute graph OOMs the runner on Jetson.
-                import ollama as _ollama
-                from API.llava_api import NUM_BATCH, NUM_CTX
-                r = _ollama.chat(
+                # Pass BOTH images: [reference, current_frame]. Route through
+                # the shared Ollama client (so VLM-off and remote-host config
+                # are honored) and mirror the compute-graph caps.
+                from API.llava_api import NUM_BATCH, NUM_CTX, VLM_ENABLED, _client as _llava_client
+                if not VLM_ENABLED:
+                    print(f" [{step}/{max_steps}] VLM disabled — skipping image-match")
+                    continue
+                r = _llava_client.chat(
                     model=self._model,
                     messages=[{
                         "role": "user",