Update 2026-04-22 13:28:38

2026-04-22 13:28:39 +04:00 · 2026-04-22 13:28:39 +04:00 · dc06864ec3
commit dc06864ec3
parent 9991e742da
12 changed files with 305 additions and 11 deletions
--- a/API/llava_api.py
+++ b/API/llava_api.py
@ -93,6 +93,23 @@ def call_llava(prompt: str, img_b64, num_predict: int = 200, use_history: bool =
    if img_b64:
        msg["images"] = [img_b64]
    messages.append(msg)
+
+    # When an image is attached, pause YOLO to free iGPU memory for the
+    # vision-encoder activations (~1.5 GiB). Without this, concurrent YOLO
+    # inference + Qwen vision-encode exceeds the 15 GiB Jetson iGPU budget
+    # and the llama runner is reaped by the OOM killer (status code: 500).
+    # Text-only calls skip the pause — they fit easily and YOLO stays hot.
+    _paused = False
+    if img_b64:
+        try:
+            from API.yolo_api import yolo_pause, yolo_resume, YOLO_AVAILABLE
+            if YOLO_AVAILABLE:
+                yolo_pause()
+                _paused = True
+        except Exception:
+            pass
+
+    try:
        r = _client.chat(model=OLLAMA_MODEL, messages=messages,
                         options={
                             "temperature": 0.0,
@ -101,6 +118,13 @@ def call_llava(prompt: str, img_b64, num_predict: int = 200, use_history: bool =
                             "num_ctx":     NUM_CTX,
                         })
        return r["message"]["content"].strip()
+    finally:
+        if _paused:
+            try:
+                from API.yolo_api import yolo_resume
+                yolo_resume()
+            except Exception:
+                pass


 def parse_json(raw: str):
@ -117,16 +141,28 @@ def parse_json(raw: str):


 def ask(command: str, img_b64) -> dict:
-    """Send command + camera frame to the VLM with conversation history."""
+    """
+    Send command + camera frame to the VLM.
+
+    NOTE: this path does NOT use conversation history, even though other ask_*
+    paths do. With temperature=0 (required for reliable JSON action output),
+    including the last answer in the prompt makes the model lock onto
+    repeating it — `what do you see` then always replies with whatever it saw
+    the first time, regardless of the current frame. Vision grounding has to
+    be stateless per call. Chitchat (ask_talk) keeps history because there
+    the whole point is continuity.
+    """
    if not VLM_ENABLED:
        return dict(_VLM_OFF_EMPTY)
    try:
        facts_str = ("\nKnown facts: " + "; ".join(_facts) + ".") if _facts else ""
        raw = call_llava(MAIN_PROMPT.format(command=command, facts=facts_str), img_b64,
-                         num_predict=_cfg["num_predict_main"], use_history=True)
+                         num_predict=_cfg["num_predict_main"], use_history=False)
        print(f"  Raw: {raw}")
        d = parse_json(raw)
        speak = d.get("speak", raw) if d else raw
+        # Still write to history so ask_talk() has context — just don't
+        # READ from it in this path (would cause lock-on repetition).
        add_to_history(command, speak)
        if d is None:
            return {"actions": [], "arm": None, "speak": raw, "abort": None}
--- a/API/yolo_api.py
+++ b/API/yolo_api.py
@ -17,6 +17,8 @@ def _stub_ppe():                 return []
 def _stub_too_close(**k):        return False
 def _stub_all():                 return set()
 def _stub_fps():                 return 0.0
+def _stub_pause():               return None
+def _stub_resume():              return None

 yolo_sees            = _stub_sees
 yolo_count           = _stub_count
@ -26,6 +28,8 @@ yolo_ppe_violations  = _stub_ppe
 yolo_person_too_close = _stub_too_close
 yolo_all_classes     = _stub_all
 yolo_fps             = _stub_fps
+yolo_pause           = _stub_pause
+yolo_resume          = _stub_resume


 def init_yolo(raw_frame_ref, frame_lock) -> bool:
@ -33,6 +37,7 @@ def init_yolo(raw_frame_ref, frame_lock) -> bool:
    global YOLO_AVAILABLE
    global yolo_sees, yolo_count, yolo_closest, yolo_summary
    global yolo_ppe_violations, yolo_person_too_close, yolo_all_classes, yolo_fps
+    global yolo_pause, yolo_resume

    # marcus_yolo.py lives in Vision/
    models_dir = os.path.join(PROJECT_ROOT, "Vision")
@ -44,6 +49,7 @@ def init_yolo(raw_frame_ref, frame_lock) -> bool:
            start_yolo, yolo_sees as _ys, yolo_count as _yc, yolo_closest as _ycl,
            yolo_summary as _ysu, yolo_ppe_violations as _ypp,
            yolo_person_too_close as _yptc, yolo_all_classes as _yac, yolo_fps as _yfps,
+            yolo_pause as _ypause, yolo_resume as _yresume,
        )
    except ImportError as e:
        print(f"marcus_yolo.py not found ({e})")
@ -81,5 +87,7 @@ def init_yolo(raw_frame_ref, frame_lock) -> bool:
        yolo_person_too_close = _yptc
        yolo_all_classes     = _yac
        yolo_fps             = _yfps
+        yolo_pause           = _ypause
+        yolo_resume          = _yresume
    print(f"YOLO {'started' if ok else 'failed to start'}")
    return ok
--- a/Config/config_Brain.json
+++ b/Config/config_Brain.json
@ -3,7 +3,7 @@
  "ollama_host":  "http://127.0.0.1:11434",
  "max_history": 6,
  "num_batch": 128,
-  "num_ctx": 2048,
+  "num_ctx": 1024,
  "subsystems": {
    "vlm":        true,
    "lidar":      true,
--- a/Data/Brain/Sessions/session_031_2026-04-22/alerts.json
+++ b/Data/Brain/Sessions/session_031_2026-04-22/alerts.json
@ -0,0 +1 @@
+[]
--- a/Data/Brain/Sessions/session_031_2026-04-22/commands.json
+++ b/Data/Brain/Sessions/session_031_2026-04-22/commands.json
@ -0,0 +1,32 @@
+[
+  {
+    "time": "11:18:06",
+    "cmd": "what do you see",
+    "response": "Error: llama runner process has terminated: %!w(<nil>) (status code: 500)",
+    "duration_s": 0.0
+  },
+  {
+    "time": "11:18:19",
+    "cmd": "hi",
+    "response": "Hello! I am Sanad. How can I help you?",
+    "duration_s": 0.0
+  },
+  {
+    "time": "11:18:49",
+    "cmd": "what do you see",
+    "response": "Error: llama runner process has terminated: %!w(<nil>) (status code: 500)",
+    "duration_s": 0.0
+  },
+  {
+    "time": "11:19:20",
+    "cmd": "turn left 1 step",
+    "response": "local command",
+    "duration_s": 0.0
+  },
+  {
+    "time": "11:20:40",
+    "cmd": "help/",
+    "response": "local command",
+    "duration_s": 0.0
+  }
+]
--- a/Data/Brain/Sessions/session_031_2026-04-22/detections.json
+++ b/Data/Brain/Sessions/session_031_2026-04-22/detections.json
@ -0,0 +1 @@
+[]
--- a/Data/Brain/Sessions/session_031_2026-04-22/places.json
+++ b/Data/Brain/Sessions/session_031_2026-04-22/places.json
@ -0,0 +1 @@
+{}
--- a/Vision/marcus_yolo.py
+++ b/Vision/marcus_yolo.py
@ -92,6 +92,12 @@ PPE_VIOLATION_CLASSES = {"no-helmet", "no_helmet", "no-vest", "no_vest"}
 _detections_lock   = threading.Lock()
 _latest_detections = []     # list of dicts
 _yolo_running      = [False]
+# When True, the inference loop skips model forward passes. Used by the VLM
+# path: Qwen2.5-VL's vision encode needs ~1.5 GiB of iGPU activations and the
+# Jetson's 15 GiB is shared with YOLO + Holosoma, so concurrent inference
+# spikes the runner into OOM. Pausing YOLO for the ~1 s the VLM spends on an
+# image prevents that peak. Model weights stay resident (fast resume).
+_yolo_paused       = [False]
 _yolo_fps          = [0.0]


@ -279,6 +285,30 @@ def yolo_is_running() -> bool:
    return _yolo_running[0]


+def yolo_pause() -> None:
+    """
+    Set the pause flag polled by _inference_loop and best-effort release
+    PyTorch's cached CUDA memory so Ollama's vision encoder can allocate it.
+    Does not wait for an in-flight forward pass; weights stay resident.
+    """
+    _yolo_paused[0] = True
+    try:
+        import torch
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+    except Exception:  # torch missing or CUDA error: the pause flag is already set
+        pass
+
+
+def yolo_resume() -> None:
+    """Clear the pause flag set by yolo_pause() so inference resumes."""
+    _yolo_paused[0] = False
+
+
+def yolo_is_paused() -> bool:
+    return _yolo_paused[0]  # True between yolo_pause() and yolo_resume()
+
+
 def yolo_fps() -> float:
    """Return current YOLO inference FPS."""
    return _yolo_fps[0]
@ -293,6 +323,9 @@ def _inference_loop(model, is_ppe: bool, raw_frame_ref, frame_lock,
    t_fps       = time.time()

    while _yolo_running[0]:
+        if _yolo_paused[0]:
+            time.sleep(0.03)
+            continue
        with frame_lock:
            frame = raw_frame_ref[0]
        if frame is None:
--- a/check_ollama.sh
+++ b/check_ollama.sh
@ -0,0 +1,22 @@
+#!/bin/bash
+echo "=== ollama arch + version ==="  # read-only diagnostics; nothing below changes system state
+file "$(command -v ollama)"; ollama --version; uname -m
+echo
+echo "=== does nvidia-smi work on Jetson? ==="
+if command -v nvidia-smi >/dev/null 2>&1; then nvidia-smi 2>&1 | head -5; else echo "NO nvidia-smi (expected on JetPack 5 — Tegra uses tegrastats)"; fi  # test the binary itself: a pipeline's status is head's, so '||' never fired
+echo
+echo "=== tegrastats (Jetson GPU util) — 2 s sample ==="
+timeout 2 tegrastats 2>&1 | head -2
+echo
+echo "=== Ollama 'inference compute' line — THE answer ==="
+journalctl -u ollama -n 200 --no-pager 2>/dev/null | grep -E "inference compute|vram|library=|starting runner|GPU" | tail -15
+echo
+echo "=== Ollama service env ==="
+systemctl cat ollama 2>/dev/null | grep -E "Environment|ExecStart"
+echo
+echo "=== JetPack / CUDA on this box ==="
+cat /etc/nv_tegra_release 2>/dev/null | head -1
+ls /usr/local/cuda/lib64/libcudart.so* 2>/dev/null | head -3
+echo
+echo "=== does Ollama's own lib dir exist? (stock install) ==="
+ls /usr/lib/ollama/ /usr/local/lib/ollama/ 2>/dev/null
--- a/install_ollama_jetson.sh
+++ b/install_ollama_jetson.sh
@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+# install_ollama_jetson.sh — add Jetson memory flags to the Ollama systemd unit
+#
+# WHY THIS SCRIPT EXISTS:
+# `start_ollama.sh` only applies its OLLAMA_* env vars if you launch Ollama
+# manually — but on the Jetson, Ollama is started by systemd at boot. Systemd
+# ignores the shell script entirely, so none of the flags were ever reaching
+# the live server. This installs a drop-in that systemd merges into the unit,
+# so `systemctl restart ollama` picks up the flags. Confirmed by the log line
+# `inference compute ... library=CUDA ... available="13.8 GiB"` — GPU works,
+# but without OLLAMA_GPU_OVERHEAD Ollama claims all 13.8 GiB and the
+# vision-encode pass OOMs the runner (seen as "llama runner process has
+# terminated ... status code: 500" when an image is attached).
+#
+# Run once, from the Jetson:
+#   sudo ./install_ollama_jetson.sh
+#
+set -euo pipefail
+
+if [[ $EUID -ne 0 ]]; then
+    echo "This script must be run as root (it writes to /etc/systemd/system)." >&2
+    echo "Re-run with: sudo $0" >&2
+    exit 1
+fi
+
+DROPIN_DIR="/etc/systemd/system/ollama.service.d"
+DROPIN_FILE="$DROPIN_DIR/marcus-jetson.conf"
+
+mkdir -p "$DROPIN_DIR"
+cat > "$DROPIN_FILE" <<'EOF'
+# Jetson memory caps for Marcus — do not edit live, re-run install_ollama_jetson.sh
+[Service]
+# Flash attention: ~30% less memory for attention tensors.
+Environment="OLLAMA_FLASH_ATTENTION=1"
+# Quantize KV cache to 8-bit (halves KV memory; negligible quality loss).
+Environment="OLLAMA_KV_CACHE_TYPE=q8_0"
+# Never hold two VL models simultaneously.
+Environment="OLLAMA_MAX_LOADED_MODELS=1"
+# CRITICAL: keep the model resident essentially forever. The previous 2m value
+# meant that any pause in conversation longer than 2 min unloaded the model,
+# and the NEXT "what do you see" paid another 60-90 s cold-load. That
+# cold-load hammered unified memory + disk bandwidth hard enough to break
+# Holosoma's 20 ms real-time deadline — inference times climbed from 2 ms to
+# 153 ms and the robot fell. 24h means one cold-load per day (during
+# warmup_vlm.sh while the robot is in squat), everything after is warm.
+Environment="OLLAMA_KEEP_ALIVE=24h"
+# Reserve 3 GiB of the 15 GiB iGPU for the rest of the system (YOLO +
+# Holosoma + camera + Python heap). Raised from 2 GiB after observing
+# Holosoma starvation during image requests — 2 GiB was enough for memory
+# but not for memory-bandwidth headroom.
+Environment="OLLAMA_GPU_OVERHEAD=3221225472"
+# Deprioritize Ollama so it never preempts Holosoma's real-time locomotion
+# thread. IOSchedulingClass=idle = Ollama only gets disk bandwidth when
+# nobody else wants it. Nice=10 = reduced CPU priority (0 default, 19 lowest).
+IOSchedulingClass=idle
+Nice=10
+EOF
+
+chmod 644 "$DROPIN_FILE"
+echo "Wrote $DROPIN_FILE"
+echo
+
+systemctl daemon-reload
+systemctl restart ollama
+
+sleep 2
+echo "=== verification: these vars should now be in the live process ==="
+journalctl -u ollama -n 40 --no-pager | grep -E "OLLAMA_FLASH_ATTENTION|OLLAMA_KV_CACHE_TYPE|OLLAMA_GPU_OVERHEAD|inference compute|vram" | tail -10 || true  # display only: a no-match grep must not abort under 'set -euo pipefail'
+echo
+echo "If the 'inference compute' line shows library=CUDA and your flags appear"
+echo "in the 'server config' dump above, you're done. Try 'what do you see' again."
--- a/start_ollama.sh
+++ b/start_ollama.sh
@ -25,6 +25,13 @@ export OLLAMA_FLASH_ATTENTION=1
 export OLLAMA_KV_CACHE_TYPE=q8_0
 export OLLAMA_KEEP_ALIVE=2m
 export OLLAMA_MAX_LOADED_MODELS=1
+# Reserve 2 GiB of the Jetson's 15 GiB iGPU for the rest of the system
+# (YOLO in CUDA FP16, Holosoma, camera, Python heap). Without this, Ollama
+# assumes the full 13.8 GiB "available" is its to use and sizes its compute
+# graph that way — which works for text, but the vision-encode pass of
+# Qwen2.5-VL then pushes total allocation past physical memory and the
+# runner dies with status 500.
+export OLLAMA_GPU_OVERHEAD=2147483648

 if [[ "$1" == "--fg" ]]; then
    echo "Running ollama in foreground..."
--- a/warmup_vlm.sh
+++ b/warmup_vlm.sh
@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+# warmup_vlm.sh — pre-load Qwen2.5-VL into iGPU BEFORE the robot stands up.
+#
+# WHY THIS EXISTS:
+# Cold-loading a 3 GB vision model on the Jetson's 16 GB unified memory
+# hammers disk + memory bandwidth hard enough to blow Holosoma's 20 ms
+# real-time deadline. If that happens while the robot is standing, Holosoma
+# loses balance control and the robot falls (observed 2026-04-22 —
+# RL FPS crashed from 56 → 10 during cold-load, OOM killer reaped Holosoma).
+#
+# SAFE OPERATING PROCEDURE (do this once per boot):
+#   1. Boot robot. Keep it in squat / damping mode (NOT standing).
+#   2. Run: ./warmup_vlm.sh
+#   3. Wait for "Warmup complete" (~60-90 s).
+#   4. NOW raise the robot to standing.
+#   5. Run: python3 run_marcus.py
+#
+# After warmup, OLLAMA_KEEP_ALIVE=24h keeps the model hot for the rest of the
+# day, so every "what do you see" is fast (~2-5 s) and Holosoma stays
+# healthy. Never cold-load a vision model while the robot is standing.
+
+set -e
+
+MODEL="${1:-qwen2.5vl:3b}"
+HOST="http://127.0.0.1:11434"
+
+cat <<'BANNER'
+════════════════════════════════════════════════════════════════════
+  VLM WARMUP — put the robot in SQUAT / DAMPING mode first!
+════════════════════════════════════════════════════════════════════
+
+  The next 60-90 s will hammer disk + memory bandwidth. Holosoma
+  cannot balance through this. If the robot is standing, IT WILL FALL.
+
+  Press ENTER when the robot is safely in squat / damping.
+  Press Ctrl-C to abort.
+
+BANNER
+read -r
+
+# Sanity: is ollama reachable?
+if ! curl -sf "$HOST/api/version" > /dev/null; then
+    echo "✗ Ollama is not running on $HOST"
+    echo "  Start it:  sudo systemctl start ollama"
+    exit 1
+fi
+
+# Sanity: is the model in the store? (-F: match the tag literally; '.' is not a wildcard)
+if ! ollama list | awk 'NR>1 {print $1}' | grep -qxF -- "$MODEL"; then
+    echo "✗ Model '$MODEL' not found in Ollama store"
+    echo "  Pull it:   ollama pull $MODEL"
+    exit 1
+fi
+
+echo "→ Cold-loading $MODEL into iGPU (60-90 s, do not interrupt)..."
+echo "  Watching tegrastats in a second window is useful — GPU should spike."
+echo
+
+START=$(date +%s)
+
+# Send a tiny text-only request. Ollama loads the model on first request;
+# we don't need vision here — just getting weights resident is what takes the
+# long time. A text-only warmup also avoids needing a camera frame.
+#
+# "keep_alive":"24h" in the request body matches the systemd config and
+# prevents the server from unloading the model after the default 5 min.
+RESPONSE=$(curl -s "$HOST/api/generate" \
+    -H 'Content-Type: application/json' \
+    -d "{\"model\":\"$MODEL\",\"prompt\":\"ok\",\"stream\":false,\"keep_alive\":\"24h\",\"options\":{\"num_predict\":1,\"num_batch\":128,\"num_ctx\":1024}}") || true  # under 'set -e' a curl failure would exit here, before the failure report below
+
+END=$(date +%s)
+ELAPSED=$((END - START))
+
+if echo "$RESPONSE" | grep -q '"response"'; then
+    echo "✓ Warmup complete in ${ELAPSED}s — model is resident for 24h"
+    echo "  You can now stand the robot and run: python3 run_marcus.py"
+else
+    echo "✗ Warmup failed after ${ELAPSED}s"
+    echo "  Response: $RESPONSE"
+    echo "  Check:    journalctl -u ollama --since '2 minutes ago' --no-pager | tail -40"
+    exit 1
+fi