Update 2026-04-22 13:28:38
parent 9991e742da
commit dc06864ec3
@@ -93,14 +93,38 @@ def call_llava(prompt: str, img_b64, num_predict: int = 200, use_history: bool =
     if img_b64:
         msg["images"] = [img_b64]
     messages.append(msg)
-    r = _client.chat(model=OLLAMA_MODEL, messages=messages,
-                     options={
-                         "temperature": 0.0,
-                         "num_predict": num_predict,
-                         "num_batch": NUM_BATCH,
-                         "num_ctx": NUM_CTX,
-                     })
-    return r["message"]["content"].strip()
+
+    # When an image is attached, pause YOLO to free iGPU memory for the
+    # vision-encoder activations (~1.5 GiB). Without this, concurrent YOLO
+    # inference + Qwen vision-encode exceeds the 15 GiB Jetson iGPU budget
+    # and the llama runner is reaped by the OOM killer (status code: 500).
+    # Text-only calls skip the pause — they fit easily and YOLO stays hot.
+    _paused = False
+    if img_b64:
+        try:
+            from API.yolo_api import yolo_pause, yolo_resume, YOLO_AVAILABLE
+            if YOLO_AVAILABLE:
+                yolo_pause()
+                _paused = True
+        except Exception:
+            pass
+
+    try:
+        r = _client.chat(model=OLLAMA_MODEL, messages=messages,
+                         options={
+                             "temperature": 0.0,
+                             "num_predict": num_predict,
+                             "num_batch": NUM_BATCH,
+                             "num_ctx": NUM_CTX,
+                         })
+        return r["message"]["content"].strip()
+    finally:
+        if _paused:
+            try:
+                from API.yolo_api import yolo_resume
+                yolo_resume()
+            except Exception:
+                pass
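The pause wrapper above frees the iGPU only for image calls and uses try/finally so YOLO always comes back, even when the chat call raises. The same guard factors naturally into a context manager; a sketch of that refactor (hypothetical helper, not part of this commit):

    from contextlib import contextmanager

    @contextmanager
    def yolo_paused_if(img_b64):
        """Pause YOLO only for image calls; always resume, even on error."""
        paused = False
        if img_b64:
            try:
                from API.yolo_api import yolo_pause, YOLO_AVAILABLE
                if YOLO_AVAILABLE:
                    yolo_pause()
                    paused = True
            except Exception:
                pass  # YOLO subsystem absent: nothing to pause
        try:
            yield
        finally:
            if paused:
                try:
                    from API.yolo_api import yolo_resume
                    yolo_resume()
                except Exception:
                    pass

call_llava would then wrap its chat call in a single `with yolo_paused_if(img_b64):` block.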
@@ -117,16 +141,28 @@ def parse_json(raw: str):


 def ask(command: str, img_b64) -> dict:
-    """Send command + camera frame to the VLM with conversation history."""
+    """
+    Send command + camera frame to the VLM.
+
+    NOTE: this path does NOT use conversation history, even though other ask_*
+    paths do. With temperature=0 (required for reliable JSON action output),
+    including the last answer in the prompt makes the model lock onto
+    repeating it — `what do you see` then always replies with whatever it saw
+    the first time, regardless of the current frame. Vision grounding has to
+    be stateless per call. Chitchat (ask_talk) keeps history because there
+    the whole point is continuity.
+    """
     if not VLM_ENABLED:
         return dict(_VLM_OFF_EMPTY)
     try:
         facts_str = ("\nKnown facts: " + "; ".join(_facts) + ".") if _facts else ""
         raw = call_llava(MAIN_PROMPT.format(command=command, facts=facts_str), img_b64,
-                         num_predict=_cfg["num_predict_main"], use_history=True)
+                         num_predict=_cfg["num_predict_main"], use_history=False)
         print(f" Raw: {raw}")
         d = parse_json(raw)
         speak = d.get("speak", raw) if d else raw
+        # Still write to history so ask_talk() has context — just don't
+        # READ from it in this path (would cause lock-on repetition).
         add_to_history(command, speak)
         if d is None:
             return {"actions": [], "arm": None, "speak": raw, "abort": None}
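call_llava's body is not fully shown in this diff, but the use_history flag presumably gates whether prior turns are spliced into the message list before the current user message. A sketch of that gating, with SYSTEM_PROMPT and _history as assumed names:

    def _build_messages(prompt: str, img_b64, use_history: bool) -> list:
        messages = [{"role": "system", "content": SYSTEM_PROMPT}]
        if use_history:
            messages.extend(_history)      # prior user/assistant turns
        msg = {"role": "user", "content": prompt}
        if img_b64:
            msg["images"] = [img_b64]      # Ollama chat API image field
        messages.append(msg)
        return messages

With use_history=False the model sees only the system prompt and the current frame, which is exactly the statelessness the docstring demands.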
@@ -17,6 +17,8 @@ def _stub_ppe(): return []
 def _stub_too_close(**k): return False
 def _stub_all(): return set()
 def _stub_fps(): return 0.0
+def _stub_pause(): return None
+def _stub_resume(): return None

 yolo_sees = _stub_sees
 yolo_count = _stub_count
@@ -26,6 +28,8 @@ yolo_ppe_violations = _stub_ppe
 yolo_person_too_close = _stub_too_close
 yolo_all_classes = _stub_all
 yolo_fps = _stub_fps
+yolo_pause = _stub_pause
+yolo_resume = _stub_resume


 def init_yolo(raw_frame_ref, frame_lock) -> bool:
@@ -33,6 +37,7 @@ def init_yolo(raw_frame_ref, frame_lock) -> bool:
     global YOLO_AVAILABLE
     global yolo_sees, yolo_count, yolo_closest, yolo_summary
     global yolo_ppe_violations, yolo_person_too_close, yolo_all_classes, yolo_fps
+    global yolo_pause, yolo_resume

     # marcus_yolo.py lives in Vision/
     models_dir = os.path.join(PROJECT_ROOT, "Vision")
@@ -44,6 +49,7 @@ def init_yolo(raw_frame_ref, frame_lock) -> bool:
         start_yolo, yolo_sees as _ys, yolo_count as _yc, yolo_closest as _ycl,
         yolo_summary as _ysu, yolo_ppe_violations as _ypp,
         yolo_person_too_close as _yptc, yolo_all_classes as _yac, yolo_fps as _yfps,
+        yolo_pause as _ypause, yolo_resume as _yresume,
     )
 except ImportError as e:
     print(f"marcus_yolo.py not found ({e})")
@@ -81,5 +87,7 @@ def init_yolo(raw_frame_ref, frame_lock) -> bool:
     yolo_person_too_close = _yptc
     yolo_all_classes = _yac
     yolo_fps = _yfps
+    yolo_pause = _ypause
+    yolo_resume = _yresume
     print(f"YOLO {'started' if ok else 'failed to start'}")
     return ok
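This stub-then-rebind pattern is also why call_llava imports yolo_pause inside the function body: a module-level `from API.yolo_api import yolo_pause` at load time would capture the stub forever, while a late import sees whatever init_yolo() has rebound by the time an image request arrives. A standalone illustration (demo names, not repo code):

    import types

    yolo_api = types.ModuleType("yolo_api")
    yolo_api.yolo_pause = lambda: "stub: no-op"         # default before init

    def on_image_request():
        # Late lookup: sees the *current* binding, not the import-time one.
        return yolo_api.yolo_pause()

    print(on_image_request())                           # stub: no-op
    yolo_api.yolo_pause = lambda: "real: pausing YOLO"  # init_yolo-style rebind
    print(on_image_request())                           # real: pausing YOLO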
@@ -3,7 +3,7 @@
     "ollama_host": "http://127.0.0.1:11434",
     "max_history": 6,
     "num_batch": 128,
-    "num_ctx": 2048,
+    "num_ctx": 1024,
     "subsystems": {
         "vlm": true,
         "lidar": true,
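Halving num_ctx halves the KV cache each loaded context pins in unified memory. A back-of-envelope estimate only, since the model's dimensions are not part of this diff; the layer and head counts below are placeholders, not qwen2.5vl's real ones:

    # KV cache ≈ 2 (K and V) × layers × kv_heads × head_dim × ctx × bytes/elt.
    # q8_0 KV (set via OLLAMA_KV_CACHE_TYPE) is roughly 1 byte per element.
    n_layers, n_kv_heads, head_dim = 36, 4, 128   # placeholder dims
    for n_ctx in (2048, 1024):
        kv_bytes = 2 * n_layers * n_kv_heads * head_dim * n_ctx * 1
        print(f"num_ctx={n_ctx}: ~{kv_bytes / 2**20:.0f} MiB")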
Data/Brain/Sessions/session_031_2026-04-22/alerts.json (new file, 1 line)
@@ -0,0 +1 @@
+[]
Data/Brain/Sessions/session_031_2026-04-22/commands.json (new file, 32 lines)
@@ -0,0 +1,32 @@
+[
+  {
+    "time": "11:18:06",
+    "cmd": "what do you see",
+    "response": "Error: llama runner process has terminated: %!w(<nil>) (status code: 500)",
+    "duration_s": 0.0
+  },
+  {
+    "time": "11:18:19",
+    "cmd": "hi",
+    "response": "Hello! I am Sanad. How can I help you?",
+    "duration_s": 0.0
+  },
+  {
+    "time": "11:18:49",
+    "cmd": "what do you see",
+    "response": "Error: llama runner process has terminated: %!w(<nil>) (status code: 500)",
+    "duration_s": 0.0
+  },
+  {
+    "time": "11:19:20",
+    "cmd": "turn left 1 step",
+    "response": "local command",
+    "duration_s": 0.0
+  },
+  {
+    "time": "11:20:40",
+    "cmd": "help/",
+    "response": "local command",
+    "duration_s": 0.0
+  }
+]
@@ -0,0 +1 @@
+[]
Data/Brain/Sessions/session_031_2026-04-22/places.json (new file, 1 line)
@@ -0,0 +1 @@
+{}
@@ -92,6 +92,12 @@ PPE_VIOLATION_CLASSES = {"no-helmet", "no_helmet", "no-vest", "no_vest"}
 _detections_lock = threading.Lock()
 _latest_detections = []  # list of dicts
 _yolo_running = [False]
+# When True, the inference loop skips model forward passes. Used by the VLM
+# path: Qwen2.5-VL's vision encode needs ~1.5 GiB of iGPU activations and the
+# Jetson's 15 GiB is shared with YOLO + Holosoma, so concurrent inference
+# spikes the runner into OOM. Pausing YOLO for the ~1 s the VLM spends on an
+# image prevents that peak. Model weights stay resident (fast resume).
+_yolo_paused = [False]
 _yolo_fps = [0.0]


@@ -279,6 +285,30 @@ def yolo_is_running() -> bool:
     return _yolo_running[0]


+def yolo_pause() -> None:
+    """
+    Stop YOLO forward passes and release PyTorch's CUDA cache back to the
+    driver so Ollama's vision encoder has contiguous iGPU memory to allocate
+    into. Weights stay resident, so resume is instant.
+    """
+    _yolo_paused[0] = True
+    try:
+        import torch
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+    except Exception:
+        pass
+
+
+def yolo_resume() -> None:
+    """Resume YOLO inference after a pause()."""
+    _yolo_paused[0] = False
+
+
+def yolo_is_paused() -> bool:
+    return _yolo_paused[0]
+
+
 def yolo_fps() -> float:
     """Return current YOLO inference FPS."""
     return _yolo_fps[0]
@@ -293,6 +323,9 @@ def _inference_loop(model, is_ppe: bool, raw_frame_ref, frame_lock,
     t_fps = time.time()

     while _yolo_running[0]:
+        if _yolo_paused[0]:
+            time.sleep(0.03)
+            continue
         with frame_lock:
             frame = raw_frame_ref[0]
         if frame is None:
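Note that yolo_pause() returns as soon as the flag is set, so a forward pass already past the check can still complete; the 30 ms poll keeps that window short and the VLM path tolerates it. If a hard handoff were ever needed, a blocking variant inside marcus_yolo.py (hypothetical, not in this commit) could confirm the loop has parked:

    import threading

    _yolo_idle = threading.Event()   # the loop would call .set() in its paused branch

    def yolo_pause_blocking(timeout: float = 1.0) -> bool:
        """Set the pause flag, then wait for the loop to confirm it parked.
        Returns False if the loop did not park within `timeout` seconds."""
        _yolo_paused[0] = True       # same flag the real yolo_pause() sets
        _yolo_idle.clear()
        return _yolo_idle.wait(timeout)

The inference loop would pair this with `_yolo_idle.set()` each paused iteration and `_yolo_idle.clear()` when it resumes work.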
check_ollama.sh (new executable file, 22 lines)
@@ -0,0 +1,22 @@
+#!/bin/bash
+echo "=== ollama arch + version ==="
+file $(which ollama); ollama --version; uname -m
+echo
+echo "=== does nvidia-smi work on Jetson? ==="
+nvidia-smi 2>&1 | head -5 || echo "NO nvidia-smi (expected on JetPack 5 — Tegra uses tegrastats)"
+echo
+echo "=== tegrastats (Jetson GPU util) — 2 s sample ==="
+timeout 2 tegrastats 2>&1 | head -2
+echo
+echo "=== Ollama 'inference compute' line — THE answer ==="
+journalctl -u ollama -n 200 --no-pager 2>/dev/null | grep -E "inference compute|vram|library=|starting runner|GPU" | tail -15
+echo
+echo "=== Ollama service env ==="
+systemctl cat ollama 2>/dev/null | grep -E "Environment|ExecStart"
+echo
+echo "=== JetPack / CUDA on this box ==="
+cat /etc/nv_tegra_release 2>/dev/null | head -1
+ls /usr/local/cuda/lib64/libcudart.so* 2>/dev/null | head -3
+echo
+echo "=== does Ollama's own lib dir exist? (stock install) ==="
+ls /usr/lib/ollama/ /usr/local/lib/ollama/ 2>/dev/null
install_ollama_jetson.sh (new executable file, 71 lines)
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+# install_ollama_jetson.sh — add Jetson memory flags to the Ollama systemd unit
+#
+# WHY THIS SCRIPT EXISTS:
+# `start_ollama.sh` only applies its OLLAMA_* env vars if you launch Ollama
+# manually — but on the Jetson, Ollama is started by systemd at boot. Systemd
+# ignores the shell script entirely, so none of the flags were ever reaching
+# the live server. This installs a drop-in that systemd merges into the unit,
+# so `systemctl restart ollama` picks up the flags. Confirmed by the log line
+# `inference compute ... library=CUDA ... available="13.8 GiB"` — GPU works,
+# but without OLLAMA_GPU_OVERHEAD Ollama claims all 13.8 GiB and the
+# vision-encode pass OOMs the runner (seen as "llama runner process has
+# terminated ... status code: 500" when an image is attached).
+#
+# Run once, from the Jetson:
+#   sudo ./install_ollama_jetson.sh
+#
+set -euo pipefail
+
+if [[ $EUID -ne 0 ]]; then
+    echo "This script must be run as root (it writes to /etc/systemd/system)." >&2
+    echo "Re-run with: sudo $0" >&2
+    exit 1
+fi
+
+DROPIN_DIR="/etc/systemd/system/ollama.service.d"
+DROPIN_FILE="$DROPIN_DIR/marcus-jetson.conf"
+
+mkdir -p "$DROPIN_DIR"
+cat > "$DROPIN_FILE" <<'EOF'
+# Jetson memory caps for Marcus — do not edit live, re-run install_ollama_jetson.sh
+[Service]
+# Flash attention: ~30% less memory for attention tensors.
+Environment="OLLAMA_FLASH_ATTENTION=1"
+# Quantize KV cache to 8-bit (halves KV memory; negligible quality loss).
+Environment="OLLAMA_KV_CACHE_TYPE=q8_0"
+# Never hold two VL models simultaneously.
+Environment="OLLAMA_MAX_LOADED_MODELS=1"
+# CRITICAL: keep the model resident essentially forever. The previous 2m value
+# meant that any pause in conversation longer than 2 min unloaded the model,
+# and the NEXT "what do you see" paid another 60-90 s cold-load. That
+# cold-load hammered unified memory + disk bandwidth hard enough to break
+# Holosoma's 20 ms real-time deadline — inference times climbed from 2 ms to
+# 153 ms and the robot fell. 24h means one cold-load per day (during
+# warmup_vlm.sh while the robot is in squat); everything after is warm.
+Environment="OLLAMA_KEEP_ALIVE=24h"
+# Reserve 3 GiB of the 15 GiB iGPU for the rest of the system (YOLO +
+# Holosoma + camera + Python heap). Raised from 2 GiB after observing
+# Holosoma starvation during image requests — 2 GiB was enough for memory
+# but not for memory-bandwidth headroom.
+Environment="OLLAMA_GPU_OVERHEAD=3221225472"
+# Deprioritize Ollama so it never preempts Holosoma's real-time locomotion
+# thread. IOSchedulingClass=idle: Ollama only gets disk bandwidth when
+# nobody else wants it. Nice=10: low CPU priority.
+IOSchedulingClass=idle
+Nice=10
+EOF
+
+chmod 644 "$DROPIN_FILE"
+echo "Wrote $DROPIN_FILE"
+echo
+
+systemctl daemon-reload
+systemctl restart ollama
+
+sleep 2
+echo "=== verification: these vars should now be in the live process ==="
+journalctl -u ollama -n 40 --no-pager | grep -E "OLLAMA_FLASH_ATTENTION|OLLAMA_KV_CACHE_TYPE|OLLAMA_GPU_OVERHEAD|inference compute|vram" | tail -10
+echo
+echo "If the 'inference compute' line shows library=CUDA and your flags appear"
+echo "in the 'server config' dump above, you're done. Try 'what do you see' again."
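OLLAMA_GPU_OVERHEAD takes a byte count; the magic numbers in this commit are exact GiB multiples:

    print(3 * 2**30)   # 3221225472 — the 3 GiB used by the systemd drop-in
    print(2 * 2**30)   # 2147483648 — the 2 GiB still in start_ollama.sh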
@@ -25,6 +25,13 @@ export OLLAMA_FLASH_ATTENTION=1
 export OLLAMA_KV_CACHE_TYPE=q8_0
 export OLLAMA_KEEP_ALIVE=2m
 export OLLAMA_MAX_LOADED_MODELS=1
+# Reserve 2 GiB of the Jetson's 15 GiB iGPU for the rest of the system
+# (YOLO in CUDA FP16, Holosoma, camera, Python heap). Without this, Ollama
+# assumes the full 13.8 GiB "available" is its to use and sizes its compute
+# graph that way — which works for text, but the vision-encode pass of
+# Qwen2.5-VL then pushes total allocation past physical memory and the
+# runner dies with status 500.
+export OLLAMA_GPU_OVERHEAD=2147483648

 if [[ "$1" == "--fg" ]]; then
     echo "Running ollama in foreground..."
warmup_vlm.sh (new executable file, 82 lines)
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+# warmup_vlm.sh — pre-load Qwen2.5-VL into the iGPU BEFORE the robot stands up.
+#
+# WHY THIS EXISTS:
+# Cold-loading a 3 GB vision model on the Jetson's 16 GB unified memory
+# hammers disk + memory bandwidth hard enough to blow Holosoma's 20 ms
+# real-time deadline. If that happens while the robot is standing, Holosoma
+# loses balance control and the robot falls (observed 2026-04-22 —
+# RL FPS crashed from 56 → 10 during cold-load, OOM killer reaped Holosoma).
+#
+# SAFE OPERATING PROCEDURE (do this once per boot):
+#   1. Boot robot. Keep it in squat / damping mode (NOT standing).
+#   2. Run: ./warmup_vlm.sh
+#   3. Wait for "Warmup complete" (~60-90 s).
+#   4. NOW raise the robot to standing.
+#   5. Run: python3 run_marcus.py
+#
+# After warmup, OLLAMA_KEEP_ALIVE=24h keeps the model hot for the rest of the
+# day, so every "what do you see" is fast (~2-5 s) and Holosoma stays
+# healthy. Never cold-load a vision model while the robot is standing.
+
+set -e
+
+MODEL="${1:-qwen2.5vl:3b}"
+HOST="http://127.0.0.1:11434"
+
+cat <<'BANNER'
+════════════════════════════════════════════════════════════════════
+  VLM WARMUP — put the robot in SQUAT / DAMPING mode first!
+════════════════════════════════════════════════════════════════════
+
+The next 60-90 s will hammer disk + memory bandwidth. Holosoma
+cannot balance through this. If the robot is standing, IT WILL FALL.
+
+Press ENTER when the robot is safely in squat / damping.
+Press Ctrl-C to abort.
+
+BANNER
+read -r
+
+# Sanity: is ollama reachable?
+if ! curl -sf "$HOST/api/version" > /dev/null; then
+    echo "✗ Ollama is not running on $HOST"
+    echo "  Start it: sudo systemctl start ollama"
+    exit 1
+fi
+
+# Sanity: is the model in the store?
+if ! ollama list | awk 'NR>1 {print $1}' | grep -qx "$MODEL"; then
+    echo "✗ Model '$MODEL' not found in Ollama store"
+    echo "  Pull it: ollama pull $MODEL"
+    exit 1
+fi
+
+echo "→ Cold-loading $MODEL into iGPU (60-90 s, do not interrupt)..."
+echo "  Watching tegrastats in a second window is useful — GPU should spike."
+echo
+
+START=$(date +%s)
+
+# Send a tiny text-only request. Ollama loads the model on first request;
+# we don't need vision here — just getting the weights resident is what
+# takes the time. A text-only warmup also avoids needing a camera frame.
+#
+# keep_alive=24h matches the systemd config and prevents the server from
+# unloading the model after the default 5 min.
+RESPONSE=$(curl -s "$HOST/api/generate" \
+    -H 'Content-Type: application/json' \
+    -d "{\"model\":\"$MODEL\",\"prompt\":\"ok\",\"stream\":false,\"keep_alive\":\"24h\",\"options\":{\"num_predict\":1,\"num_batch\":128,\"num_ctx\":1024}}")
+
+END=$(date +%s)
+ELAPSED=$((END - START))
+
+if echo "$RESPONSE" | grep -q '"response"'; then
+    echo "✓ Warmup complete in ${ELAPSED}s — model is resident for 24h"
+    echo "  You can now stand the robot and run: python3 run_marcus.py"
+else
+    echo "✗ Warmup failed after ${ELAPSED}s"
+    echo "  Response: $RESPONSE"
+    echo "  Check: journalctl -u ollama --since '2 minutes ago' --no-pager | tail -40"
+    exit 1
+fi
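The same warmup can be issued from Python with the ollama client the repo already uses elsewhere; the model tag and dict-style response access below mirror the repo's conventions and are assumptions, not part of this commit:

    import ollama

    client = ollama.Client(host="http://127.0.0.1:11434")
    r = client.generate(model="qwen2.5vl:3b", prompt="ok",
                        keep_alive="24h",   # match the systemd drop-in
                        options={"num_predict": 1, "num_batch": 128, "num_ctx": 1024})
    print(r["response"])   # any non-error response means the weights are resident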