diff --git a/API/llava_api.py b/API/llava_api.py
index 20d4003..1966d3c 100644
--- a/API/llava_api.py
+++ b/API/llava_api.py
@@ -93,14 +93,38 @@ def call_llava(prompt: str, img_b64, num_predict: int = 200, use_history: bool =
     if img_b64:
         msg["images"] = [img_b64]
     messages.append(msg)
-    r = _client.chat(model=OLLAMA_MODEL, messages=messages,
-                     options={
-                         "temperature": 0.0,
-                         "num_predict": num_predict,
-                         "num_batch": NUM_BATCH,
-                         "num_ctx": NUM_CTX,
-                     })
-    return r["message"]["content"].strip()
+
+    # When an image is attached, pause YOLO to free iGPU memory for the
+    # vision-encoder activations (~1.5 GiB). Without this, concurrent YOLO
+    # inference + Qwen vision-encode exceeds the 15 GiB Jetson iGPU budget
+    # and the llama runner is reaped by the OOM killer (status code: 500).
+    # Text-only calls skip the pause — they fit easily and YOLO stays hot.
+    _paused = False
+    if img_b64:
+        try:
+            from API.yolo_api import yolo_pause, yolo_resume, YOLO_AVAILABLE
+            if YOLO_AVAILABLE:
+                yolo_pause()
+                _paused = True
+        except Exception:
+            pass
+
+    try:
+        r = _client.chat(model=OLLAMA_MODEL, messages=messages,
+                         options={
+                             "temperature": 0.0,
+                             "num_predict": num_predict,
+                             "num_batch": NUM_BATCH,
+                             "num_ctx": NUM_CTX,
+                         })
+        return r["message"]["content"].strip()
+    finally:
+        if _paused:
+            try:
+                from API.yolo_api import yolo_resume
+                yolo_resume()
+            except Exception:
+                pass
 
 
 def parse_json(raw: str):
@@ -117,16 +141,28 @@ def parse_json(raw: str):
 
 
 def ask(command: str, img_b64) -> dict:
-    """Send command + camera frame to the VLM with conversation history."""
+    """
+    Send command + camera frame to the VLM.
+
+    NOTE: this path does NOT use conversation history, even though other ask_*
+    paths do. With temperature=0 (required for reliable JSON action output),
+    including the last answer in the prompt makes the model lock onto
+    repeating it — `what do you see` then always replies with whatever it saw
+    the first time, regardless of the current frame. Vision grounding has to
+    be stateless per call. Chitchat (ask_talk) keeps history because there
+    the whole point is continuity.
+    """
     if not VLM_ENABLED:
         return dict(_VLM_OFF_EMPTY)
     try:
         facts_str = ("\nKnown facts: " + "; ".join(_facts) + ".") if _facts else ""
         raw = call_llava(MAIN_PROMPT.format(command=command, facts=facts_str), img_b64,
-                         num_predict=_cfg["num_predict_main"], use_history=True)
+                         num_predict=_cfg["num_predict_main"], use_history=False)
         print(f" Raw: {raw}")
         d = parse_json(raw)
         speak = d.get("speak", raw) if d else raw
+        # Still write to history so ask_talk() has context — just don't
+        # READ from it in this path (would cause lock-on repetition).
         add_to_history(command, speak)
         if d is None:
             return {"actions": [], "arm": None, "speak": raw, "abort": None}
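[Reviewer note, not part of the patch] A quick way to exercise this path without the full robot stack is to hit Ollama's /api/chat the same way call_llava() does. The sketch below is illustrative only: /tmp/frame.jpg stands in for any small JPEG, the model name and the num_batch/num_ctx values mirror config_Brain.json and warmup_vlm.sh later in this diff, and everything else (path, num_predict) is arbitrary. Before the memory changes in this series, this request reproduced the status-500 runner crash logged in commands.json; once the drop-in from install_ollama_jetson.sh is active it should return a caption.

# illustrative repro; assumes Ollama is up and a test JPEG exists at $IMG
IMG=/tmp/frame.jpg
curl -s http://127.0.0.1:11434/api/chat -d "{
  \"model\": \"qwen2.5vl:3b\", \"stream\": false,
  \"messages\": [{\"role\": \"user\", \"content\": \"what do you see\",
                  \"images\": [\"$(base64 -w0 "$IMG")\"]}],
  \"options\": {\"num_predict\": 100, \"num_batch\": 128, \"num_ctx\": 1024}
}" | python3 -c "import sys, json; print(json.load(sys.stdin)['message']['content'])"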
diff --git a/API/yolo_api.py b/API/yolo_api.py
index e6e1d64..030f7c5 100644
--- a/API/yolo_api.py
+++ b/API/yolo_api.py
@@ -17,6 +17,8 @@ def _stub_ppe(): return []
 def _stub_too_close(**k): return False
 def _stub_all(): return set()
 def _stub_fps(): return 0.0
+def _stub_pause(): return None
+def _stub_resume(): return None
 
 yolo_sees = _stub_sees
 yolo_count = _stub_count
@@ -26,6 +28,8 @@ yolo_ppe_violations = _stub_ppe
 yolo_person_too_close = _stub_too_close
 yolo_all_classes = _stub_all
 yolo_fps = _stub_fps
+yolo_pause = _stub_pause
+yolo_resume = _stub_resume
 
 
 def init_yolo(raw_frame_ref, frame_lock) -> bool:
@@ -33,6 +37,7 @@ def init_yolo(raw_frame_ref, frame_lock) -> bool:
     global YOLO_AVAILABLE
     global yolo_sees, yolo_count, yolo_closest, yolo_summary
     global yolo_ppe_violations, yolo_person_too_close, yolo_all_classes, yolo_fps
+    global yolo_pause, yolo_resume
 
     # marcus_yolo.py lives in Vision/
     models_dir = os.path.join(PROJECT_ROOT, "Vision")
@@ -44,6 +49,7 @@ def init_yolo(raw_frame_ref, frame_lock) -> bool:
             start_yolo,
             yolo_sees as _ys, yolo_count as _yc, yolo_closest as _ycl, yolo_summary as _ysu,
             yolo_ppe_violations as _ypp, yolo_person_too_close as _yptc, yolo_all_classes as _yac, yolo_fps as _yfps,
+            yolo_pause as _ypause, yolo_resume as _yresume,
         )
     except ImportError as e:
         print(f"marcus_yolo.py not found ({e})")
@@ -81,5 +87,7 @@ def init_yolo(raw_frame_ref, frame_lock) -> bool:
     yolo_person_too_close = _yptc
     yolo_all_classes = _yac
     yolo_fps = _yfps
+    yolo_pause = _ypause
+    yolo_resume = _yresume
     print(f"YOLO {'started' if ok else 'failed to start'}")
     return ok
diff --git a/Config/config_Brain.json b/Config/config_Brain.json
index 65d14ad..53f900f 100644
--- a/Config/config_Brain.json
+++ b/Config/config_Brain.json
@@ -3,7 +3,7 @@
     "ollama_host": "http://127.0.0.1:11434",
     "max_history": 6,
     "num_batch": 128,
-    "num_ctx": 2048,
+    "num_ctx": 1024,
     "subsystems": {
         "vlm": true,
         "lidar": true,
diff --git a/Data/Brain/Sessions/session_031_2026-04-22/alerts.json b/Data/Brain/Sessions/session_031_2026-04-22/alerts.json
new file mode 100644
index 0000000..0637a08
--- /dev/null
+++ b/Data/Brain/Sessions/session_031_2026-04-22/alerts.json
@@ -0,0 +1 @@
+[]
\ No newline at end of file
diff --git a/Data/Brain/Sessions/session_031_2026-04-22/commands.json b/Data/Brain/Sessions/session_031_2026-04-22/commands.json
new file mode 100644
index 0000000..28c2220
--- /dev/null
+++ b/Data/Brain/Sessions/session_031_2026-04-22/commands.json
@@ -0,0 +1,32 @@
+[
+  {
+    "time": "11:18:06",
+    "cmd": "what do you see",
+    "response": "Error: llama runner process has terminated: %!w() (status code: 500)",
+    "duration_s": 0.0
+  },
+  {
+    "time": "11:18:19",
+    "cmd": "hi",
+    "response": "Hello! I am Sanad. How can I help you?",
+    "duration_s": 0.0
+  },
+  {
+    "time": "11:18:49",
+    "cmd": "what do you see",
+    "response": "Error: llama runner process has terminated: %!w() (status code: 500)",
+    "duration_s": 0.0
+  },
+  {
+    "time": "11:19:20",
+    "cmd": "turn left 1 step",
+    "response": "local command",
+    "duration_s": 0.0
+  },
+  {
+    "time": "11:20:40",
+    "cmd": "help/",
+    "response": "local command",
+    "duration_s": 0.0
+  }
+]
\ No newline at end of file
diff --git a/Data/Brain/Sessions/session_031_2026-04-22/detections.json b/Data/Brain/Sessions/session_031_2026-04-22/detections.json
new file mode 100644
index 0000000..0637a08
--- /dev/null
+++ b/Data/Brain/Sessions/session_031_2026-04-22/detections.json
@@ -0,0 +1 @@
+[]
\ No newline at end of file
diff --git a/Data/Brain/Sessions/session_031_2026-04-22/places.json b/Data/Brain/Sessions/session_031_2026-04-22/places.json
new file mode 100644
index 0000000..9e26dfe
--- /dev/null
+++ b/Data/Brain/Sessions/session_031_2026-04-22/places.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/Vision/marcus_yolo.py b/Vision/marcus_yolo.py
index b876d9e..6d6a9e4 100644
--- a/Vision/marcus_yolo.py
+++ b/Vision/marcus_yolo.py
@@ -92,6 +92,12 @@ PPE_VIOLATION_CLASSES = {"no-helmet", "no_helmet", "no-vest", "no_vest"}
 _detections_lock = threading.Lock()
 _latest_detections = []  # list of dicts
 _yolo_running = [False]
+# When True, the inference loop skips model forward passes. Used by the VLM
+# path: Qwen2.5-VL's vision encode needs ~1.5 GiB of iGPU activations and the
+# Jetson's 15 GiB is shared with YOLO + Holosoma, so concurrent inference
+# spikes the runner into OOM. Pausing YOLO for the ~1 s the VLM spends on an
+# image prevents that peak. Model weights stay resident (fast resume).
+_yolo_paused = [False]
 _yolo_fps = [0.0]
 
 
@@ -279,6 +285,30 @@ def yolo_is_running() -> bool:
     return _yolo_running[0]
 
 
+def yolo_pause() -> None:
+    """
+    Stop YOLO forward passes and release PyTorch's CUDA cache back to the
+    driver so Ollama's vision encoder has contiguous iGPU memory to allocate
+    into. Weights stay resident, so resume is instant.
+    """
+    _yolo_paused[0] = True
+    try:
+        import torch
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+    except Exception:
+        pass
+
+
+def yolo_resume() -> None:
+    """Resume YOLO inference after a pause()."""
+    _yolo_paused[0] = False
+
+
+def yolo_is_paused() -> bool:
+    return _yolo_paused[0]
+
+
 def yolo_fps() -> float:
     """Return current YOLO inference FPS."""
     return _yolo_fps[0]
@@ -293,6 +323,9 @@ def _inference_loop(model, is_ppe: bool, raw_frame_ref, frame_lock,
     t_fps = time.time()
 
     while _yolo_running[0]:
+        if _yolo_paused[0]:
+            time.sleep(0.03)
+            continue
         with frame_lock:
             frame = raw_frame_ref[0]
         if frame is None:
diff --git a/check_ollama.sh b/check_ollama.sh
new file mode 100755
index 0000000..6046a58
--- /dev/null
+++ b/check_ollama.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+echo "=== ollama arch + version ==="
+file $(which ollama); ollama --version; uname -m
+echo
+echo "=== does nvidia-smi work on Jetson? ==="
===" +nvidia-smi 2>&1 | head -5 || echo "NO nvidia-smi (expected on JetPack 5 — Tegra uses tegrastats)" +echo +echo "=== tegrastats (Jetson GPU util) — 2 s sample ===" +timeout 2 tegrastats 2>&1 | head -2 +echo +echo "=== Ollama 'inference compute' line — THE answer ===" +journalctl -u ollama -n 200 --no-pager 2>/dev/null | grep -E "inference compute|vram|library=|starting runner|GPU" | tail -15 +echo +echo "=== Ollama service env ===" +systemctl cat ollama 2>/dev/null | grep -E "Environment|ExecStart" +echo +echo "=== JetPack / CUDA on this box ===" +cat /etc/nv_tegra_release 2>/dev/null | head -1 +ls /usr/local/cuda/lib64/libcudart.so* 2>/dev/null | head -3 +echo +echo "=== does Ollama's own lib dir exist? (stock install) ===" +ls /usr/lib/ollama/ /usr/local/lib/ollama/ 2>/dev/null diff --git a/install_ollama_jetson.sh b/install_ollama_jetson.sh new file mode 100755 index 0000000..d5acd2b --- /dev/null +++ b/install_ollama_jetson.sh @@ -0,0 +1,71 @@ +#!/usr/bin/env bash +# install_ollama_jetson.sh — add Jetson memory flags to the Ollama systemd unit +# +# WHY THIS SCRIPT EXISTS: +# `start_ollama.sh` only applies its OLLAMA_* env vars if you launch Ollama +# manually — but on the Jetson, Ollama is started by systemd at boot. Systemd +# ignores the shell script entirely, so none of the flags were ever reaching +# the live server. This installs a drop-in that systemd merges into the unit, +# so `systemctl restart ollama` picks up the flags. Confirmed by the log line +# `inference compute ... library=CUDA ... available="13.8 GiB"` — GPU works, +# but without OLLAMA_GPU_OVERHEAD Ollama claims all 13.8 GiB and the +# vision-encode pass OOMs the runner (seen as "llama runner process has +# terminated ... status code: 500" when an image is attached). +# +# Run once, from the Jetson: +# sudo ./install_ollama_jetson.sh +# +set -euo pipefail + +if [[ $EUID -ne 0 ]]; then + echo "This script must be run as root (it writes to /etc/systemd/system)." >&2 + echo "Re-run with: sudo $0" >&2 + exit 1 +fi + +DROPIN_DIR="/etc/systemd/system/ollama.service.d" +DROPIN_FILE="$DROPIN_DIR/marcus-jetson.conf" + +mkdir -p "$DROPIN_DIR" +cat > "$DROPIN_FILE" <<'EOF' +# Jetson memory caps for Marcus — do not edit live, re-run install_ollama_jetson.sh +[Service] +# Flash attention: ~30% less memory for attention tensors. +Environment="OLLAMA_FLASH_ATTENTION=1" +# Quantize KV cache to 8-bit (halves KV memory; negligible quality loss). +Environment="OLLAMA_KV_CACHE_TYPE=q8_0" +# Never hold two VL models simultaneously. +Environment="OLLAMA_MAX_LOADED_MODELS=1" +# CRITICAL: keep the model resident essentially forever. The previous 2m value +# meant that any pause in conversation longer than 2 min unloaded the model, +# and the NEXT "what do you see" paid another 60-90 s cold-load. That +# cold-load hammered unified memory + disk bandwidth hard enough to break +# Holosoma's 20 ms real-time deadline — inference times climbed from 2 ms to +# 153 ms and the robot fell. 24h means one cold-load per day (during +# warmup_vlm.sh while the robot is in squat), everything after is warm. +Environment="OLLAMA_KEEP_ALIVE=24h" +# Reserve 3 GiB of the 15 GiB iGPU for the rest of the system (YOLO + +# Holosoma + camera + Python heap). Raised from 2 GiB after observing +# Holosoma starvation during image requests — 2 GiB was enough for memory +# but not for memory-bandwidth headroom. +Environment="OLLAMA_GPU_OVERHEAD=3221225472" +# Deprioritize Ollama so it never preempts Holosoma's real-time locomotion +# thread. 
+# nobody else wants it. Nice=10 = well below the default CPU priority.
+IOSchedulingClass=idle
+Nice=10
+EOF
+
+chmod 644 "$DROPIN_FILE"
+echo "Wrote $DROPIN_FILE"
+echo
+
+systemctl daemon-reload
+systemctl restart ollama
+
+sleep 2
+echo "=== verification: these vars should now be in the live process ==="
+journalctl -u ollama -n 40 --no-pager | grep -E "OLLAMA_FLASH_ATTENTION|OLLAMA_KV_CACHE_TYPE|OLLAMA_GPU_OVERHEAD|inference compute|vram" | tail -10
+echo
+echo "If the 'inference compute' line shows library=CUDA and your flags appear"
+echo "in the 'server config' dump above, you're done. Try 'what do you see' again."
diff --git a/start_ollama.sh b/start_ollama.sh
index 922d97d..39bede4 100755
--- a/start_ollama.sh
+++ b/start_ollama.sh
@@ -25,6 +25,13 @@ export OLLAMA_FLASH_ATTENTION=1
 export OLLAMA_KV_CACHE_TYPE=q8_0
 export OLLAMA_KEEP_ALIVE=2m
 export OLLAMA_MAX_LOADED_MODELS=1
+# Reserve 2 GiB of the Jetson's 15 GiB iGPU for the rest of the system
+# (YOLO in CUDA FP16, Holosoma, camera, Python heap). Without this, Ollama
+# assumes the full 13.8 GiB "available" is its to use and sizes its compute
+# graph that way — which works for text, but the vision-encode pass of
+# Qwen2.5-VL then pushes total allocation past physical memory and the
+# runner dies with status 500.
+export OLLAMA_GPU_OVERHEAD=2147483648
 
 if [[ "$1" == "--fg" ]]; then
   echo "Running ollama in foreground..."
diff --git a/warmup_vlm.sh b/warmup_vlm.sh
new file mode 100755
index 0000000..556a37f
--- /dev/null
+++ b/warmup_vlm.sh
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+# warmup_vlm.sh — pre-load Qwen2.5-VL into iGPU BEFORE the robot stands up.
+#
+# WHY THIS EXISTS:
+#   Cold-loading a 3 GB vision model on the Jetson's 16 GB unified memory
+#   hammers disk + memory bandwidth hard enough to blow Holosoma's 20 ms
+#   real-time deadline. If that happens while the robot is standing, Holosoma
+#   loses balance control and the robot falls (observed 2026-04-22 —
+#   RL FPS crashed from 56 → 10 during cold-load, OOM killer reaped Holosoma).
+#
+# SAFE OPERATING PROCEDURE (do this once per boot):
+#   1. Boot robot. Keep it in squat / damping mode (NOT standing).
+#   2. Run: ./warmup_vlm.sh
+#   3. Wait for "Warmup complete" (~60-90 s).
+#   4. NOW raise the robot to standing.
+#   5. Run: python3 run_marcus.py
+#
+# After warmup, OLLAMA_KEEP_ALIVE=24h keeps the model hot for the rest of the
+# day, so every "what do you see" is fast (~2-5 s) and Holosoma stays
+# healthy. Never cold-load a vision model while the robot is standing.
+
+set -e
+
+MODEL="${1:-qwen2.5vl:3b}"
+HOST="http://127.0.0.1:11434"
+
+cat <<'BANNER'
+════════════════════════════════════════════════════════════════════
+  VLM WARMUP — put the robot in SQUAT / DAMPING mode first!
+════════════════════════════════════════════════════════════════════
+
+  The next 60-90 s will hammer disk + memory bandwidth. Holosoma
+  cannot balance through this. If the robot is standing, IT WILL FALL.
+
+  Press ENTER when the robot is safely in squat / damping.
+  Press Ctrl-C to abort.
+
+BANNER
+read -r
+
+# Sanity: is ollama reachable?
+if ! curl -sf "$HOST/api/version" > /dev/null; then
+  echo "✗ Ollama is not running on $HOST"
+  echo "  Start it: sudo systemctl start ollama"
+  exit 1
+fi
+
+# Sanity: is the model in the store?
+if ! ollama list | awk 'NR>1 {print $1}' | grep -qx "$MODEL"; then
+  echo "✗ Model '$MODEL' not found in Ollama store"
+  echo "  Pull it: ollama pull $MODEL"
+  exit 1
+fi
+
+echo "→ Cold-loading $MODEL into iGPU (60-90 s, do not interrupt)..."
+echo "  Watching tegrastats in a second window is useful — GPU should spike."
+echo
+
+START=$(date +%s)
+
+# Send a tiny text-only request. Ollama loads the model on first request;
+# we don't need vision here — just getting weights resident is what takes the
+# long time. A text-only warmup also avoids needing a camera frame.
+#
+# keep_alive=24h matches the systemd config and prevents the server from
+# unloading the model after the default 5 min.
+RESPONSE=$(curl -s "$HOST/api/generate" \
+  -H 'Content-Type: application/json' \
+  -d "{\"model\":\"$MODEL\",\"prompt\":\"ok\",\"stream\":false,\"keep_alive\":\"24h\",\"options\":{\"num_predict\":1,\"num_batch\":128,\"num_ctx\":1024}}")
+
+END=$(date +%s)
+ELAPSED=$((END - START))
+
+if echo "$RESPONSE" | grep -q '"response"'; then
+  echo "✓ Warmup complete in ${ELAPSED}s — model is resident for 24h"
+  echo "  You can now stand the robot and run: python3 run_marcus.py"
+else
+  echo "✗ Warmup failed after ${ELAPSED}s"
+  echo "  Response: $RESPONSE"
+  echo "  Check: journalctl -u ollama --since '2 minutes ago' --no-pager | tail -40"
+  exit 1
+fi
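[Reviewer note, not part of the patch] After install_ollama_jetson.sh and warmup_vlm.sh have both run, two quick checks confirm the drop-in actually reached the live server and the model is pinned resident. This is a hedged sketch using only stock systemd and Ollama tooling; nothing repo-specific is assumed.

# are the drop-in env vars in the running ollama process?
PID=$(systemctl show -p MainPID --value ollama)
sudo cat "/proc/$PID/environ" | tr '\0' '\n' | grep -E 'OLLAMA_(GPU_OVERHEAD|KEEP_ALIVE|KV_CACHE_TYPE|FLASH_ATTENTION|MAX_LOADED_MODELS)'

# is qwen2.5vl:3b loaded and pinned? the UNTIL column should read roughly 24 hours from now
ollama ps
curl -s http://127.0.0.1:11434/api/ps | python3 -m json.tool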