#!/usr/bin/env bash
# install_ollama_jetson.sh — add Jetson memory flags to the Ollama systemd unit
#
# WHY THIS SCRIPT EXISTS:
# `start_ollama.sh` only applies its OLLAMA_* env vars if you launch Ollama
# manually — but on the Jetson, Ollama is started by systemd at boot. Systemd
# ignores the shell script entirely, so none of the flags were ever reaching
# the live server. This installs a drop-in that systemd merges into the unit,
# so `systemctl restart ollama` picks up the flags. Confirmed by the log line
# `inference compute ... library=CUDA ... available="13.8 GiB"` — GPU works,
# but without OLLAMA_GPU_OVERHEAD Ollama claims all 13.8 GiB and the
# vision-encode pass OOMs the runner (seen as "llama runner process has
# terminated ... status code: 500" when an image is attached).
#
# Run once, from the Jetson:
#   sudo ./install_ollama_jetson.sh
#
set -euo pipefail

if [[ $EUID -ne 0 ]]; then
  echo "This script must be run as root (it writes to /etc/systemd/system)." >&2
  echo "Re-run with: sudo $0" >&2
  exit 1
fi

DROPIN_DIR="/etc/systemd/system/ollama.service.d"
DROPIN_FILE="$DROPIN_DIR/marcus-jetson.conf"

mkdir -p "$DROPIN_DIR"

cat > "$DROPIN_FILE" <<'EOF'
# Jetson memory caps for Marcus — do not edit live, re-run install_ollama_jetson.sh
[Service]
# Flash attention: ~30% less memory for attention tensors.
Environment="OLLAMA_FLASH_ATTENTION=1"
# Quantize KV cache to 8-bit (halves KV memory; negligible quality loss).
Environment="OLLAMA_KV_CACHE_TYPE=q8_0"
# Never hold two VL models simultaneously.
Environment="OLLAMA_MAX_LOADED_MODELS=1"
# CRITICAL: keep the model resident essentially forever. The previous 2m value
# meant that any pause in conversation longer than 2 min unloaded the model,
# and the NEXT "what do you see" paid another 60-90 s cold-load. That
# cold-load hammered unified memory + disk bandwidth hard enough to break
# Holosoma's 20 ms real-time deadline — inference times climbed from 2 ms to
# 153 ms and the robot fell. 24h means one cold-load per day (during
# warmup_vlm.sh while the robot is in squat), everything after is warm.
Environment="OLLAMA_KEEP_ALIVE=24h"
# Reserve 3 GiB (3 * 1024^3 = 3221225472 bytes) of the 15 GiB iGPU for the
# rest of the system (YOLO + Holosoma + camera + Python heap). Raised from
# 2 GiB after observing Holosoma starvation during image requests — 2 GiB was
# enough for memory but not for memory-bandwidth headroom.
Environment="OLLAMA_GPU_OVERHEAD=3221225472"
# Deprioritize Ollama so it never preempts Holosoma's real-time locomotion
# thread. IOSchedulingClass=idle = Ollama only gets disk bandwidth when
# nobody else wants it. Nice=10 = well below the default CPU priority of 0
# (the scale runs to 19, the minimum).
IOSchedulingClass=idle
Nice=10
EOF

chmod 644 "$DROPIN_FILE"
echo "Wrote $DROPIN_FILE"
echo

systemctl daemon-reload
systemctl restart ollama
sleep 2

echo "=== verification: these vars should now be in the live process ==="
journalctl -u ollama -n 40 --no-pager \
  | grep -E "OLLAMA_FLASH_ATTENTION|OLLAMA_KV_CACHE_TYPE|OLLAMA_GPU_OVERHEAD|inference compute|vram" \
  | tail -10
echo
echo "If the 'inference compute' line shows library=CUDA and your flags appear"
echo "in the 'server config' dump above, you're done. Try 'what do you see' again."
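
# Belt-and-braces check (a sketch, not part of the original verification flow):
# `systemctl show` queries the merged unit from systemd itself, so it proves
# the drop-in took effect even if the journal grep above matched nothing
# (e.g. if Ollama changes its startup log format between releases).
echo
echo "=== merged unit properties (queried from systemd, not the journal) ==="
systemctl show ollama -p Environment -p Nice -p IOSchedulingClass --no-pager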
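
# For reference, a minimal warm-up request. This is an assumption about what
# warmup_vlm.sh sends (the model name below is a placeholder, not the real
# one); the point is that `keep_alive` can also be pinned per-request via the
# Ollama HTTP API on its default port 11434:
#   curl -s http://localhost:11434/api/generate \
#     -d '{"model": "<your-vl-model>", "prompt": "hi", "keep_alive": "24h"}'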
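
# To roll back everything this script did, mirror the steps above in reverse:
#   sudo rm /etc/systemd/system/ollama.service.d/marcus-jetson.conf
#   sudo systemctl daemon-reload && sudo systemctl restart ollama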