Marcus/install_ollama_jetson.sh

72 lines
3.1 KiB
Bash
Executable File

#!/usr/bin/env bash
# install_ollama_jetson.sh — add Jetson memory flags to the Ollama systemd unit
#
# WHY THIS SCRIPT EXISTS:
# `start_ollama.sh` only applies its OLLAMA_* env vars if you launch Ollama
# manually — but on the Jetson, Ollama is started by systemd at boot. Systemd
# ignores the shell script entirely, so none of the flags were ever reaching
# the live server. This installs a drop-in that systemd merges into the unit,
# so `systemctl restart ollama` picks up the flags. Confirmed by the log line
# `inference compute ... library=CUDA ... available="13.8 GiB"` — GPU works,
# but without OLLAMA_GPU_OVERHEAD Ollama claims all 13.8 GiB and the
# vision-encode pass OOMs the runner (seen as "llama runner process has
# terminated ... status code: 500" when an image is attached).
#
# Run once, from the Jetson:
#   sudo ./install_ollama_jetson.sh
#
set -euo pipefail

# Everything below touches /etc and systemd — refuse to run unprivileged.
if [[ $EUID -ne 0 ]]; then
  echo "This script must be run as root (it writes to /etc/systemd/system)." >&2
  echo "Re-run with: sudo $0" >&2
  exit 1
fi

readonly DROPIN_DIR="/etc/systemd/system/ollama.service.d"
readonly DROPIN_FILE="$DROPIN_DIR/marcus-jetson.conf"

# Fail early with a clear message if the ollama unit isn't installed at all —
# otherwise we'd write the drop-in and then die on a cryptic restart error.
if ! systemctl cat ollama >/dev/null 2>&1; then
  echo "ollama.service not found — install Ollama before running this script." >&2
  exit 1
fi

mkdir -p "$DROPIN_DIR"

# Quoted 'EOF' delimiter: nothing in the drop-in is expanded by this shell;
# systemd receives the Environment= lines verbatim.
cat > "$DROPIN_FILE" <<'EOF'
# Jetson memory caps for Marcus — do not edit live, re-run install_ollama_jetson.sh
[Service]
# Flash attention: ~30% less memory for attention tensors.
Environment="OLLAMA_FLASH_ATTENTION=1"
# Quantize KV cache to 8-bit (halves KV memory; negligible quality loss).
Environment="OLLAMA_KV_CACHE_TYPE=q8_0"
# Never hold two VL models simultaneously.
Environment="OLLAMA_MAX_LOADED_MODELS=1"
# CRITICAL: keep the model resident essentially forever. The previous 2m value
# meant that any pause in conversation longer than 2 min unloaded the model,
# and the NEXT "what do you see" paid another 60-90 s cold-load. That
# cold-load hammered unified memory + disk bandwidth hard enough to break
# Holosoma's 20 ms real-time deadline — inference times climbed from 2 ms to
# 153 ms and the robot fell. 24h means one cold-load per day (during
# warmup_vlm.sh while the robot is in squat), everything after is warm.
Environment="OLLAMA_KEEP_ALIVE=24h"
# Reserve 3 GiB of the 15 GiB iGPU for the rest of the system (YOLO +
# Holosoma + camera + Python heap). Raised from 2 GiB after observing
# Holosoma starvation during image requests — 2 GiB was enough for memory
# but not for memory-bandwidth headroom.
Environment="OLLAMA_GPU_OVERHEAD=3221225472"
# Deprioritize Ollama so it never preempts Holosoma's real-time locomotion
# thread. IOSchedulingClass=idle = Ollama only gets disk bandwidth when
# nobody else wants it. Nice=10 = lowest normal priority.
IOSchedulingClass=idle
Nice=10
EOF
chmod 644 "$DROPIN_FILE"
echo "Wrote $DROPIN_FILE"
echo

# Merge the drop-in into the unit and bounce the server so it takes effect.
systemctl daemon-reload
systemctl restart ollama

# Give the server a moment to log its startup config before grepping for it.
sleep 2

echo "=== verification: these vars should now be in the live process ==="
# NB: grep exits 1 when nothing matches; under `set -eo pipefail` that would
# have aborted the script right here — *after* a successful install — and
# swallowed the final instructions. A missing match is a verification hint,
# not a failure, so fall back to a pointer instead of dying.
journalctl -u ollama -n 40 --no-pager \
  | grep -E "OLLAMA_FLASH_ATTENTION|OLLAMA_KV_CACHE_TYPE|OLLAMA_GPU_OVERHEAD|inference compute|vram" \
  | tail -10 \
  || echo "(no matching log lines yet — inspect 'journalctl -u ollama' manually)"
echo
echo "If the 'inference compute' line shows library=CUDA and your flags appear"
echo "in the 'server config' dump above, you're done. Try 'what do you see' again."