#!/usr/bin/env bash
# install_ollama_jetson.sh — add Jetson memory flags to the Ollama systemd unit
#
# WHY THIS SCRIPT EXISTS:
# `start_ollama.sh` only applies its OLLAMA_* env vars if you launch Ollama
# manually — but on the Jetson, Ollama is started by systemd at boot. Systemd
# ignores the shell script entirely, so none of the flags were ever reaching
# the live server. This installs a drop-in that systemd merges into the unit,
# so `systemctl restart ollama` picks up the flags. The log line
# `inference compute ... library=CUDA ... available="13.8 GiB"` confirms the
# GPU path works, but without OLLAMA_GPU_OVERHEAD Ollama claims all 13.8 GiB
# and the vision-encode pass OOMs the runner (seen as "llama runner process
# has terminated ... status code: 500" when an image is attached).
#
# Run once, from the Jetson:
# sudo ./install_ollama_jetson.sh
#
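# To inspect the merge by hand afterwards (optional; the verification step at
# the bottom of this script covers the usual case):
#   systemctl cat ollama                    # unit file plus this drop-in
#   systemctl show -p Environment ollama    # the merged Environment= list
#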
set -euo pipefail
if [[ $EUID -ne 0 ]]; then
echo "This script must be run as root (it writes to /etc/systemd/system)." >&2
echo "Re-run with: sudo $0" >&2
exit 1
fi
DROPIN_DIR="/etc/systemd/system/ollama.service.d"
DROPIN_FILE="$DROPIN_DIR/marcus-jetson.conf"
mkdir -p "$DROPIN_DIR"
cat > "$DROPIN_FILE" <<'EOF'
# Jetson memory caps for Marcus — do not edit live, re-run install_ollama_jetson.sh
[Service]
# Flash attention: ~30% less memory for attention tensors.
Environment="OLLAMA_FLASH_ATTENTION=1"
# Quantize KV cache to 8-bit (halves KV memory; negligible quality loss).
Environment="OLLAMA_KV_CACHE_TYPE=q8_0"
# Never hold two VL models simultaneously.
Environment="OLLAMA_MAX_LOADED_MODELS=1"
# CRITICAL: keep the model resident essentially forever. The previous 2m value
# meant that any pause in conversation longer than 2 min unloaded the model,
# and the NEXT "what do you see" paid another 60-90 s cold-load. That
# cold-load hammered unified memory + disk bandwidth hard enough to break
# Holosoma's 20 ms real-time deadline — inference times climbed from 2 ms to
# 153 ms and the robot fell. 24h means one cold-load per day (during
# warmup_vlm.sh while the robot is in squat); everything after that is warm.
Environment="OLLAMA_KEEP_ALIVE=24h"
# Reserve 4 GiB of the 15 GiB iGPU for the rest of the system (YOLO +
# Holosoma + camera + Python heap + vision-encoder transient spike).
# Raised from 3 GiB after observing the robot wobble/fall during an
# `ask()` call — the vision encoder's peak activations were still eating
# into Holosoma's slack memory.
Environment="OLLAMA_GPU_OVERHEAD=4294967296"
# Cap the model's context window at the server level so Ollama's compute
# graph pre-allocation shrinks from 7.3 GiB (KvSize:2048 default) to
# ~3-4 GiB (KvSize:1024). Runtime num_ctx in request options is ignored
# for compute-graph sizing — only this env var affects the load-time
# allocation. 1024 tokens is enough for our prompts + short answers.
Environment="OLLAMA_CONTEXT_LENGTH=1024"
# Deprioritize Ollama so it never preempts Holosoma's real-time locomotion
# thread. IOSchedulingClass=idle = Ollama only gets disk bandwidth when
# nobody else wants it. Nice=10 = CPU priority well below the default of 0
# (19 would be the floor).
IOSchedulingClass=idle
Nice=10
# Make Ollama the preferred OOM-killer victim. If the 15 GiB Jetson iGPU +
# system memory fills up (e.g. swap thrashing after a vision request),
# kernel picks Ollama first — it auto-restarts, the model cold-loads on
# next request, Marcus and Holosoma keep running. Much better outcome than
# killing Marcus (robot brain) or Holosoma (balance control).
# Observed 2026-04-22: with oom_score_adj=0 across the board, the kernel
# killed Marcus's python3 (22 GB virtual, swapped) instead of Ollama
# (larger RSS but systemd-managed). This flips that.
OOMScoreAdjust=500
EOF
chmod 644 "$DROPIN_FILE"
echo "Wrote $DROPIN_FILE"
echo
systemctl daemon-reload
systemctl restart ollama
sleep 2
echo "=== verification: these vars should now be in the live process ==="
journalctl -u ollama -n 40 --no-pager | grep -E "OLLAMA_FLASH_ATTENTION|OLLAMA_KV_CACHE_TYPE|OLLAMA_GPU_OVERHEAD|inference compute|vram" | tail -10
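# Extra spot checks of the scheduling and OOM settings on the live process.
# (Optional sketch; assumes `ionice` from util-linux is installed, as it is on
# stock Ubuntu/L4T images.)
OLLAMA_PID="$(systemctl show -p MainPID --value ollama)"
if [[ -n "$OLLAMA_PID" && "$OLLAMA_PID" != "0" ]]; then
  echo
  echo "=== verification: nice / io class / oom_score_adj of the live process ==="
  ps -o pid,ni,comm -p "$OLLAMA_PID"
  ionice -p "$OLLAMA_PID"
  echo "oom_score_adj: $(cat "/proc/$OLLAMA_PID/oom_score_adj")"
fi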
echo
echo "If the 'inference compute' line shows library=CUDA and your flags appear"
echo "in the 'server config' dump above, you're done. Try 'what do you see' again."