#!/usr/bin/env bash
# install_ollama_jetson.sh — add Jetson memory flags to the Ollama systemd unit
#
# WHY THIS SCRIPT EXISTS:
# `start_ollama.sh` only applies its OLLAMA_* env vars if you launch Ollama
# manually — but on the Jetson, Ollama is started by systemd at boot. Systemd
# ignores the shell script entirely, so none of the flags were ever reaching
# the live server. This installs a drop-in that systemd merges into the unit,
# so `systemctl restart ollama` picks up the flags. Confirmed by the log line
# `inference compute ... library=CUDA ... available="13.8 GiB"` — GPU works,
# but without OLLAMA_GPU_OVERHEAD Ollama claims all 13.8 GiB and the
# vision-encode pass OOMs the runner (seen as "llama runner process has
# terminated ... status code: 500" when an image is attached).
#
# Run once, from the Jetson:
#   sudo ./install_ollama_jetson.sh
#
# Strict mode: abort on errors, unset variables, and mid-pipeline failures.
set -euo pipefail
# Root is required: we write under /etc/systemd/system and restart the unit.
if [[ $EUID -ne 0 ]]; then
  echo "This script must be run as root (it writes to /etc/systemd/system)." >&2
  echo "Re-run with: sudo $0" >&2
  exit 1
fi
# Where systemd looks for unit-file overrides for ollama.service; the drop-in
# below is merged into the unit at daemon-reload time.
readonly DROPIN_DIR="/etc/systemd/system/ollama.service.d"
readonly DROPIN_FILE="$DROPIN_DIR/marcus-jetson.conf"

mkdir -p "$DROPIN_DIR"
# Write the drop-in. The quoted 'EOF' delimiter makes the body literal: the
# OLLAMA_* values below are NOT expanded by this script — they reach the conf
# file verbatim and are read by systemd when the unit starts.
cat > "$DROPIN_FILE" <<'EOF'
# Jetson memory caps for Marcus — do not edit live, re-run install_ollama_jetson.sh
[Service]
# Flash attention: ~30% less memory for attention tensors.
Environment="OLLAMA_FLASH_ATTENTION=1"
# Quantize KV cache to 8-bit (halves KV memory; negligible quality loss).
Environment="OLLAMA_KV_CACHE_TYPE=q8_0"
# Never hold two VL models simultaneously.
Environment="OLLAMA_MAX_LOADED_MODELS=1"
# CRITICAL: keep the model resident essentially forever. The previous 2m value
# meant that any pause in conversation longer than 2 min unloaded the model,
# and the NEXT "what do you see" paid another 60-90 s cold-load. That
# cold-load hammered unified memory + disk bandwidth hard enough to break
# Holosoma's 20 ms real-time deadline — inference times climbed from 2 ms to
# 153 ms and the robot fell. 24h means one cold-load per day (during
# warmup_vlm.sh while the robot is in squat), everything after is warm.
Environment="OLLAMA_KEEP_ALIVE=24h"
# Reserve 4 GiB of the 15 GiB iGPU for the rest of the system (YOLO +
# Holosoma + camera + Python heap + vision-encoder transient spike).
# Raised from 3 GiB after observing the robot wobble/fall during an
# `ask()` call — the vision encoder's peak activations were still eating
# into Holosoma's slack memory.
Environment="OLLAMA_GPU_OVERHEAD=4294967296"
# Cap the model's context window at the server level so Ollama's compute
# graph pre-allocation shrinks from 7.3 GiB (KvSize:2048 default) to
# ~3-4 GiB (KvSize:1024). Runtime num_ctx in request options is ignored
# for compute-graph sizing — only this env var affects the load-time
# allocation. 1024 tokens is enough for our prompts + short answers.
Environment="OLLAMA_CONTEXT_LENGTH=1024"
# Deprioritize Ollama so it never preempts Holosoma's real-time locomotion
# thread. IOSchedulingClass=idle = Ollama only gets disk bandwidth when
# nobody else wants it. Nice=10 = lowest normal priority.
IOSchedulingClass=idle
Nice=10
# Make Ollama the preferred OOM-killer victim. If the 15 GiB Jetson iGPU +
# system memory fills up (e.g. swap thrashing after a vision request),
# kernel picks Ollama first — it auto-restarts, the model cold-loads on
# next request, Marcus and Holosoma keep running. Much better outcome than
# killing Marcus (robot brain) or Holosoma (balance control).
# Observed 2026-04-22: with oom_score_adj=0 across the board, the kernel
# killed Marcus's python3 (22 GB virtual, swapped) instead of Ollama
# (larger RSS but systemd-managed). This flips that.
OOMScoreAdjust=500
EOF
# World-readable is fine: the drop-in holds no secrets, only tuning flags.
chmod 644 "$DROPIN_FILE"
echo "Wrote $DROPIN_FILE"
echo

# Apply: systemd must re-read unit files before the restart sees the drop-in.
systemctl daemon-reload
systemctl restart ollama

# Give the server a moment to log its startup config before grepping for it.
sleep 2
echo "=== verification: these vars should now be in the live process ==="
# '|| true' is required: under `set -euo pipefail` a grep with no matches
# exits 1, which would abort the script here — after the install already
# succeeded — and skip the guidance below. The verification output is
# advisory only, so a miss must not turn into a failure exit.
journalctl -u ollama -n 40 --no-pager | grep -E "OLLAMA_FLASH_ATTENTION|OLLAMA_KV_CACHE_TYPE|OLLAMA_GPU_OVERHEAD|inference compute|vram" | tail -10 || true
echo
echo "If the 'inference compute' line shows library=CUDA and your flags appear"
echo "in the 'server config' dump above, you're done. Try 'what do you see' again."