#!/usr/bin/env bash
# install_ollama_jetson.sh — add Jetson memory flags to the Ollama systemd unit
#
# WHY THIS SCRIPT EXISTS:
#   `start_ollama.sh` only applies its OLLAMA_* env vars if you launch Ollama
#   manually — but on the Jetson, Ollama is started by systemd at boot. Systemd
#   ignores the shell script entirely, so none of the flags were ever reaching
#   the live server. This installs a drop-in that systemd merges into the unit,
#   so `systemctl restart ollama` picks up the flags. Confirmed by the log line
#   `inference compute ... library=CUDA ... available="13.8 GiB"` — GPU works,
#   but without OLLAMA_GPU_OVERHEAD Ollama claims all 13.8 GiB and the
#   vision-encode pass OOMs the runner (seen as "llama runner process has
#   terminated ... status code: 500" when an image is attached).
#
# Run once, from the Jetson:
#   sudo ./install_ollama_jetson.sh

# Fail fast: -e exit on error, -u error on unset vars, pipefail so a failing
# stage anywhere in a pipeline fails the whole pipeline.
set -euo pipefail
# Writing under /etc/systemd/system requires root; bail out early with a
# copy-pasteable hint rather than failing mid-install on mkdir/cat.
if [[ $EUID -ne 0 ]]; then
  echo "This script must be run as root (it writes to /etc/systemd/system)." >&2
  echo "Re-run with: sudo $0" >&2
  exit 1
fi
# systemd merges every *.conf under <unit>.service.d/ into the unit definition,
# so we never have to touch the vendor-installed ollama.service file.
DROPIN_DIR="/etc/systemd/system/ollama.service.d"
DROPIN_FILE="$DROPIN_DIR/marcus-jetson.conf"

mkdir -p "$DROPIN_DIR"
# Quoted 'EOF' delimiter: heredoc contents are written literally, with no
# shell expansion — the payload below is exact systemd unit syntax.
cat > "$DROPIN_FILE" <<'EOF'
# Jetson memory caps for Marcus — do not edit live, re-run install_ollama_jetson.sh
[Service]
# Flash attention: ~30% less memory for attention tensors.
Environment="OLLAMA_FLASH_ATTENTION=1"
# Quantize KV cache to 8-bit (halves KV memory; negligible quality loss).
Environment="OLLAMA_KV_CACHE_TYPE=q8_0"
# Never hold two VL models simultaneously.
Environment="OLLAMA_MAX_LOADED_MODELS=1"
# CRITICAL: keep the model resident essentially forever. The previous 2m value
# meant that any pause in conversation longer than 2 min unloaded the model,
# and the NEXT "what do you see" paid another 60-90 s cold-load. That
# cold-load hammered unified memory + disk bandwidth hard enough to break
# Holosoma's 20 ms real-time deadline — inference times climbed from 2 ms to
# 153 ms and the robot fell. 24h means one cold-load per day (during
# warmup_vlm.sh while the robot is in squat), everything after is warm.
Environment="OLLAMA_KEEP_ALIVE=24h"
# Reserve 3 GiB of the 15 GiB iGPU for the rest of the system (YOLO +
# Holosoma + camera + Python heap). Raised from 2 GiB after observing
# Holosoma starvation during image requests — 2 GiB was enough for memory
# but not for memory-bandwidth headroom.
Environment="OLLAMA_GPU_OVERHEAD=3221225472"
# Deprioritize Ollama so it never preempts Holosoma's real-time locomotion
# thread. IOSchedulingClass=idle = Ollama only gets disk bandwidth when
# nobody else wants it. Nice=10 = lowest normal priority.
IOSchedulingClass=idle
Nice=10
EOF

# World-readable, root-writable — standard perms for systemd unit fragments.
chmod 644 "$DROPIN_FILE"
echo "Wrote $DROPIN_FILE"
echo
# Make systemd re-read unit files so the new drop-in is merged, then restart
# the live server so the env vars actually reach the process.
systemctl daemon-reload
systemctl restart ollama

# Give the server a moment to log its startup config before we grep for it.
sleep 2
echo "=== verification: these vars should now be in the live process ==="
# '|| true': grep exits non-zero when nothing has matched yet (e.g. the server
# is still starting), and under 'set -euo pipefail' that would abort the
# script before the guidance below prints. Verification is best-effort.
journalctl -u ollama -n 40 --no-pager \
  | grep -E "OLLAMA_FLASH_ATTENTION|OLLAMA_KV_CACHE_TYPE|OLLAMA_GPU_OVERHEAD|inference compute|vram" \
  | tail -10 || true
echo
echo "If the 'inference compute' line shows library=CUDA and your flags appear"
echo "in the 'server config' dump above, you're done. Try 'what do you see' again."