#!/usr/bin/env bash
# start_ollama.sh — launch Ollama with Jetson-friendly memory settings
#
# The Jetson Orin NX has 16 GB unified CPU+GPU memory. When Marcus + YOLO +
# Whisper + Holosoma + the camera + audio all run alongside Qwen2.5-VL,
# the compute graph OOMs the llama runner and Linux kills the biggest
# process (often Holosoma — which is a safety problem for locomotion).
#
# These env vars cut Ollama's memory footprint:
#   OLLAMA_FLASH_ATTENTION=1     ~30% less memory for attention tensors
#   OLLAMA_KV_CACHE_TYPE=q8_0    quantize the KV cache (halves it)
#   OLLAMA_KEEP_ALIVE=2m         keep the model warm for 2 min, then evict
#                                (adjust if cold-load lag matters more
#                                than idle memory)
#   OLLAMA_MAX_LOADED_MODELS=1   never hold two VL models at once
#
# Usage:
#   ./start_ollama.sh        # starts server in background, logs to /tmp/ollama.log
#   ./start_ollama.sh --fg   # runs in foreground (for debugging)

set -euo pipefail

# Stop any previous server or orphaned runner before restarting.
# pkill exits nonzero when nothing matches, so tolerate that under set -e.
pkill -f "ollama (runner|serve)" 2>/dev/null || true
sleep 1

export OLLAMA_FLASH_ATTENTION=1
export OLLAMA_KV_CACHE_TYPE=q8_0
export OLLAMA_KEEP_ALIVE=2m
export OLLAMA_MAX_LOADED_MODELS=1

# Reserve 2 GiB of the Jetson's 15 GiB iGPU for the rest of the system
# (YOLO in CUDA FP16, Holosoma, camera, Python heap). Without this, Ollama
# assumes the full 13.8 GiB "available" is its to use and sizes its compute
# graph that way — which works for text, but the vision-encode pass of
# Qwen2.5-VL then pushes total allocation past physical memory and the
# runner dies with status 500.
export OLLAMA_GPU_OVERHEAD=2147483648   # 2 GiB = 2 * 1024^3 bytes

if [[ "${1:-}" == "--fg" ]]; then
  echo "Running ollama in foreground..."
  ollama serve
else
  ollama serve > /tmp/ollama.log 2>&1 &
  sleep 3
  if curl -sf http://localhost:11434/api/version > /dev/null; then
    echo "✓ Ollama started (pid $(pgrep -f 'ollama serve'))"
    echo "  logs: tail -f /tmp/ollama.log"
    echo "  stop: pkill -f 'ollama serve'"
  else
    echo "✗ Ollama failed to start — see /tmp/ollama.log"
    exit 1
  fi
fi
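
# ---------------------------------------------------------------------------
# Optional sanity checks (a sketch, left commented out so startup behavior is
# unchanged). /api/generate and /api/ps are real Ollama endpoints, but the
# model tag "qwen2.5vl" and the jq field names below are assumptions —
# substitute whatever Qwen2.5-VL tag this project actually pulls, and expect
# the /api/ps response shape to drift across Ollama releases.
#
#   # warm the model so /api/ps has something to report
#   curl -s http://localhost:11434/api/generate \
#     -d '{"model": "qwen2.5vl", "prompt": "hi", "stream": false}' > /dev/null
#
#   # confirm only one model is resident and see how much VRAM it holds
#   curl -s http://localhost:11434/api/ps | jq '.models[] | {name, size_vram}'
#
#   # watch unified-memory headroom while a vision prompt is in flight
#   watch -n1 free -m
# ---------------------------------------------------------------------------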