Marcus/start_ollama.sh

#!/usr/bin/env bash
# start_ollama.sh — launch Ollama with Jetson-friendly memory settings
#
# The Jetson Orin NX has 16 GB of unified CPU+GPU memory. When Marcus + YOLO +
# Whisper + Holosoma + the camera + audio all run alongside Qwen2.5-VL,
# allocating the compute graph OOMs the llama runner, and the Linux OOM killer
# then reaps the biggest process (often Holosoma, which is a safety problem
# for locomotion).
#
# These env vars cut Ollama's memory footprint:
#   OLLAMA_FLASH_ATTENTION=1    ~30% less memory for attention tensors
#   OLLAMA_KV_CACHE_TYPE=q8_0   quantize KV cache (halves it)
#   OLLAMA_KEEP_ALIVE=2m        keep the model warm for 2 min then evict
#                               (adjust if cold-load lag matters more
#                               than idle memory)
#   OLLAMA_MAX_LOADED_MODELS=1  never hold two VL models at once
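#
# Quick sanity check once a model is loaded (a suggestion, not part of the
# original tuning; exact numbers vary with the model and context length):
#   ollama ps     # SIZE column shows the loaded model's total footprint
#   tegrastats    # overall RAM/GPU usage on the Jetson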
#
# Usage:
#   ./start_ollama.sh        # starts server in background, logs to /tmp/ollama.log
#   ./start_ollama.sh --fg   # runs in foreground (for debugging)
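
# Stop any Ollama server/runner that's already running so the new env vars
# take effect when the server is restarted below.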
pkill -f "ollama (runner|serve)" 2>/dev/null
sleep 1
export OLLAMA_FLASH_ATTENTION=1
export OLLAMA_KV_CACHE_TYPE=q8_0
export OLLAMA_KEEP_ALIVE=2m
export OLLAMA_MAX_LOADED_MODELS=1
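
# Untested extra knob (an assumption, not part of the original setup): Ollama
# also reserves KV-cache space per parallel request slot, so capping
# parallelism can trim memory further if it's still tight.
# export OLLAMA_NUM_PARALLEL=1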
if [[ "$1" == "--fg" ]]; then
echo "Running ollama in foreground..."
ollama serve
else
ollama serve > /tmp/ollama.log 2>&1 &
sleep 3
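  # Health-check the HTTP API before reporting success; /api/version only
  # responds once the server is actually listening.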
  if curl -sf http://localhost:11434/api/version > /dev/null; then
    echo "✓ Ollama started (pid $(pgrep -f 'ollama serve'))"
    echo "  logs: tail -f /tmp/ollama.log"
    echo "  stop: pkill -f 'ollama serve'"
  else
    echo "✗ Ollama failed to start — see /tmp/ollama.log"
    exit 1
  fi
fi
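
# Optional follow-up (a sketch; the exact model tag is an assumption, check
# `ollama list` for the local Qwen2.5-VL build): warm-load the model right
# after startup so the first request doesn't pay the cold-load lag, e.g.
#   ollama run qwen2.5vl "ok" >/dev/null 2>&1 &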