Marcus/start_ollama.sh

#!/usr/bin/env bash
# start_ollama.sh — launch Ollama with Jetson-friendly memory settings
#
# The Jetson Orin NX has 16 GB of unified CPU+GPU memory. When Marcus + YOLO +
# Whisper + Holosoma + the camera + audio all run alongside Qwen2.5-VL,
# allocating the compute graph OOMs the llama runner, and the Linux OOM killer
# then reaps the biggest process (often Holosoma, which is a safety problem
# for locomotion).
#
# These env vars cut Ollama's memory footprint:
#   OLLAMA_FLASH_ATTENTION=1    ~30% less memory for attention tensors
#   OLLAMA_KV_CACHE_TYPE=q8_0   quantize KV cache (halves it)
#   OLLAMA_KEEP_ALIVE=2m        keep the model warm for 2 min then evict
#                               (adjust if cold-load lag matters more
#                               than idle memory)
#   OLLAMA_MAX_LOADED_MODELS=1  never hold two VL models at once
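#
# Quick sanity check once a model is loaded (a suggestion, not part of the
# original tuning; exact numbers vary with the model and context length):
#   ollama ps     # SIZE column shows the loaded model's total footprint
#   tegrastats    # overall RAM/GPU usage on the Jetson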
#
# Usage:
#   ./start_ollama.sh        # starts server in background, logs to /tmp/ollama.log
#   ./start_ollama.sh --fg   # runs in foreground (for debugging)
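
# Stop any Ollama server/runner that's already running so the new env vars
# take effect when the server is restarted below.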
pkill -f "ollama (runner|serve)" 2>/dev/null
sleep 1
export OLLAMA_FLASH_ATTENTION=1
export OLLAMA_KV_CACHE_TYPE=q8_0
export OLLAMA_KEEP_ALIVE=2m
export OLLAMA_MAX_LOADED_MODELS=1
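
# Untested extra knob (an assumption, not part of the original setup): Ollama
# also reserves KV-cache space per parallel request slot, so capping
# parallelism can trim memory further if it's still tight.
# export OLLAMA_NUM_PARALLEL=1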
if [[ "$1" == "--fg" ]]; then
echo "Running ollama in foreground..."
ollama serve
else
ollama serve > /tmp/ollama.log 2>&1 &
sleep 3
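  # Health-check the HTTP API before reporting success; /api/version only
  # responds once the server is actually listening.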
  if curl -sf http://localhost:11434/api/version > /dev/null; then
    echo "✓ Ollama started (pid $(pgrep -f 'ollama serve'))"
    echo "  logs: tail -f /tmp/ollama.log"
    echo "  stop: pkill -f 'ollama serve'"
  else
    echo "✗ Ollama failed to start — see /tmp/ollama.log"
    exit 1
  fi
fi
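
# Optional follow-up (a sketch; the exact model tag is an assumption, check
# `ollama list` for the local Qwen2.5-VL build): warm-load the model right
# after startup so the first request doesn't pay the cold-load lag, e.g.
#   ollama run qwen2.5vl "ok" >/dev/null 2>&1 &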