#!/usr/bin/env bash
# warmup_vlm.sh — pre-load Qwen2.5-VL into iGPU BEFORE the robot stands up.
#
# WHY THIS EXISTS:
#   Cold-loading a 3 GB vision model on the Jetson's 16 GB unified memory
#   hammers disk + memory bandwidth hard enough to blow Holosoma's 20 ms
#   real-time deadline. If that happens while the robot is standing, Holosoma
#   loses balance control and the robot falls (observed 2026-04-22 —
#   RL FPS crashed from 56 → 10 during cold-load, OOM killer reaped Holosoma).
#
# SAFE OPERATING PROCEDURE (do this once per boot):
#   1. Boot robot. Keep it in squat / damping mode (NOT standing).
#   2. Run: ./warmup_vlm.sh
#   3. Wait for "Warmup complete" (~60-90 s).
#   4. NOW raise the robot to standing.
#   5. Run: python3 run_marcus.py
#
# After warmup, OLLAMA_KEEP_ALIVE=24h keeps the model hot for the rest of the
# day, so every "what do you see" is fast (~2-5 s) and Holosoma stays
# healthy. Never cold-load a vision model while the robot is standing.
#
# Usage: ./warmup_vlm.sh [model-tag]   (default: qwen2.5vl:3b)

set -euo pipefail

readonly MODEL="${1:-qwen2.5vl:3b}"
readonly HOST="http://127.0.0.1:11434"

# Sanity: is ollama reachable?
if ! curl -sf "$HOST/api/version" > /dev/null; then
  echo "✗ Ollama is not running on $HOST"
  echo "  Start it: sudo systemctl start ollama"
  exit 1
fi

# Sanity: is the model in the store?
# -F: the tag contains dots ("qwen2.5vl:3b") which are regex metacharacters;
# match it as a fixed string, exact whole line (-x).
if ! ollama list | awk 'NR>1 {print $1}' | grep -Fqx -- "$MODEL"; then
  echo "✗ Model '$MODEL' not found in Ollama store"
  echo "  Pull it: ollama pull $MODEL"
  exit 1
fi

# Is the model ALREADY resident? `ollama ps` lists running models with their
# keep-alive. If qwen2.5vl:3b is there, a ping is ~1 s and the robot doesn't
# need to squat for it. Only ask the user to squat if we're about to do the
# real 60-90 s disk + iGPU hammer.
ALREADY_WARM=0
if ollama ps 2>/dev/null | awk 'NR>1 {print $1}' | grep -Fqx -- "$MODEL"; then
  ALREADY_WARM=1
fi

if [[ $ALREADY_WARM -eq 1 ]]; then
  echo "→ $MODEL is already resident (keep-alive). Pinging to confirm..."
else
  cat <<'BANNER'
════════════════════════════════════════════════════════════════════
  VLM WARMUP — put the robot in SQUAT / DAMPING mode first!
════════════════════════════════════════════════════════════════════
  The next 60-90 s will hammer disk + memory bandwidth. Holosoma
  cannot balance through this. If the robot is standing, IT WILL FALL.

  Press ENTER when the robot is safely in squat / damping.
  Press Ctrl-C to abort.
BANNER
  read -r
  echo "→ Cold-loading $MODEL into iGPU (60-90 s, do not interrupt)..."
  echo "  Watching tegrastats in a second window is useful — GPU should spike."
  echo
fi

START=$(date +%s)

# Send a tiny text-only request. Ollama loads the model on first request;
# we don't need vision here — just getting weights resident is what takes the
# long time. A text-only warmup also avoids needing a camera frame.
#
# keep_alive=24h matches the systemd config and prevents the server from
# unloading the model after the default 5 min.
#
# NOTE: do NOT let `set -e` abort on a curl transport failure here — we want
# to fall through to the diagnostic branch below, which prints the (empty)
# response and the journalctl hint instead of dying silently.
RESPONSE=$(curl -s "$HOST/api/generate" \
  -H 'Content-Type: application/json' \
  -d "{\"model\":\"$MODEL\",\"prompt\":\"ok\",\"stream\":false,\"keep_alive\":\"24h\",\"options\":{\"num_predict\":1,\"num_batch\":128,\"num_ctx\":1024}}") \
  || RESPONSE=""

END=$(date +%s)
ELAPSED=$((END - START))

# A successful /api/generate reply always carries a "response" field;
# error replies carry "error" instead.
if echo "$RESPONSE" | grep -q '"response"'; then
  if [[ $ALREADY_WARM -eq 1 ]]; then
    echo "✓ Model already warm — no cold-load needed (${ELAPSED}s ping)"
  else
    echo "✓ Warmup complete in ${ELAPSED}s — model is resident for 24h"
    echo "  You can now stand the robot and run: python3 run_marcus.py"
  fi
else
  echo "✗ Warmup failed after ${ELAPSED}s"
  echo "  Response: $RESPONSE"
  echo "  Check: journalctl -u ollama --since '2 minutes ago' --no-pager | tail -40"
  exit 1
fi