Update 2026-04-22 13:28:38
parent 9991e742da
commit dc06864ec3
@@ -93,14 +93,38 @@ def call_llava(prompt: str, img_b64, num_predict: int = 200, use_history: bool =
     if img_b64:
         msg["images"] = [img_b64]
     messages.append(msg)
-    r = _client.chat(model=OLLAMA_MODEL, messages=messages,
-                     options={
-                         "temperature": 0.0,
-                         "num_predict": num_predict,
-                         "num_batch": NUM_BATCH,
-                         "num_ctx": NUM_CTX,
-                     })
-    return r["message"]["content"].strip()
+
+    # When an image is attached, pause YOLO to free iGPU memory for the
+    # vision-encoder activations (~1.5 GiB). Without this, concurrent YOLO
+    # inference + Qwen vision-encode exceeds the 15 GiB Jetson iGPU budget
+    # and the llama runner is reaped by the OOM killer (status code: 500).
+    # Text-only calls skip the pause — they fit easily and YOLO stays hot.
+    _paused = False
+    if img_b64:
+        try:
+            from API.yolo_api import yolo_pause, yolo_resume, YOLO_AVAILABLE
+            if YOLO_AVAILABLE:
+                yolo_pause()
+                _paused = True
+        except Exception:
+            pass
+
+    try:
+        r = _client.chat(model=OLLAMA_MODEL, messages=messages,
+                         options={
+                             "temperature": 0.0,
+                             "num_predict": num_predict,
+                             "num_batch": NUM_BATCH,
+                             "num_ctx": NUM_CTX,
+                         })
+        return r["message"]["content"].strip()
+    finally:
+        if _paused:
+            try:
+                from API.yolo_api import yolo_resume
+                yolo_resume()
+            except Exception:
+                pass
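The pause wrapper above frees the iGPU only for image calls and uses try/finally so YOLO always comes back, even when the chat call raises. The same guard factors naturally into a context manager; a sketch of that refactor (hypothetical helper, not part of this commit):

    from contextlib import contextmanager

    @contextmanager
    def yolo_paused_if(img_b64):
        """Pause YOLO only for image calls; always resume, even on error."""
        paused = False
        if img_b64:
            try:
                from API.yolo_api import yolo_pause, YOLO_AVAILABLE
                if YOLO_AVAILABLE:
                    yolo_pause()
                    paused = True
            except Exception:
                pass  # YOLO subsystem absent: nothing to pause
        try:
            yield
        finally:
            if paused:
                try:
                    from API.yolo_api import yolo_resume
                    yolo_resume()
                except Exception:
                    pass

call_llava would then wrap its chat call in a single `with yolo_paused_if(img_b64):` block.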
@@ -117,16 +141,28 @@ def parse_json(raw: str):


 def ask(command: str, img_b64) -> dict:
-    """Send command + camera frame to the VLM with conversation history."""
+    """
+    Send command + camera frame to the VLM.
+
+    NOTE: this path does NOT use conversation history, even though other ask_*
+    paths do. With temperature=0 (required for reliable JSON action output),
+    including the last answer in the prompt makes the model lock onto
+    repeating it — `what do you see` then always replies with whatever it saw
+    the first time, regardless of the current frame. Vision grounding has to
+    be stateless per call. Chitchat (ask_talk) keeps history because there
+    the whole point is continuity.
+    """
     if not VLM_ENABLED:
         return dict(_VLM_OFF_EMPTY)
     try:
         facts_str = ("\nKnown facts: " + "; ".join(_facts) + ".") if _facts else ""
         raw = call_llava(MAIN_PROMPT.format(command=command, facts=facts_str), img_b64,
-                         num_predict=_cfg["num_predict_main"], use_history=True)
+                         num_predict=_cfg["num_predict_main"], use_history=False)
         print(f" Raw: {raw}")
         d = parse_json(raw)
         speak = d.get("speak", raw) if d else raw
+        # Still write to history so ask_talk() has context — just don't
+        # READ from it in this path (would cause lock-on repetition).
         add_to_history(command, speak)
         if d is None:
             return {"actions": [], "arm": None, "speak": raw, "abort": None}
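call_llava's body is not fully shown in this diff, but the use_history flag presumably gates whether prior turns are spliced into the message list before the current user message. A sketch of that gating, with SYSTEM_PROMPT and _history as assumed names:

    def _build_messages(prompt: str, img_b64, use_history: bool) -> list:
        messages = [{"role": "system", "content": SYSTEM_PROMPT}]
        if use_history:
            messages.extend(_history)      # prior user/assistant turns
        msg = {"role": "user", "content": prompt}
        if img_b64:
            msg["images"] = [img_b64]      # Ollama chat API image field
        messages.append(msg)
        return messages

With use_history=False the model sees only the system prompt and the current frame, which is exactly the statelessness the docstring demands.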
@@ -17,6 +17,8 @@ def _stub_ppe(): return []
 def _stub_too_close(**k): return False
 def _stub_all(): return set()
 def _stub_fps(): return 0.0
+def _stub_pause(): return None
+def _stub_resume(): return None

 yolo_sees = _stub_sees
 yolo_count = _stub_count
@@ -26,6 +28,8 @@ yolo_ppe_violations = _stub_ppe
 yolo_person_too_close = _stub_too_close
 yolo_all_classes = _stub_all
 yolo_fps = _stub_fps
+yolo_pause = _stub_pause
+yolo_resume = _stub_resume


 def init_yolo(raw_frame_ref, frame_lock) -> bool:
@@ -33,6 +37,7 @@ def init_yolo(raw_frame_ref, frame_lock) -> bool:
     global YOLO_AVAILABLE
     global yolo_sees, yolo_count, yolo_closest, yolo_summary
     global yolo_ppe_violations, yolo_person_too_close, yolo_all_classes, yolo_fps
+    global yolo_pause, yolo_resume

     # marcus_yolo.py lives in Vision/
     models_dir = os.path.join(PROJECT_ROOT, "Vision")
@@ -44,6 +49,7 @@ def init_yolo(raw_frame_ref, frame_lock) -> bool:
         start_yolo, yolo_sees as _ys, yolo_count as _yc, yolo_closest as _ycl,
         yolo_summary as _ysu, yolo_ppe_violations as _ypp,
         yolo_person_too_close as _yptc, yolo_all_classes as _yac, yolo_fps as _yfps,
+        yolo_pause as _ypause, yolo_resume as _yresume,
     )
 except ImportError as e:
     print(f"marcus_yolo.py not found ({e})")
@@ -81,5 +87,7 @@ def init_yolo(raw_frame_ref, frame_lock) -> bool:
     yolo_person_too_close = _yptc
     yolo_all_classes = _yac
     yolo_fps = _yfps
+    yolo_pause = _ypause
+    yolo_resume = _yresume
     print(f"YOLO {'started' if ok else 'failed to start'}")
     return ok
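This stub-then-rebind pattern is also why call_llava imports yolo_pause inside the function body: a module-level `from API.yolo_api import yolo_pause` at load time would capture the stub forever, while a late import sees whatever init_yolo() has rebound by the time an image request arrives. A standalone illustration (demo names, not repo code):

    import types

    yolo_api = types.ModuleType("yolo_api")
    yolo_api.yolo_pause = lambda: "stub: no-op"         # default before init

    def on_image_request():
        # Late lookup: sees the *current* binding, not the import-time one.
        return yolo_api.yolo_pause()

    print(on_image_request())                           # stub: no-op
    yolo_api.yolo_pause = lambda: "real: pausing YOLO"  # init_yolo-style rebind
    print(on_image_request())                           # real: pausing YOLO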
@@ -3,7 +3,7 @@
     "ollama_host": "http://127.0.0.1:11434",
     "max_history": 6,
     "num_batch": 128,
-    "num_ctx": 2048,
+    "num_ctx": 1024,
     "subsystems": {
         "vlm": true,
         "lidar": true,
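Halving num_ctx halves the KV cache each loaded context pins in unified memory. A back-of-envelope estimate only, since the model's dimensions are not part of this diff; the layer and head counts below are placeholders, not qwen2.5vl's real ones:

    # KV cache ≈ 2 (K and V) × layers × kv_heads × head_dim × ctx × bytes/elt.
    # q8_0 KV (set via OLLAMA_KV_CACHE_TYPE) is roughly 1 byte per element.
    n_layers, n_kv_heads, head_dim = 36, 4, 128   # placeholder dims
    for n_ctx in (2048, 1024):
        kv_bytes = 2 * n_layers * n_kv_heads * head_dim * n_ctx * 1
        print(f"num_ctx={n_ctx}: ~{kv_bytes / 2**20:.0f} MiB")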
Data/Brain/Sessions/session_031_2026-04-22/alerts.json (new file, 1 line)
@@ -0,0 +1 @@
+[]
Data/Brain/Sessions/session_031_2026-04-22/commands.json (new file, 32 lines)
@@ -0,0 +1,32 @@
+[
+  {
+    "time": "11:18:06",
+    "cmd": "what do you see",
+    "response": "Error: llama runner process has terminated: %!w(<nil>) (status code: 500)",
+    "duration_s": 0.0
+  },
+  {
+    "time": "11:18:19",
+    "cmd": "hi",
+    "response": "Hello! I am Sanad. How can I help you?",
+    "duration_s": 0.0
+  },
+  {
+    "time": "11:18:49",
+    "cmd": "what do you see",
+    "response": "Error: llama runner process has terminated: %!w(<nil>) (status code: 500)",
+    "duration_s": 0.0
+  },
+  {
+    "time": "11:19:20",
+    "cmd": "turn left 1 step",
+    "response": "local command",
+    "duration_s": 0.0
+  },
+  {
+    "time": "11:20:40",
+    "cmd": "help/",
+    "response": "local command",
+    "duration_s": 0.0
+  }
+]
@@ -0,0 +1 @@
+[]
Data/Brain/Sessions/session_031_2026-04-22/places.json (new file, 1 line)
@@ -0,0 +1 @@
+{}
@@ -92,6 +92,12 @@ PPE_VIOLATION_CLASSES = {"no-helmet", "no_helmet", "no-vest", "no_vest"}
 _detections_lock = threading.Lock()
 _latest_detections = []  # list of dicts
 _yolo_running = [False]
+# When True, the inference loop skips model forward passes. Used by the VLM
+# path: Qwen2.5-VL's vision encode needs ~1.5 GiB of iGPU activations and the
+# Jetson's 15 GiB is shared with YOLO + Holosoma, so concurrent inference
+# spikes the runner into OOM. Pausing YOLO for the ~1 s the VLM spends on an
+# image prevents that peak. Model weights stay resident (fast resume).
+_yolo_paused = [False]
 _yolo_fps = [0.0]


@@ -279,6 +285,30 @@ def yolo_is_running() -> bool:
     return _yolo_running[0]


+def yolo_pause() -> None:
+    """
+    Stop YOLO forward passes and release PyTorch's CUDA cache back to the
+    driver so Ollama's vision encoder has contiguous iGPU memory to allocate
+    into. Weights stay resident, so resume is instant.
+    """
+    _yolo_paused[0] = True
+    try:
+        import torch
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+    except Exception:
+        pass
+
+
+def yolo_resume() -> None:
+    """Resume YOLO inference after a pause()."""
+    _yolo_paused[0] = False
+
+
+def yolo_is_paused() -> bool:
+    return _yolo_paused[0]
+
+
 def yolo_fps() -> float:
     """Return current YOLO inference FPS."""
     return _yolo_fps[0]
@@ -293,6 +323,9 @@ def _inference_loop(model, is_ppe: bool, raw_frame_ref, frame_lock,
     t_fps = time.time()

     while _yolo_running[0]:
+        if _yolo_paused[0]:
+            time.sleep(0.03)
+            continue
         with frame_lock:
             frame = raw_frame_ref[0]
         if frame is None:
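Note that yolo_pause() returns as soon as the flag is set, so a forward pass already past the check can still complete; the 30 ms poll keeps that window short and the VLM path tolerates it. If a hard handoff were ever needed, a blocking variant inside marcus_yolo.py (hypothetical, not in this commit) could confirm the loop has parked:

    import threading

    _yolo_idle = threading.Event()   # the loop would call .set() in its paused branch

    def yolo_pause_blocking(timeout: float = 1.0) -> bool:
        """Set the pause flag, then wait for the loop to confirm it parked.
        Returns False if the loop did not park within `timeout` seconds."""
        _yolo_paused[0] = True       # same flag the real yolo_pause() sets
        _yolo_idle.clear()
        return _yolo_idle.wait(timeout)

The inference loop would pair this with `_yolo_idle.set()` each paused iteration and `_yolo_idle.clear()` when it resumes work.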
check_ollama.sh (new executable file, 22 lines)
@@ -0,0 +1,22 @@
+#!/bin/bash
+echo "=== ollama arch + version ==="
+file $(which ollama); ollama --version; uname -m
+echo
+echo "=== does nvidia-smi work on Jetson? ==="
+nvidia-smi 2>&1 | head -5 || echo "NO nvidia-smi (expected on JetPack 5 — Tegra uses tegrastats)"
+echo
+echo "=== tegrastats (Jetson GPU util) — 2 s sample ==="
+timeout 2 tegrastats 2>&1 | head -2
+echo
+echo "=== Ollama 'inference compute' line — THE answer ==="
+journalctl -u ollama -n 200 --no-pager 2>/dev/null | grep -E "inference compute|vram|library=|starting runner|GPU" | tail -15
+echo
+echo "=== Ollama service env ==="
+systemctl cat ollama 2>/dev/null | grep -E "Environment|ExecStart"
+echo
+echo "=== JetPack / CUDA on this box ==="
+cat /etc/nv_tegra_release 2>/dev/null | head -1
+ls /usr/local/cuda/lib64/libcudart.so* 2>/dev/null | head -3
+echo
+echo "=== does Ollama's own lib dir exist? (stock install) ==="
+ls /usr/lib/ollama/ /usr/local/lib/ollama/ 2>/dev/null
install_ollama_jetson.sh (new executable file, 71 lines)
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+# install_ollama_jetson.sh — add Jetson memory flags to the Ollama systemd unit
+#
+# WHY THIS SCRIPT EXISTS:
+# `start_ollama.sh` only applies its OLLAMA_* env vars if you launch Ollama
+# manually — but on the Jetson, Ollama is started by systemd at boot. Systemd
+# ignores the shell script entirely, so none of the flags were ever reaching
+# the live server. This installs a drop-in that systemd merges into the unit,
+# so `systemctl restart ollama` picks up the flags. Confirmed by the log line
+# `inference compute ... library=CUDA ... available="13.8 GiB"` — GPU works,
+# but without OLLAMA_GPU_OVERHEAD Ollama claims all 13.8 GiB and the
+# vision-encode pass OOMs the runner (seen as "llama runner process has
+# terminated ... status code: 500" when an image is attached).
+#
+# Run once, from the Jetson:
+#   sudo ./install_ollama_jetson.sh
+#
+set -euo pipefail
+
+if [[ $EUID -ne 0 ]]; then
+    echo "This script must be run as root (it writes to /etc/systemd/system)." >&2
+    echo "Re-run with: sudo $0" >&2
+    exit 1
+fi
+
+DROPIN_DIR="/etc/systemd/system/ollama.service.d"
+DROPIN_FILE="$DROPIN_DIR/marcus-jetson.conf"
+
+mkdir -p "$DROPIN_DIR"
+cat > "$DROPIN_FILE" <<'EOF'
+# Jetson memory caps for Marcus — do not edit live, re-run install_ollama_jetson.sh
+[Service]
+# Flash attention: ~30% less memory for attention tensors.
+Environment="OLLAMA_FLASH_ATTENTION=1"
+# Quantize KV cache to 8-bit (halves KV memory; negligible quality loss).
+Environment="OLLAMA_KV_CACHE_TYPE=q8_0"
+# Never hold two VL models simultaneously.
+Environment="OLLAMA_MAX_LOADED_MODELS=1"
+# CRITICAL: keep the model resident essentially forever. The previous 2m value
+# meant that any pause in conversation longer than 2 min unloaded the model,
+# and the NEXT "what do you see" paid another 60-90 s cold-load. That
+# cold-load hammered unified memory + disk bandwidth hard enough to break
+# Holosoma's 20 ms real-time deadline — inference times climbed from 2 ms to
+# 153 ms and the robot fell. 24h means one cold-load per day (during
+# warmup_vlm.sh while the robot is in squat); everything after is warm.
+Environment="OLLAMA_KEEP_ALIVE=24h"
+# Reserve 3 GiB of the 15 GiB iGPU for the rest of the system (YOLO +
+# Holosoma + camera + Python heap). Raised from 2 GiB after observing
+# Holosoma starvation during image requests — 2 GiB was enough for memory
+# but not for memory-bandwidth headroom.
+Environment="OLLAMA_GPU_OVERHEAD=3221225472"
+# Deprioritize Ollama so it never preempts Holosoma's real-time locomotion
+# thread. IOSchedulingClass=idle: Ollama only gets disk bandwidth when
+# nobody else wants it. Nice=10: low CPU priority.
+IOSchedulingClass=idle
+Nice=10
+EOF
+
+chmod 644 "$DROPIN_FILE"
+echo "Wrote $DROPIN_FILE"
+echo
+
+systemctl daemon-reload
+systemctl restart ollama
+
+sleep 2
+echo "=== verification: these vars should now be in the live process ==="
+journalctl -u ollama -n 40 --no-pager | grep -E "OLLAMA_FLASH_ATTENTION|OLLAMA_KV_CACHE_TYPE|OLLAMA_GPU_OVERHEAD|inference compute|vram" | tail -10
+echo
+echo "If the 'inference compute' line shows library=CUDA and your flags appear"
+echo "in the 'server config' dump above, you're done. Try 'what do you see' again."
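OLLAMA_GPU_OVERHEAD takes a byte count; the magic numbers in this commit are exact GiB multiples:

    print(3 * 2**30)   # 3221225472 — the 3 GiB used by the systemd drop-in
    print(2 * 2**30)   # 2147483648 — the 2 GiB still in start_ollama.sh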
@@ -25,6 +25,13 @@ export OLLAMA_FLASH_ATTENTION=1
 export OLLAMA_KV_CACHE_TYPE=q8_0
 export OLLAMA_KEEP_ALIVE=2m
 export OLLAMA_MAX_LOADED_MODELS=1
+# Reserve 2 GiB of the Jetson's 15 GiB iGPU for the rest of the system
+# (YOLO in CUDA FP16, Holosoma, camera, Python heap). Without this, Ollama
+# assumes the full 13.8 GiB "available" is its to use and sizes its compute
+# graph that way — which works for text, but the vision-encode pass of
+# Qwen2.5-VL then pushes total allocation past physical memory and the
+# runner dies with status 500.
+export OLLAMA_GPU_OVERHEAD=2147483648

 if [[ "$1" == "--fg" ]]; then
     echo "Running ollama in foreground..."
warmup_vlm.sh (new executable file, 82 lines)
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+# warmup_vlm.sh — pre-load Qwen2.5-VL into the iGPU BEFORE the robot stands up.
+#
+# WHY THIS EXISTS:
+# Cold-loading a 3 GB vision model on the Jetson's 16 GB unified memory
+# hammers disk + memory bandwidth hard enough to blow Holosoma's 20 ms
+# real-time deadline. If that happens while the robot is standing, Holosoma
+# loses balance control and the robot falls (observed 2026-04-22 —
+# RL FPS crashed from 56 → 10 during cold-load, OOM killer reaped Holosoma).
+#
+# SAFE OPERATING PROCEDURE (do this once per boot):
+#   1. Boot robot. Keep it in squat / damping mode (NOT standing).
+#   2. Run: ./warmup_vlm.sh
+#   3. Wait for "Warmup complete" (~60-90 s).
+#   4. NOW raise the robot to standing.
+#   5. Run: python3 run_marcus.py
+#
+# After warmup, OLLAMA_KEEP_ALIVE=24h keeps the model hot for the rest of the
+# day, so every "what do you see" is fast (~2-5 s) and Holosoma stays
+# healthy. Never cold-load a vision model while the robot is standing.
+
+set -e
+
+MODEL="${1:-qwen2.5vl:3b}"
+HOST="http://127.0.0.1:11434"
+
+cat <<'BANNER'
+════════════════════════════════════════════════════════════════════
+  VLM WARMUP — put the robot in SQUAT / DAMPING mode first!
+════════════════════════════════════════════════════════════════════
+
+The next 60-90 s will hammer disk + memory bandwidth. Holosoma
+cannot balance through this. If the robot is standing, IT WILL FALL.
+
+Press ENTER when the robot is safely in squat / damping.
+Press Ctrl-C to abort.
+
+BANNER
+read -r
+
+# Sanity: is ollama reachable?
+if ! curl -sf "$HOST/api/version" > /dev/null; then
+    echo "✗ Ollama is not running on $HOST"
+    echo "  Start it: sudo systemctl start ollama"
+    exit 1
+fi
+
+# Sanity: is the model in the store?
+if ! ollama list | awk 'NR>1 {print $1}' | grep -qx "$MODEL"; then
+    echo "✗ Model '$MODEL' not found in Ollama store"
+    echo "  Pull it: ollama pull $MODEL"
+    exit 1
+fi
+
+echo "→ Cold-loading $MODEL into iGPU (60-90 s, do not interrupt)..."
+echo "  Watching tegrastats in a second window is useful — GPU should spike."
+echo
+
+START=$(date +%s)
+
+# Send a tiny text-only request. Ollama loads the model on first request;
+# we don't need vision here — just getting the weights resident is what
+# takes the time. A text-only warmup also avoids needing a camera frame.
+#
+# keep_alive=24h matches the systemd config and prevents the server from
+# unloading the model after the default 5 min.
+RESPONSE=$(curl -s "$HOST/api/generate" \
+    -H 'Content-Type: application/json' \
+    -d "{\"model\":\"$MODEL\",\"prompt\":\"ok\",\"stream\":false,\"keep_alive\":\"24h\",\"options\":{\"num_predict\":1,\"num_batch\":128,\"num_ctx\":1024}}")
+
+END=$(date +%s)
+ELAPSED=$((END - START))
+
+if echo "$RESPONSE" | grep -q '"response"'; then
+    echo "✓ Warmup complete in ${ELAPSED}s — model is resident for 24h"
+    echo "  You can now stand the robot and run: python3 run_marcus.py"
+else
+    echo "✗ Warmup failed after ${ELAPSED}s"
+    echo "  Response: $RESPONSE"
+    echo "  Check: journalctl -u ollama --since '2 minutes ago' --no-pager | tail -40"
+    exit 1
+fi
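The same warmup can be issued from Python with the ollama client the repo already uses elsewhere; the model tag and dict-style response access below mirror the repo's conventions and are assumptions, not part of this commit:

    import ollama

    client = ollama.Client(host="http://127.0.0.1:11434")
    r = client.generate(model="qwen2.5vl:3b", prompt="ok",
                        keep_alive="24h",   # match the systemd drop-in
                        options={"num_predict": 1, "num_batch": 128, "num_ctx": 1024})
    print(r["response"])   # any non-error response means the weights are resident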