commit f4ff5c27fa (parent dc06864ec3)
Update 2026-04-22 13:47:14
@@ -2,7 +2,7 @@
     "ollama_model": "qwen2.5vl:3b",
     "ollama_host": "http://127.0.0.1:11434",
     "max_history": 6,
-    "num_batch": 128,
+    "num_batch": 64,
     "num_ctx": 1024,
     "subsystems": {
         "vlm": true,
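Halving num_batch roughly halves the prompt tokens processed per forward pass, trading slower prompt ingest for lower peak draw on the shared iGPU memory bandwidth. Both keys are standard Ollama request options; a minimal sketch of how a client passes them per request (placeholder prompt, not Marcus's real one):

curl -s http://127.0.0.1:11434/api/generate -d '{
    "model": "qwen2.5vl:3b",
    "prompt": "Reply with OK.",
    "stream": false,
    "options": {"num_batch": 64, "num_ctx": 1024}
}'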
@@ -44,11 +44,18 @@ Environment="OLLAMA_MAX_LOADED_MODELS=1"
 # 153 ms and the robot fell. 24h means one cold-load per day (during
 # warmup_vlm.sh while the robot is in squat), everything after is warm.
 Environment="OLLAMA_KEEP_ALIVE=24h"
-# Reserve 3 GiB of the 15 GiB iGPU for the rest of the system (YOLO +
-# Holosoma + camera + Python heap). Raised from 2 GiB after observing
-# Holosoma starvation during image requests — 2 GiB was enough for memory
-# but not for memory-bandwidth headroom.
-Environment="OLLAMA_GPU_OVERHEAD=3221225472"
+# Reserve 4 GiB of the 15 GiB iGPU for the rest of the system (YOLO +
+# Holosoma + camera + Python heap + vision-encoder transient spike).
+# Raised from 3 GiB after observing the robot wobble/fall during an
+# `ask()` call — the vision encoder's peak activations were still eating
+# into Holosoma's slack memory.
+Environment="OLLAMA_GPU_OVERHEAD=4294967296"
+# Cap the model's context window at the server level so Ollama's compute
+# graph pre-allocation shrinks from 7.3 GiB (KvSize:2048 default) to
+# ~3-4 GiB (KvSize:1024). Runtime num_ctx in request options is ignored
+# for compute-graph sizing — only this env var affects the load-time
+# allocation. 1024 tokens is enough for our prompts + short answers.
+Environment="OLLAMA_CONTEXT_LENGTH=1024"
 # Deprioritize Ollama so it never preempts Holosoma's real-time locomotion
 # thread. IOSchedulingClass=idle = Ollama only gets disk bandwidth when
 # nobody else wants it. Nice=10 = lowest normal priority.
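Both overhead values are exact binary multiples (3 GiB and 4 GiB). A quick arithmetic check, plus the reload/restart a systemd drop-in needs before new env vars reach the daemon (assuming the stock ollama.service unit name):

echo $((3 * 1024 * 1024 * 1024))   # 3221225472 = 3 GiB
echo $((4 * 1024 * 1024 * 1024))   # 4294967296 = 4 GiB

sudo systemctl daemon-reload                   # re-read the edited unit
sudo systemctl restart ollama                  # env applies only to a fresh process
systemctl show ollama --property=Environment   # verify what systemd passes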
@@ -19,6 +19,17 @@ PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
 if PROJECT_ROOT not in sys.path:
     sys.path.insert(0, PROJECT_ROOT)
 
+# Tell the Linux OOM killer to pick Ollama (easily restarted) or any other
+# process before Marcus. -900 is almost-but-not-quite OOM-immune; we don't
+# use -1000 because that disables OOM handling entirely, which is risky if
+# Marcus ever had a runaway allocation. Lowering oom_score_adj below its
+# current value needs CAP_SYS_RESOURCE; without it the write just fails.
+try:
+    with open(f"/proc/{os.getpid()}/oom_score_adj", "w") as _f:
+        _f.write("-900")
+except OSError:
+    pass  # not fatal: unprivileged or sandboxed; OOM priority stays default
+
 from Brain.marcus_brain import run_terminal
 
 if __name__ == "__main__":
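Because the write is swallowed on failure, it is worth confirming on the robot that the adjustment actually stuck; a quick shell check (the pgrep pattern is a guess at the process name, adjust as needed):

# Prints -900 if the write took effect, 0 if it was silently refused.
cat "/proc/$(pgrep -f run_marcus.py | head -n1)/oom_score_adj"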
@@ -24,20 +24,6 @@ set -e
 MODEL="${1:-qwen2.5vl:3b}"
 HOST="http://127.0.0.1:11434"
 
-cat <<'BANNER'
-════════════════════════════════════════════════════════════════════
-  VLM WARMUP — put the robot in SQUAT / DAMPING mode first!
-════════════════════════════════════════════════════════════════════
-
-  The next 60-90 s will hammer disk + memory bandwidth. Holosoma
-  cannot balance through this. If the robot is standing, IT WILL FALL.
-
-  Press ENTER when the robot is safely in squat / damping.
-  Press Ctrl-C to abort.
-
-BANNER
-read -r
-
 # Sanity: is ollama reachable?
 if ! curl -sf "$HOST/api/version" > /dev/null; then
     echo "✗ Ollama is not running on $HOST"
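/api/version is the cheapest liveness endpoint Ollama exposes, and `curl -f` turns any HTTP error into a non-zero exit status, so the check needs no output parsing. For reference (version string illustrative):

$ curl -s http://127.0.0.1:11434/api/version
{"version":"0.6.5"}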
@@ -52,9 +38,35 @@ if ! ollama list | awk 'NR>1 {print $1}' | grep -qx "$MODEL"; then
     exit 1
 fi
 
-echo "→ Cold-loading $MODEL into iGPU (60-90 s, do not interrupt)..."
-echo "  Watching tegrastats in a second window is useful — GPU should spike."
-echo
+# Is the model ALREADY resident? `ollama ps` lists running models with their
+# keep-alive. If qwen2.5vl:3b is there, a ping is ~1 s and the robot doesn't
+# need to squat for it. Only ask the user to squat if we're about to do the
+# real 60-90 s disk + iGPU hammer.
+ALREADY_WARM=0
+if ollama ps 2>/dev/null | awk 'NR>1 {print $1}' | grep -qx "$MODEL"; then
+    ALREADY_WARM=1
+fi
+
+if [[ $ALREADY_WARM -eq 1 ]]; then
+    echo "→ $MODEL is already resident (keep-alive). Pinging to confirm..."
+else
+    cat <<'BANNER'
+════════════════════════════════════════════════════════════════════
+  VLM WARMUP — put the robot in SQUAT / DAMPING mode first!
+════════════════════════════════════════════════════════════════════
+
+  The next 60-90 s will hammer disk + memory bandwidth. Holosoma
+  cannot balance through this. If the robot is standing, IT WILL FALL.
+
+  Press ENTER when the robot is safely in squat / damping.
+  Press Ctrl-C to abort.
+
+BANNER
+    read -r
+    echo "→ Cold-loading $MODEL into iGPU (60-90 s, do not interrupt)..."
+    echo "  Watching tegrastats in a second window is useful — GPU should spike."
+    echo
+fi
 
 START=$(date +%s)
 
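The awk/grep pair is keyed to the tabular `ollama ps` output, roughly the shape below in current releases (values illustrative): NR>1 skips the header row, and `grep -qx` demands an exact whole-line match on the NAME column, so a similarly named tag cannot false-positive.

$ ollama ps
NAME            ID          SIZE     PROCESSOR    UNTIL
qwen2.5vl:3b    <digest>    ~6 GB    100% GPU     23 hours from now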
@@ -72,8 +84,12 @@ END=$(date +%s)
 ELAPSED=$((END - START))
 
 if echo "$RESPONSE" | grep -q '"response"'; then
-    echo "✓ Warmup complete in ${ELAPSED}s — model is resident for 24h"
-    echo "  You can now stand the robot and run: python3 run_marcus.py"
+    if [[ $ALREADY_WARM -eq 1 ]]; then
+        echo "✓ Model already warm — no cold-load needed (${ELAPSED}s ping)"
+    else
+        echo "✓ Warmup complete in ${ELAPSED}s — model is resident for 24h"
+        echo "  You can now stand the robot and run: python3 run_marcus.py"
+    fi
 else
     echo "✗ Warmup failed after ${ELAPSED}s"
     echo "  Response: $RESPONSE"
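$RESPONSE is produced by the generate call between these two hunks (unchanged, so not shown in the diff); presumably a minimal non-streaming ping along these lines, which performs the real cold-load on a first run and merely confirms residency on a warm one:

RESPONSE=$(curl -s "$HOST/api/generate" -d "{
    \"model\": \"$MODEL\",
    \"prompt\": \"Reply with OK.\",
    \"stream\": false
}")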