Update 2026-04-22 13:28:38
This commit is contained in:
parent
9991e742da
commit
dc06864ec3
@ -93,6 +93,23 @@ def call_llava(prompt: str, img_b64, num_predict: int = 200, use_history: bool =
|
||||
if img_b64:
|
||||
msg["images"] = [img_b64]
|
||||
messages.append(msg)
|
||||
|
||||
# When an image is attached, pause YOLO to free iGPU memory for the
|
||||
# vision-encoder activations (~1.5 GiB). Without this, concurrent YOLO
|
||||
# inference + Qwen vision-encode exceeds the 15 GiB Jetson iGPU budget
|
||||
# and the llama runner is reaped by the OOM killer (status code: 500).
|
||||
# Text-only calls skip the pause — they fit easily and YOLO stays hot.
|
||||
_paused = False
|
||||
if img_b64:
|
||||
try:
|
||||
from API.yolo_api import yolo_pause, yolo_resume, YOLO_AVAILABLE
|
||||
if YOLO_AVAILABLE:
|
||||
yolo_pause()
|
||||
_paused = True
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
r = _client.chat(model=OLLAMA_MODEL, messages=messages,
|
||||
options={
|
||||
"temperature": 0.0,
|
||||
@ -101,6 +118,13 @@ def call_llava(prompt: str, img_b64, num_predict: int = 200, use_history: bool =
|
||||
"num_ctx": NUM_CTX,
|
||||
})
|
||||
return r["message"]["content"].strip()
|
||||
finally:
|
||||
if _paused:
|
||||
try:
|
||||
from API.yolo_api import yolo_resume
|
||||
yolo_resume()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def parse_json(raw: str):
|
||||
@ -117,16 +141,28 @@ def parse_json(raw: str):
|
||||
|
||||
|
||||
def ask(command: str, img_b64) -> dict:
|
||||
"""Send command + camera frame to the VLM with conversation history."""
|
||||
"""
|
||||
Send command + camera frame to the VLM.
|
||||
|
||||
NOTE: this path does NOT use conversation history, even though other ask_*
|
||||
paths do. With temperature=0 (required for reliable JSON action output),
|
||||
including the last answer in the prompt makes the model lock onto
|
||||
repeating it — `what do you see` then always replies with whatever it saw
|
||||
the first time, regardless of the current frame. Vision grounding has to
|
||||
be stateless per call. Chitchat (ask_talk) keeps history because there
|
||||
the whole point is continuity.
|
||||
"""
|
||||
if not VLM_ENABLED:
|
||||
return dict(_VLM_OFF_EMPTY)
|
||||
try:
|
||||
facts_str = ("\nKnown facts: " + "; ".join(_facts) + ".") if _facts else ""
|
||||
raw = call_llava(MAIN_PROMPT.format(command=command, facts=facts_str), img_b64,
|
||||
num_predict=_cfg["num_predict_main"], use_history=True)
|
||||
num_predict=_cfg["num_predict_main"], use_history=False)
|
||||
print(f" Raw: {raw}")
|
||||
d = parse_json(raw)
|
||||
speak = d.get("speak", raw) if d else raw
|
||||
# Still write to history so ask_talk() has context — just don't
|
||||
# READ from it in this path (would cause lock-on repetition).
|
||||
add_to_history(command, speak)
|
||||
if d is None:
|
||||
return {"actions": [], "arm": None, "speak": raw, "abort": None}
|
||||
|
||||
@ -17,6 +17,8 @@ def _stub_ppe(): return []
|
||||
def _stub_too_close(**k): return False
|
||||
def _stub_all(): return set()
|
||||
def _stub_fps(): return 0.0
|
||||
def _stub_pause(): return None
|
||||
def _stub_resume(): return None
|
||||
|
||||
yolo_sees = _stub_sees
|
||||
yolo_count = _stub_count
|
||||
@ -26,6 +28,8 @@ yolo_ppe_violations = _stub_ppe
|
||||
yolo_person_too_close = _stub_too_close
|
||||
yolo_all_classes = _stub_all
|
||||
yolo_fps = _stub_fps
|
||||
yolo_pause = _stub_pause
|
||||
yolo_resume = _stub_resume
|
||||
|
||||
|
||||
def init_yolo(raw_frame_ref, frame_lock) -> bool:
|
||||
@ -33,6 +37,7 @@ def init_yolo(raw_frame_ref, frame_lock) -> bool:
|
||||
global YOLO_AVAILABLE
|
||||
global yolo_sees, yolo_count, yolo_closest, yolo_summary
|
||||
global yolo_ppe_violations, yolo_person_too_close, yolo_all_classes, yolo_fps
|
||||
global yolo_pause, yolo_resume
|
||||
|
||||
# marcus_yolo.py lives in Vision/
|
||||
models_dir = os.path.join(PROJECT_ROOT, "Vision")
|
||||
@ -44,6 +49,7 @@ def init_yolo(raw_frame_ref, frame_lock) -> bool:
|
||||
start_yolo, yolo_sees as _ys, yolo_count as _yc, yolo_closest as _ycl,
|
||||
yolo_summary as _ysu, yolo_ppe_violations as _ypp,
|
||||
yolo_person_too_close as _yptc, yolo_all_classes as _yac, yolo_fps as _yfps,
|
||||
yolo_pause as _ypause, yolo_resume as _yresume,
|
||||
)
|
||||
except ImportError as e:
|
||||
print(f"marcus_yolo.py not found ({e})")
|
||||
@ -81,5 +87,7 @@ def init_yolo(raw_frame_ref, frame_lock) -> bool:
|
||||
yolo_person_too_close = _yptc
|
||||
yolo_all_classes = _yac
|
||||
yolo_fps = _yfps
|
||||
yolo_pause = _ypause
|
||||
yolo_resume = _yresume
|
||||
print(f"YOLO {'started' if ok else 'failed to start'}")
|
||||
return ok
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
"ollama_host": "http://127.0.0.1:11434",
|
||||
"max_history": 6,
|
||||
"num_batch": 128,
|
||||
"num_ctx": 2048,
|
||||
"num_ctx": 1024,
|
||||
"subsystems": {
|
||||
"vlm": true,
|
||||
"lidar": true,
|
||||
|
||||
1
Data/Brain/Sessions/session_031_2026-04-22/alerts.json
Normal file
1
Data/Brain/Sessions/session_031_2026-04-22/alerts.json
Normal file
@ -0,0 +1 @@
|
||||
[]
|
||||
32
Data/Brain/Sessions/session_031_2026-04-22/commands.json
Normal file
32
Data/Brain/Sessions/session_031_2026-04-22/commands.json
Normal file
@ -0,0 +1,32 @@
|
||||
[
|
||||
{
|
||||
"time": "11:18:06",
|
||||
"cmd": "what do you see",
|
||||
"response": "Error: llama runner process has terminated: %!w(<nil>) (status code: 500)",
|
||||
"duration_s": 0.0
|
||||
},
|
||||
{
|
||||
"time": "11:18:19",
|
||||
"cmd": "hi",
|
||||
"response": "Hello! I am Sanad. How can I help you?",
|
||||
"duration_s": 0.0
|
||||
},
|
||||
{
|
||||
"time": "11:18:49",
|
||||
"cmd": "what do you see",
|
||||
"response": "Error: llama runner process has terminated: %!w(<nil>) (status code: 500)",
|
||||
"duration_s": 0.0
|
||||
},
|
||||
{
|
||||
"time": "11:19:20",
|
||||
"cmd": "turn left 1 step",
|
||||
"response": "local command",
|
||||
"duration_s": 0.0
|
||||
},
|
||||
{
|
||||
"time": "11:20:40",
|
||||
"cmd": "help/",
|
||||
"response": "local command",
|
||||
"duration_s": 0.0
|
||||
}
|
||||
]
|
||||
@ -0,0 +1 @@
|
||||
[]
|
||||
1
Data/Brain/Sessions/session_031_2026-04-22/places.json
Normal file
1
Data/Brain/Sessions/session_031_2026-04-22/places.json
Normal file
@ -0,0 +1 @@
|
||||
{}
|
||||
@ -92,6 +92,12 @@ PPE_VIOLATION_CLASSES = {"no-helmet", "no_helmet", "no-vest", "no_vest"}
|
||||
_detections_lock = threading.Lock()
|
||||
_latest_detections = [] # list of dicts
|
||||
_yolo_running = [False]
|
||||
# When True, the inference loop skips model forward passes. Used by the VLM
|
||||
# path: Qwen2.5-VL's vision encode needs ~1.5 GiB of iGPU activations and the
|
||||
# Jetson's 15 GiB is shared with YOLO + Holosoma, so concurrent inference
|
||||
# spikes the runner into OOM. Pausing YOLO for the ~1 s the VLM spends on an
|
||||
# image prevents that peak. Model weights stay resident (fast resume).
|
||||
_yolo_paused = [False]
|
||||
_yolo_fps = [0.0]
|
||||
|
||||
|
||||
@ -279,6 +285,30 @@ def yolo_is_running() -> bool:
|
||||
return _yolo_running[0]
|
||||
|
||||
|
||||
def yolo_pause() -> None:
    """
    Ask the inference loop to stop running forward passes, then hand
    PyTorch's cached CUDA blocks back to the driver so another process
    (the VLM's vision encoder) can allocate them.

    Only the pause flag and the allocator cache are touched; the model
    itself is not unloaded here.

    NOTE(review): the flag is polled at the top of each inference-loop
    iteration, so a forward pass already in flight is not waited for —
    confirm whether callers need a handshake before starting the VLM.
    NOTE(review): pause/resume is a single flag, not a counter; nested
    pause() calls are undone by the first resume().
    """
    _yolo_paused[0] = True
    try:
        import torch

        cuda = torch.cuda
        if not cuda.is_available():
            return
        cuda.empty_cache()
    except Exception:
        # Best effort: a missing torch or a CUDA error must never stop the
        # caller from proceeding with the pause flag already set.
        return
|
||||
|
||||
|
||||
def yolo_resume() -> None:
    """Clear the pause flag so the inference loop runs forward passes again."""
    _yolo_paused[0] = False
|
||||
|
||||
|
||||
def yolo_is_paused() -> bool:
    """Return True while the pause flag set by yolo_pause() is still raised."""
    paused = _yolo_paused[0]
    return paused
|
||||
|
||||
|
||||
def yolo_fps() -> float:
    """Return the most recently recorded YOLO inference rate, in frames/s."""
    current = _yolo_fps[0]
    return current
|
||||
@ -293,6 +323,9 @@ def _inference_loop(model, is_ppe: bool, raw_frame_ref, frame_lock,
|
||||
t_fps = time.time()
|
||||
|
||||
while _yolo_running[0]:
|
||||
if _yolo_paused[0]:
|
||||
time.sleep(0.03)
|
||||
continue
|
||||
with frame_lock:
|
||||
frame = raw_frame_ref[0]
|
||||
if frame is None:
|
||||
|
||||
22
check_ollama.sh
Executable file
22
check_ollama.sh
Executable file
@ -0,0 +1,22 @@
|
||||
#!/bin/bash
# check_ollama.sh — read-only diagnostics for the Ollama install on the Jetson.
# Prints the binary architecture/version, GPU tooling status, the relevant
# Ollama journal lines, the systemd unit environment, and the JetPack/CUDA
# libraries present. Changes nothing on the system.

echo "=== ollama arch + version ==="
# Resolve the binary first: an unquoted `file $(which ollama)` degenerates to a
# bare `file` (usage error) when ollama is not on PATH.
if ollama_bin=$(command -v ollama); then
    file "$ollama_bin"
    ollama --version
else
    echo "ollama not found in PATH"
fi
uname -m
echo

echo "=== does nvidia-smi work on Jetson? ==="
# The fallback note has to key off nvidia-smi's own exit status. In a
# `nvidia-smi | head -5 || echo ...` pipeline the status is head's (always 0),
# so the note would never be printed.
smi_status=0
smi_out=$(nvidia-smi 2>&1) || smi_status=$?
printf '%s\n' "$smi_out" | head -5
if [[ $smi_status -ne 0 ]]; then
    echo "NO nvidia-smi (expected on JetPack 5 — Tegra uses tegrastats)"
fi
echo

echo "=== tegrastats (Jetson GPU util) — 2 s sample ==="
timeout 2 tegrastats 2>&1 | head -2
echo

echo "=== Ollama 'inference compute' line — THE answer ==="
journalctl -u ollama -n 200 --no-pager 2>/dev/null \
    | grep -E "inference compute|vram|library=|starting runner|GPU" \
    | tail -15
echo

echo "=== Ollama service env ==="
systemctl cat ollama 2>/dev/null | grep -E "Environment|ExecStart"
echo

echo "=== JetPack / CUDA on this box ==="
head -1 /etc/nv_tegra_release 2>/dev/null
ls /usr/local/cuda/lib64/libcudart.so* 2>/dev/null | head -3
echo

echo "=== does Ollama's own lib dir exist? (stock install) ==="
ls /usr/lib/ollama/ /usr/local/lib/ollama/ 2>/dev/null
|
||||
71
install_ollama_jetson.sh
Executable file
71
install_ollama_jetson.sh
Executable file
@ -0,0 +1,71 @@
|
||||
#!/usr/bin/env bash
# install_ollama_jetson.sh — add Jetson memory flags to the Ollama systemd unit
#
# WHY THIS SCRIPT EXISTS:
# `start_ollama.sh` only applies its OLLAMA_* env vars if you launch Ollama
# manually — but on the Jetson, Ollama is started by systemd at boot. Systemd
# ignores the shell script entirely, so none of the flags were ever reaching
# the live server. This installs a drop-in that systemd merges into the unit,
# so `systemctl restart ollama` picks up the flags. Confirmed by the log line
# `inference compute ... library=CUDA ... available="13.8 GiB"` — GPU works,
# but without OLLAMA_GPU_OVERHEAD Ollama claims all 13.8 GiB and the
# vision-encode pass OOMs the runner (seen as "llama runner process has
# terminated ... status code: 500" when an image is attached).
#
# Run once, from the Jetson:
#   sudo ./install_ollama_jetson.sh
#
set -euo pipefail

if [[ $EUID -ne 0 ]]; then
    echo "This script must be run as root (it writes to /etc/systemd/system)." >&2
    echo "Re-run with: sudo $0" >&2
    exit 1
fi

DROPIN_DIR="/etc/systemd/system/ollama.service.d"
DROPIN_FILE="$DROPIN_DIR/marcus-jetson.conf"

mkdir -p "$DROPIN_DIR"
cat > "$DROPIN_FILE" <<'EOF'
# Jetson memory caps for Marcus — do not edit live, re-run install_ollama_jetson.sh
[Service]
# Flash attention: ~30% less memory for attention tensors.
Environment="OLLAMA_FLASH_ATTENTION=1"
# Quantize KV cache to 8-bit (halves KV memory; negligible quality loss).
Environment="OLLAMA_KV_CACHE_TYPE=q8_0"
# Never hold two VL models simultaneously.
Environment="OLLAMA_MAX_LOADED_MODELS=1"
# CRITICAL: keep the model resident essentially forever. The previous 2m value
# meant that any pause in conversation longer than 2 min unloaded the model,
# and the NEXT "what do you see" paid another 60-90 s cold-load. That
# cold-load hammered unified memory + disk bandwidth hard enough to break
# Holosoma's 20 ms real-time deadline — inference times climbed from 2 ms to
# 153 ms and the robot fell. 24h means one cold-load per day (during
# warmup_vlm.sh while the robot is in squat), everything after is warm.
Environment="OLLAMA_KEEP_ALIVE=24h"
# Reserve 3 GiB of the 15 GiB iGPU for the rest of the system (YOLO +
# Holosoma + camera + Python heap). Raised from 2 GiB after observing
# Holosoma starvation during image requests — 2 GiB was enough for memory
# but not for memory-bandwidth headroom.
Environment="OLLAMA_GPU_OVERHEAD=3221225472"
# Deprioritize Ollama so it never preempts Holosoma's real-time locomotion
# thread. IOSchedulingClass=idle = Ollama only gets disk bandwidth when
# nobody else wants it. Nice=10 = lower CPU priority than the default of 0
# (19 would be the lowest).
IOSchedulingClass=idle
Nice=10
EOF

chmod 644 "$DROPIN_FILE"
echo "Wrote $DROPIN_FILE"
echo

systemctl daemon-reload
systemctl restart ollama

# Give the service a moment to log its startup configuration.
sleep 2
echo "=== verification: these vars should now be in the live process ==="
# Under `set -euo pipefail` a grep with no matches would fail the pipeline and
# abort the script right here, hiding the instructions below. The `||` branch
# keeps verification advisory: the drop-in is already installed at this point.
journalctl -u ollama -n 40 --no-pager \
    | grep -E "OLLAMA_FLASH_ATTENTION|OLLAMA_KV_CACHE_TYPE|OLLAMA_GPU_OVERHEAD|inference compute|vram" \
    | tail -10 \
    || echo "(no matching log lines yet — re-check with: journalctl -u ollama -n 40 --no-pager)"
echo
echo "If the 'inference compute' line shows library=CUDA and your flags appear"
echo "in the 'server config' dump above, you're done. Try 'what do you see' again."
|
||||
@ -25,6 +25,13 @@ export OLLAMA_FLASH_ATTENTION=1
|
||||
export OLLAMA_KV_CACHE_TYPE=q8_0
|
||||
export OLLAMA_KEEP_ALIVE=2m
|
||||
export OLLAMA_MAX_LOADED_MODELS=1
|
||||
# Reserve 2 GiB of the Jetson's 15 GiB iGPU for the rest of the system
|
||||
# (YOLO in CUDA FP16, Holosoma, camera, Python heap). Without this, Ollama
|
||||
# assumes the full 13.8 GiB "available" is its to use and sizes its compute
|
||||
# graph that way — which works for text, but the vision-encode pass of
|
||||
# Qwen2.5-VL then pushes total allocation past physical memory and the
|
||||
# runner dies with status 500.
|
||||
export OLLAMA_GPU_OVERHEAD=2147483648
|
||||
|
||||
if [[ "$1" == "--fg" ]]; then
|
||||
echo "Running ollama in foreground..."
|
||||
|
||||
82
warmup_vlm.sh
Executable file
82
warmup_vlm.sh
Executable file
@ -0,0 +1,82 @@
|
||||
#!/usr/bin/env bash
# warmup_vlm.sh — pre-load Qwen2.5-VL into iGPU BEFORE the robot stands up.
#
# WHY THIS EXISTS:
# Cold-loading a 3 GB vision model on the Jetson's 16 GB unified memory
# hammers disk + memory bandwidth hard enough to blow Holosoma's 20 ms
# real-time deadline. If that happens while the robot is standing, Holosoma
# loses balance control and the robot falls (observed 2026-04-22 —
# RL FPS crashed from 56 → 10 during cold-load, OOM killer reaped Holosoma).
#
# SAFE OPERATING PROCEDURE (do this once per boot):
# 1. Boot robot. Keep it in squat / damping mode (NOT standing).
# 2. Run: ./warmup_vlm.sh
# 3. Wait for "Warmup complete" (~60-90 s).
# 4. NOW raise the robot to standing.
# 5. Run: python3 run_marcus.py
#
# After warmup, OLLAMA_KEEP_ALIVE=24h keeps the model hot for the rest of the
# day, so every "what do you see" is fast (~2-5 s) and Holosoma stays
# healthy. Never cold-load a vision model while the robot is standing.
#
# Usage: ./warmup_vlm.sh [MODEL]    (MODEL defaults to qwen2.5vl:3b)

set -e

MODEL="${1:-qwen2.5vl:3b}"
HOST="http://127.0.0.1:11434"

cat <<'BANNER'
════════════════════════════════════════════════════════════════════
VLM WARMUP — put the robot in SQUAT / DAMPING mode first!
════════════════════════════════════════════════════════════════════

The next 60-90 s will hammer disk + memory bandwidth. Holosoma
cannot balance through this. If the robot is standing, IT WILL FALL.

Press ENTER when the robot is safely in squat / damping.
Press Ctrl-C to abort.

BANNER
read -r

# Sanity: is ollama reachable?
if ! curl -sf "$HOST/api/version" > /dev/null; then
    echo "✗ Ollama is not running on $HOST"
    echo " Start it: sudo systemctl start ollama"
    exit 1
fi

# Sanity: is the model in the store?
# -F matches the tag literally; without it the "." in e.g. "qwen2.5vl:3b" is a
# regex wildcard and a different tag could satisfy the check.
if ! ollama list | awk 'NR>1 {print $1}' | grep -qxF -- "$MODEL"; then
    echo "✗ Model '$MODEL' not found in Ollama store"
    echo " Pull it: ollama pull $MODEL"
    exit 1
fi

echo "→ Cold-loading $MODEL into iGPU (60-90 s, do not interrupt)..."
echo " Watching tegrastats in a second window is useful — GPU should spike."
echo

START=$(date +%s)

# Send a tiny text-only request. Ollama loads the model on first request;
# we don't need vision here — just getting weights resident is what takes the
# long time. A text-only warmup also avoids needing a camera frame.
#
# The "keep_alive": "24h" request field matches the systemd config and
# prevents the server from unloading the model after the default 5 min.
#
# `|| true`: under `set -e` a failing curl (connection dropped, runner died)
# would otherwise exit the script here, before the failure diagnostics below
# get a chance to print.
RESPONSE=$(curl -s "$HOST/api/generate" \
    -H 'Content-Type: application/json' \
    -d "{\"model\":\"$MODEL\",\"prompt\":\"ok\",\"stream\":false,\"keep_alive\":\"24h\",\"options\":{\"num_predict\":1,\"num_batch\":128,\"num_ctx\":1024}}") || true

END=$(date +%s)
ELAPSED=$((END - START))

# Success is judged by the presence of a "response" field in the reply body.
if echo "$RESPONSE" | grep -q '"response"'; then
    echo "✓ Warmup complete in ${ELAPSED}s — model is resident for 24h"
    echo " You can now stand the robot and run: python3 run_marcus.py"
else
    echo "✗ Warmup failed after ${ELAPSED}s"
    echo " Response: $RESPONSE"
    echo " Check: journalctl -u ollama --since '2 minutes ago' --no-pager | tail -40"
    exit 1
fi
|
||||
Loading…
x
Reference in New Issue
Block a user