Update 2026-04-22 13:28:38

kassam 2026-04-22 13:28:39 +04:00
parent 9991e742da
commit dc06864ec3
12 changed files with 305 additions and 11 deletions

View File

@ -93,6 +93,23 @@ def call_llava(prompt: str, img_b64, num_predict: int = 200, use_history: bool =
    if img_b64:
        msg["images"] = [img_b64]
    messages.append(msg)
    # When an image is attached, pause YOLO to free iGPU memory for the
    # vision-encoder activations (~1.5 GiB). Without this, concurrent YOLO
    # inference + Qwen vision-encode exceeds the 15 GiB Jetson iGPU budget
    # and the llama runner is reaped by the OOM killer (status code: 500).
    # Text-only calls skip the pause; they fit easily and YOLO stays hot.
    _paused = False
    if img_b64:
        try:
            from API.yolo_api import yolo_pause, yolo_resume, YOLO_AVAILABLE
            if YOLO_AVAILABLE:
                yolo_pause()
                _paused = True
        except Exception:
            pass
    try:
        r = _client.chat(model=OLLAMA_MODEL, messages=messages,
                         options={
                             "temperature": 0.0,
@ -101,6 +118,13 @@ def call_llava(prompt: str, img_b64, num_predict: int = 200, use_history: bool =
"num_ctx": NUM_CTX, "num_ctx": NUM_CTX,
}) })
return r["message"]["content"].strip() return r["message"]["content"].strip()
    finally:
        if _paused:
            try:
                from API.yolo_api import yolo_resume
                yolo_resume()
            except Exception:
                pass
def parse_json(raw: str):
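The pause/resume guard could also be packaged as a context manager so future image paths can't skip the finally. A minimal sketch against the API.yolo_api names from this commit; the yolo_guard helper itself is hypothetical, not part of the diff:

    from contextlib import contextmanager

    @contextmanager
    def yolo_guard(img_b64):
        """Pause YOLO while a vision call runs; no-op for text-only calls."""
        paused = False
        if img_b64:
            try:
                from API.yolo_api import yolo_pause, YOLO_AVAILABLE
                if YOLO_AVAILABLE:
                    yolo_pause()
                    paused = True
            except Exception:
                pass  # stubs not bound yet: run without pausing
        try:
            yield
        finally:
            if paused:
                try:
                    from API.yolo_api import yolo_resume
                    yolo_resume()
                except Exception:
                    pass

    # usage inside call_llava:
    #     with yolo_guard(img_b64):
    #         r = _client.chat(...)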
@ -117,16 +141,28 @@ def parse_json(raw: str):
def ask(command: str, img_b64) -> dict:
-   """Send command + camera frame to the VLM with conversation history."""
    """
    Send command + camera frame to the VLM.

    NOTE: this path does NOT use conversation history, even though other ask_*
    paths do. With temperature=0 (required for reliable JSON action output),
    including the last answer in the prompt makes the model lock onto
    repeating it: ask "what do you see" twice and it always replies with
    whatever it saw the first time, regardless of the current frame. Vision
    grounding has to be stateless per call. Chitchat (ask_talk) keeps history
    because there the whole point is continuity.
    """
    if not VLM_ENABLED:
        return dict(_VLM_OFF_EMPTY)
    try:
        facts_str = ("\nKnown facts: " + "; ".join(_facts) + ".") if _facts else ""
        raw = call_llava(MAIN_PROMPT.format(command=command, facts=facts_str), img_b64,
-                       num_predict=_cfg["num_predict_main"], use_history=True)
+                       num_predict=_cfg["num_predict_main"], use_history=False)
        print(f" Raw: {raw}")
        d = parse_json(raw)
        speak = d.get("speak", raw) if d else raw
        # Still write to history so ask_talk() has context; just don't
        # READ from it in this path (would cause lock-on repetition).
        add_to_history(command, speak)
        if d is None:
            return {"actions": [], "arm": None, "speak": raw, "abort": None}

View File

@ -17,6 +17,8 @@ def _stub_ppe(): return []
def _stub_too_close(**k): return False
def _stub_all(): return set()
def _stub_fps(): return 0.0
def _stub_pause(): return None
def _stub_resume(): return None
yolo_sees = _stub_sees
yolo_count = _stub_count
@ -26,6 +28,8 @@ yolo_ppe_violations = _stub_ppe
yolo_person_too_close = _stub_too_close
yolo_all_classes = _stub_all
yolo_fps = _stub_fps
yolo_pause = _stub_pause
yolo_resume = _stub_resume
def init_yolo(raw_frame_ref, frame_lock) -> bool:
@ -33,6 +37,7 @@ def init_yolo(raw_frame_ref, frame_lock) -> bool:
    global YOLO_AVAILABLE
    global yolo_sees, yolo_count, yolo_closest, yolo_summary
    global yolo_ppe_violations, yolo_person_too_close, yolo_all_classes, yolo_fps
    global yolo_pause, yolo_resume
    # marcus_yolo.py lives in Vision/
    models_dir = os.path.join(PROJECT_ROOT, "Vision")
@ -44,6 +49,7 @@ def init_yolo(raw_frame_ref, frame_lock) -> bool:
            start_yolo, yolo_sees as _ys, yolo_count as _yc, yolo_closest as _ycl,
            yolo_summary as _ysu, yolo_ppe_violations as _ypp,
            yolo_person_too_close as _yptc, yolo_all_classes as _yac, yolo_fps as _yfps,
            yolo_pause as _ypause, yolo_resume as _yresume,
        )
    except ImportError as e:
        print(f"marcus_yolo.py not found ({e})")
@ -81,5 +87,7 @@ def init_yolo(raw_frame_ref, frame_lock) -> bool:
    yolo_person_too_close = _yptc
    yolo_all_classes = _yac
    yolo_fps = _yfps
    yolo_pause = _ypause
    yolo_resume = _yresume
print(f"YOLO {'started' if ok else 'failed to start'}") print(f"YOLO {'started' if ok else 'failed to start'}")
return ok return ok
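The stub-then-rebind pattern this file uses, reduced to a single function (generic sketch; the real init_yolo rebinds all ten yolo_* names the same way):

    def _stub_fps():
        return 0.0

    yolo_fps = _stub_fps                 # safe default: callers never crash

    def init_sketch():
        global yolo_fps
        try:
            # hypothetical import path, for illustration only
            from marcus_yolo import yolo_fps as _yfps
        except ImportError:
            return False                 # stub stays bound; caller sees 0.0
        yolo_fps = _yfps                 # rebind module global to real impl
        return True

One Python subtlety the lazy imports in the VLM path already respect: a caller that binds the name early via `from API.yolo_api import yolo_fps` keeps the stub forever; importing at call time (or calling through the module) always sees the rebound function.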

View File

@ -3,7 +3,7 @@
"ollama_host": "http://127.0.0.1:11434", "ollama_host": "http://127.0.0.1:11434",
"max_history": 6, "max_history": 6,
"num_batch": 128, "num_batch": 128,
"num_ctx": 2048, "num_ctx": 1024,
"subsystems": { "subsystems": {
"vlm": true, "vlm": true,
"lidar": true, "lidar": true,

View File

@ -0,0 +1 @@
[]

View File

@ -0,0 +1,32 @@
[
{
"time": "11:18:06",
"cmd": "what do you see",
"response": "Error: llama runner process has terminated: %!w(<nil>) (status code: 500)",
"duration_s": 0.0
},
{
"time": "11:18:19",
"cmd": "hi",
"response": "Hello! I am Sanad. How can I help you?",
"duration_s": 0.0
},
{
"time": "11:18:49",
"cmd": "what do you see",
"response": "Error: llama runner process has terminated: %!w(<nil>) (status code: 500)",
"duration_s": 0.0
},
{
"time": "11:19:20",
"cmd": "turn left 1 step",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:20:40",
"cmd": "help/",
"response": "local command",
"duration_s": 0.0
}
]

View File

@ -0,0 +1 @@
[]

View File

@ -0,0 +1 @@
{}

View File

@ -92,6 +92,12 @@ PPE_VIOLATION_CLASSES = {"no-helmet", "no_helmet", "no-vest", "no_vest"}
_detections_lock = threading.Lock()
_latest_detections = []  # list of dicts
_yolo_running = [False]
# When True, the inference loop skips model forward passes. Used by the VLM
# path: Qwen2.5-VL's vision encode needs ~1.5 GiB of iGPU activations and the
# Jetson's 15 GiB is shared with YOLO + Holosoma, so concurrent inference
# spikes the runner into OOM. Pausing YOLO for the ~1 s the VLM spends on an
# image prevents that peak. Model weights stay resident (fast resume).
_yolo_paused = [False]
_yolo_fps = [0.0]
@ -279,6 +285,30 @@ def yolo_is_running() -> bool:
    return _yolo_running[0]
def yolo_pause() -> None:
    """
    Stop YOLO forward passes and release PyTorch's CUDA cache back to the
    driver so Ollama's vision encoder has contiguous iGPU memory to allocate
    into. Weights stay resident, so resume is instant.
    """
    _yolo_paused[0] = True
    try:
        import torch
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    except Exception:
        pass

def yolo_resume() -> None:
    """Resume YOLO inference after a pause()."""
    _yolo_paused[0] = False

def yolo_is_paused() -> bool:
    return _yolo_paused[0]
def yolo_fps() -> float:
    """Return current YOLO inference FPS."""
    return _yolo_fps[0]
@ -293,6 +323,9 @@ def _inference_loop(model, is_ppe: bool, raw_frame_ref, frame_lock,
    t_fps = time.time()
    while _yolo_running[0]:
        if _yolo_paused[0]:
            time.sleep(0.03)
            continue
        with frame_lock:
            frame = raw_frame_ref[0]
        if frame is None:
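What empty_cache() actually hands back can be watched with PyTorch's standard memory counters. A sketch (CUDA required; numbers depend on the loaded model):

    import torch

    def report(tag):
        alloc = torch.cuda.memory_allocated() / 2**20    # live tensors (weights)
        reserved = torch.cuda.memory_reserved() / 2**20  # allocator pool incl. cache
        print(f"{tag}: allocated={alloc:.0f} MiB  reserved={reserved:.0f} MiB")

    if torch.cuda.is_available():
        report("before pause")
        torch.cuda.empty_cache()   # frees cached blocks only; weights stay put
        report("after pause")      # reserved drops toward allocated

The reserved-minus-allocated gap is what a pause returns to the driver for the vision encoder; allocated (the weights) stays put, which is why resume is instant.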

check_ollama.sh Executable file
View File

@ -0,0 +1,22 @@
#!/bin/bash
echo "=== ollama arch + version ==="
file $(which ollama); ollama --version; uname -m
echo
echo "=== does nvidia-smi work on Jetson? ==="
nvidia-smi 2>&1 | head -5 || echo "NO nvidia-smi (expected on JetPack 5 — Tegra uses tegrastats)"
echo
echo "=== tegrastats (Jetson GPU util) — 2 s sample ==="
timeout 2 tegrastats 2>&1 | head -2
echo
echo "=== Ollama 'inference compute' line — THE answer ==="
journalctl -u ollama -n 200 --no-pager 2>/dev/null | grep -E "inference compute|vram|library=|starting runner|GPU" | tail -15
echo
echo "=== Ollama service env ==="
systemctl cat ollama 2>/dev/null | grep -E "Environment|ExecStart"
echo
echo "=== JetPack / CUDA on this box ==="
cat /etc/nv_tegra_release 2>/dev/null | head -1
ls /usr/local/cuda/lib64/libcudart.so* 2>/dev/null | head -3
echo
echo "=== does Ollama's own lib dir exist? (stock install) ==="
ls /usr/lib/ollama/ /usr/local/lib/ollama/ 2>/dev/null

install_ollama_jetson.sh Executable file
View File

@ -0,0 +1,71 @@
#!/usr/bin/env bash
# install_ollama_jetson.sh — add Jetson memory flags to the Ollama systemd unit
#
# WHY THIS SCRIPT EXISTS:
# `start_ollama.sh` only applies its OLLAMA_* env vars if you launch Ollama
# manually — but on the Jetson, Ollama is started by systemd at boot. Systemd
# ignores the shell script entirely, so none of the flags were ever reaching
# the live server. This installs a drop-in that systemd merges into the unit,
# so `systemctl restart ollama` picks up the flags. Confirmed by the log line
# `inference compute ... library=CUDA ... available="13.8 GiB"` — GPU works,
# but without OLLAMA_GPU_OVERHEAD Ollama claims all 13.8 GiB and the
# vision-encode pass OOMs the runner (seen as "llama runner process has
# terminated ... status code: 500" when an image is attached).
#
# Run once, from the Jetson:
# sudo ./install_ollama_jetson.sh
#
set -euo pipefail
if [[ $EUID -ne 0 ]]; then
echo "This script must be run as root (it writes to /etc/systemd/system)." >&2
echo "Re-run with: sudo $0" >&2
exit 1
fi
DROPIN_DIR="/etc/systemd/system/ollama.service.d"
DROPIN_FILE="$DROPIN_DIR/marcus-jetson.conf"
mkdir -p "$DROPIN_DIR"
cat > "$DROPIN_FILE" <<'EOF'
# Jetson memory caps for Marcus — do not edit live, re-run install_ollama_jetson.sh
[Service]
# Flash attention: ~30% less memory for attention tensors.
Environment="OLLAMA_FLASH_ATTENTION=1"
# Quantize KV cache to 8-bit (halves KV memory; negligible quality loss).
Environment="OLLAMA_KV_CACHE_TYPE=q8_0"
# Never hold two VL models simultaneously.
Environment="OLLAMA_MAX_LOADED_MODELS=1"
# CRITICAL: keep the model resident essentially forever. The previous 2m value
# meant that any pause in conversation longer than 2 min unloaded the model,
# and the NEXT "what do you see" paid another 60-90 s cold-load. That
# cold-load hammered unified memory + disk bandwidth hard enough to break
# Holosoma's 20 ms real-time deadline — inference times climbed from 2 ms to
# 153 ms and the robot fell. 24h means one cold-load per day (during
# warmup_vlm.sh while the robot is in squat); everything after that is warm.
Environment="OLLAMA_KEEP_ALIVE=24h"
# Reserve 3 GiB of the 15 GiB iGPU for the rest of the system (YOLO +
# Holosoma + camera + Python heap). Raised from 2 GiB after observing
# Holosoma starvation during image requests — 2 GiB was enough for memory
# but not for memory-bandwidth headroom.
Environment="OLLAMA_GPU_OVERHEAD=3221225472"
# Deprioritize Ollama so it never preempts Holosoma's real-time locomotion
# thread. IOSchedulingClass=idle = Ollama only gets disk bandwidth when
# nobody else wants it. Nice=10 = reduced CPU priority (the nice range is
# -20..19; higher means lower priority).
IOSchedulingClass=idle
Nice=10
EOF
chmod 644 "$DROPIN_FILE"
echo "Wrote $DROPIN_FILE"
echo
systemctl daemon-reload
systemctl restart ollama
sleep 2
echo "=== verification: these vars should now be in the live process ==="
journalctl -u ollama -n 40 --no-pager | grep -E "OLLAMA_FLASH_ATTENTION|OLLAMA_KV_CACHE_TYPE|OLLAMA_GPU_OVERHEAD|inference compute|vram" | tail -10
echo
echo "If the 'inference compute' line shows library=CUDA and your flags appear"
echo "in the 'server config' dump above, you're done. Try 'what do you see' again."

View File

@ -25,6 +25,13 @@ export OLLAMA_FLASH_ATTENTION=1
export OLLAMA_KV_CACHE_TYPE=q8_0
export OLLAMA_KEEP_ALIVE=2m
export OLLAMA_MAX_LOADED_MODELS=1
# Reserve 2 GiB of the Jetson's 15 GiB iGPU for the rest of the system
# (YOLO in CUDA FP16, Holosoma, camera, Python heap). Without this, Ollama
# assumes the full 13.8 GiB "available" is its to use and sizes its compute
# graph that way — which works for text, but the vision-encode pass of
# Qwen2.5-VL then pushes total allocation past physical memory and the
# runner dies with status 500.
export OLLAMA_GPU_OVERHEAD=2147483648
if [[ "$1" == "--fg" ]]; then if [[ "$1" == "--fg" ]]; then
echo "Running ollama in foreground..." echo "Running ollama in foreground..."

warmup_vlm.sh Executable file
View File

@ -0,0 +1,82 @@
#!/usr/bin/env bash
# warmup_vlm.sh — pre-load Qwen2.5-VL into iGPU BEFORE the robot stands up.
#
# WHY THIS EXISTS:
# Cold-loading a 3 GB vision model on the Jetson's 16 GB unified memory
# hammers disk + memory bandwidth hard enough to blow Holosoma's 20 ms
# real-time deadline. If that happens while the robot is standing, Holosoma
# loses balance control and the robot falls (observed 2026-04-22 —
# RL FPS crashed from 56 → 10 during cold-load, OOM killer reaped Holosoma).
#
# SAFE OPERATING PROCEDURE (do this once per boot):
# 1. Boot robot. Keep it in squat / damping mode (NOT standing).
# 2. Run: ./warmup_vlm.sh
# 3. Wait for "Warmup complete" (~60-90 s).
# 4. NOW raise the robot to standing.
# 5. Run: python3 run_marcus.py
#
# After warmup, OLLAMA_KEEP_ALIVE=24h keeps the model hot for the rest of the
# day, so every "what do you see" is fast (~2-5 s) and Holosoma stays
# healthy. Never cold-load a vision model while the robot is standing.
set -e
MODEL="${1:-qwen2.5vl:3b}"
HOST="http://127.0.0.1:11434"
cat <<'BANNER'
════════════════════════════════════════════════════════════════════
VLM WARMUP — put the robot in SQUAT / DAMPING mode first!
════════════════════════════════════════════════════════════════════
The next 60-90 s will hammer disk + memory bandwidth. Holosoma
cannot balance through this. If the robot is standing, IT WILL FALL.
Press ENTER when the robot is safely in squat / damping.
Press Ctrl-C to abort.
BANNER
read -r
# Sanity: is ollama reachable?
if ! curl -sf "$HOST/api/version" > /dev/null; then
echo "✗ Ollama is not running on $HOST"
echo " Start it: sudo systemctl start ollama"
exit 1
fi
# Sanity: is the model in the store?
if ! ollama list | awk 'NR>1 {print $1}' | grep -qx "$MODEL"; then
echo "✗ Model '$MODEL' not found in Ollama store"
echo " Pull it: ollama pull $MODEL"
exit 1
fi
echo "→ Cold-loading $MODEL into iGPU (60-90 s, do not interrupt)..."
echo " Watching tegrastats in a second window is useful — GPU should spike."
echo
START=$(date +%s)
# Send a tiny text-only request. Ollama loads the model on first request;
# we don't need vision here; just getting the weights resident is the slow
# part. A text-only warmup also avoids needing a camera frame.
#
# The "keep_alive": "24h" field matches the systemd config and prevents the
# server from unloading the model after the default 5 min.
RESPONSE=$(curl -s "$HOST/api/generate" \
-H 'Content-Type: application/json' \
-d "{\"model\":\"$MODEL\",\"prompt\":\"ok\",\"stream\":false,\"keep_alive\":\"24h\",\"options\":{\"num_predict\":1,\"num_batch\":128,\"num_ctx\":1024}}")
END=$(date +%s)
ELAPSED=$((END - START))
if echo "$RESPONSE" | grep -q '"response"'; then
echo "✓ Warmup complete in ${ELAPSED}s — model is resident for 24h"
echo " You can now stand the robot and run: python3 run_marcus.py"
else
echo "✗ Warmup failed after ${ELAPSED}s"
echo " Response: $RESPONSE"
echo " Check: journalctl -u ollama --since '2 minutes ago' --no-pager | tail -40"
exit 1
fi
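The same warmup request from Python, stdlib only, for boot sequences that can't shell out; it mirrors the curl payload above:

    import json, time, urllib.request

    HOST, MODEL = "http://127.0.0.1:11434", "qwen2.5vl:3b"
    payload = {
        "model": MODEL, "prompt": "ok", "stream": False,
        "keep_alive": "24h",   # match the systemd drop-in
        "options": {"num_predict": 1, "num_batch": 128, "num_ctx": 1024},
    }
    req = urllib.request.Request(f"{HOST}/api/generate",
                                 data=json.dumps(payload).encode(),
                                 headers={"Content-Type": "application/json"})
    t0 = time.time()
    with urllib.request.urlopen(req, timeout=300) as r:  # cold-load can take 90 s
        ok = "response" in json.load(r)
    print(f"{'warm' if ok else 'FAILED'} after {time.time() - t0:.0f}s")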