diff --git a/API/camera_api.py b/API/camera_api.py
index 15644a8..b0c77d4 100644
--- a/API/camera_api.py
+++ b/API/camera_api.py
@@ -112,3 +112,29 @@ def get_frame():
 def get_frame_age() -> float:
     """Return seconds since last camera frame."""
     return time.time() - _cam_last_frame_time[0] if _cam_last_frame_time[0] > 0 else 999.0
+
+
+def get_fresh_frame(max_age_s: float = 0.3, timeout_s: float = 1.0):
+    """
+    Return a camera frame newer than `max_age_s` seconds. If the buffer
+    already has a fresh frame, returns immediately; otherwise sleeps in
+    short increments (keeping the GIL free for the camera thread) until
+    one arrives or `timeout_s` elapses.
+
+    Reason this exists: during a VLM request + TTS + executor cycle, the
+    main Python thread can monopolize the GIL long enough that the camera
+    thread hasn't written a new frame by the time the next user query
+    arrives. Calling `get_frame()` then returns the SAME bytes as last
+    call → the VLM with temperature=0 returns the SAME answer → it looks
+    like Marcus isn't actually looking at the current scene. Forcing a
+    frame < 300 ms old makes each vision query see real current data.
+    """
+    deadline = time.time() + timeout_s
+    while time.time() < deadline:
+        if get_frame_age() < max_age_s and latest_frame_b64[0] is not None:
+            with camera_lock:
+                return latest_frame_b64[0]
+        time.sleep(0.03)
+    # Timed out — return whatever we have (may be stale, better than None)
+    with camera_lock:
+        return latest_frame_b64[0]
diff --git a/Brain/marcus_brain.py b/Brain/marcus_brain.py
index c88ed4f..1676b01 100644
--- a/Brain/marcus_brain.py
+++ b/Brain/marcus_brain.py
@@ -18,7 +18,7 @@ if PROJECT_DIR not in sys.path:
     sys.path.insert(0, PROJECT_DIR)
 
 from API.zmq_api import init_zmq, send_vel, gradual_stop, send_cmd
-from API.camera_api import start_camera, stop_camera, get_frame
+from API.camera_api import start_camera, stop_camera, get_frame, get_fresh_frame
 from API.yolo_api import (
     init_yolo, yolo_summary, yolo_fps,
     yolo_all_classes, yolo_closest, yolo_sees,
@@ -78,7 +78,15 @@ def init_brain():
     init_zmq()
 
     raw_frame, raw_lock = start_camera()
-    init_yolo(raw_frame, raw_lock)
+    # YOLO is optional on the Jetson: with Qwen2.5-VL loaded, YOLO's ~2 GiB
+    # of iGPU memory pushes Ollama into a 30/70 CPU/GPU split and inference
+    # crawls. Set subsystems.yolo=false in config_Brain.json to skip it
+    # entirely; the VLM can describe the scene directly. Autonomous/patrol
+    # features that rely on YOLO degrade gracefully via the yolo_api stubs.
+    if subsys.get("yolo", True):
+        init_yolo(raw_frame, raw_lock)
+    else:
+        print("  [YOLO] disabled by config (subsystems.yolo=false) — saves ~2 GiB iGPU for VLM")
 
     from API.zmq_api import get_socket
     init_odometry(zmq_sock=get_socket())
@@ -353,7 +361,7 @@ def _handle_search(cmd):
 def _handle_talk(cmd):
     print("Thinking...")
     try:
-        img = get_frame()
+        img = get_fresh_frame()
         facts_str = ""
         try:
             from API.llava_api import _facts
@@ -374,7 +382,10 @@ def _handle_llava(cmd):
     print("Thinking...")
     t0 = time.time()
-    img = get_frame()
+    # get_fresh_frame() blocks up to 1 s waiting for a frame newer than
+    # 300 ms old. Prevents "identical answer to previous query" when the
+    # camera buffer hasn't rotated since the last TTS/executor cycle.
+    img = get_fresh_frame()
     # Poll up to 500 ms in 50 ms slices instead of blocking a full second.
     # Returns the moment a frame is available — most drops recover in <100 ms.
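The freshness contract above is easier to see in isolation. A minimal, runnable sketch (not part of the patch: the fake writer thread stands in for the real camera thread, and the shared names mirror `camera_api.py`):

```python
import threading
import time

# Shared state mirroring camera_api.py: a one-slot frame buffer plus the
# wall-clock time of the last write, both guarded by camera_lock.
camera_lock = threading.Lock()
latest_frame_b64 = [None]
_cam_last_frame_time = [0.0]

def get_frame_age():
    t = _cam_last_frame_time[0]
    return time.time() - t if t > 0 else 999.0

def get_fresh_frame(max_age_s=0.3, timeout_s=1.0):
    # Same loop as the patch: spin in 30 ms sleeps until a fresh frame lands.
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        if get_frame_age() < max_age_s and latest_frame_b64[0] is not None:
            with camera_lock:
                return latest_frame_b64[0]
        time.sleep(0.03)
    with camera_lock:
        return latest_frame_b64[0]   # possibly stale; better than None

def fake_camera_thread():
    # Publishes a "frame" after 0.5 s, simulating a camera thread that was
    # starved of the GIL during a VLM + TTS cycle.
    time.sleep(0.5)
    with camera_lock:
        latest_frame_b64[0] = "aGVsbG8="   # placeholder base64 payload
        _cam_last_frame_time[0] = time.time()

threading.Thread(target=fake_camera_thread, daemon=True).start()
print(get_fresh_frame())   # blocks ~0.5 s, then prints the fresh payload
```

Where `get_frame()` would have returned the previous query's bytes (or `None`) immediately, the call above waits out the gap and answers with current data.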
diff --git a/Config/config_Brain.json b/Config/config_Brain.json
index 57d90f2..7d3f538 100644
--- a/Config/config_Brain.json
+++ b/Config/config_Brain.json
@@ -2,18 +2,19 @@
     "ollama_model": "qwen2.5vl:3b",
     "ollama_host": "http://127.0.0.1:11434",
     "max_history": 6,
-    "num_batch": 64,
+    "num_batch": 32,
     "num_ctx": 1024,
     "subsystems": {
         "vlm": true,
-        "lidar": true,
+        "yolo": true,
+        "lidar": false,
         "voice": true,
         "imgsearch": false,
         "autonomous": true
     },
-    "num_predict_main": 120,
-    "num_predict_goal": 80,
-    "num_predict_patrol": 100,
-    "num_predict_talk": 80,
+    "num_predict_main": 50,
+    "num_predict_goal": 40,
+    "num_predict_patrol": 50,
+    "num_predict_talk": 50,
     "num_predict_verify": 10
 }
diff --git a/Config/config_Vision.json b/Config/config_Vision.json
index dc83d33..af6a035 100644
--- a/Config/config_Vision.json
+++ b/Config/config_Vision.json
@@ -2,9 +2,10 @@
     "yolo_model_path": "Models/yolov8m.pt",
     "yolo_confidence": 0.45,
     "yolo_iou": 0.45,
-    "yolo_device": "cuda",
+    "yolo_device": "cpu",
     "yolo_half": true,
     "yolo_img_size": 320,
+    "yolo_fps_cap": 2.0,
     "tracked_classes": [
         "person", "chair", "couch", "bed", "dining table",
         "bottle", "cup", "laptop", "keyboard", "mouse",
diff --git a/Vision/marcus_yolo.py b/Vision/marcus_yolo.py
index 6d6a9e4..d646454 100644
--- a/Vision/marcus_yolo.py
+++ b/Vision/marcus_yolo.py
@@ -37,39 +37,50 @@ except Exception as _e:
 YOLO_MODEL_PATH = os.path.join(_PROJECT_ROOT, _cfg.get("yolo_model_path", "Models/yolov8m.pt"))
 YOLO_CONFIDENCE = float(_cfg.get("yolo_confidence", 0.45))
 YOLO_IOU = float(_cfg.get("yolo_iou", 0.45))
-YOLO_DEVICE = _cfg.get("yolo_device", "cuda")  # "cuda" | "0" | "cuda:N"
+YOLO_DEVICE = _cfg.get("yolo_device", "cpu")   # "cpu" | "cuda" | "0" | "cuda:N"
 YOLO_IMG_SIZE = int(_cfg.get("yolo_img_size", 320))
-YOLO_HALF = bool(_cfg.get("yolo_half", True))  # FP16 on GPU
+YOLO_HALF = bool(_cfg.get("yolo_half", True))  # FP16 on GPU (ignored on CPU)
+# FPS cap. On CPU, the Orin NX manages ~2-3 FPS of YOLOv8m @ 320 px; we
+# throttle lower so CPU inference doesn't compete with Holosoma for cycles.
+# On CUDA the cap is effectively moot (GPU inference plus the existing
+# 0.02 s sleep already tops out around ~21 FPS).
+YOLO_FPS_CAP = float(_cfg.get("yolo_fps_cap", 2.0))
 
 
 def _resolve_device(requested: str) -> tuple:
     """
-    Resolve the inference device. GPU is required — no CPU fallback.
+    Resolve the YOLO inference device. Both GPU and CPU are supported.
 
-    Returns (device_str, use_half). Raises RuntimeError if CUDA is unavailable
-    or if the config requests CPU. Marcus must run on the Jetson Orin NX GPU.
+    On Jetson Orin NX with Qwen2.5-VL loaded, YOLO on "cuda" takes ~2 GiB of
+    iGPU memory and forces Ollama into a 30/70 CPU/GPU split that crawls
+    vision queries. Matching Marcus_v1's working architecture, the default
+    is now "cpu" — YOLO gets ~2-3 FPS on the Orin CPU, which is plenty for
+    "is there a person" queries, and Qwen keeps the whole iGPU. Set
+    yolo_device="cuda" only if the VLM is disabled (subsystems.vlm=false).
+
+    Returns (device_str, use_half). Never raises for CPU; raises for a
+    CUDA request only when CUDA is genuinely unavailable.
     """
-    req = (requested or "cuda").lower()
+    req = (requested or "cpu").lower()
     if req == "cpu":
-        raise RuntimeError(
-            "[YOLO] yolo_device='cpu' in config — Marcus requires GPU. "
-            "Set yolo_device to 'cuda' in Config/config_Vision.json."
-        )
+        # half-precision only makes sense on GPU; force fp32 on CPU
+        return "cpu", False
 
     try:
         import torch
     except ImportError as e:
         raise RuntimeError(
-            "[YOLO] PyTorch not installed — cannot run on GPU. "
-            "Install CUDA-enabled torch on the Jetson."
+            "[YOLO] PyTorch not installed — cannot run on CUDA. "
+            "Either install CUDA-enabled torch, or set "
+            "yolo_device='cpu' in Config/config_Vision.json."
         ) from e
 
     if not torch.cuda.is_available():
         raise RuntimeError(
-            "[YOLO] CUDA not available — torch.cuda.is_available() == False. "
-            "Check nvidia driver / JetPack CUDA runtime on the Jetson "
-            "(try `nvidia-smi` or `tegrastats`)."
+            "[YOLO] yolo_device='cuda' but torch.cuda.is_available()==False. "
+            "Either fix CUDA (tegrastats, nvcc --version) or set "
+            "yolo_device='cpu' in Config/config_Vision.json."
         )
 
     dev = req if (req.startswith("cuda") or req == "0") else "cuda"
@@ -321,16 +332,27 @@ def _inference_loop(model, is_ppe: bool, raw_frame_ref, frame_lock,
     """Background inference loop. Reads frames, updates _latest_detections."""
     frame_count = 0
     t_fps = time.time()
+    # Minimum wall-clock interval between inferences, in seconds. On CPU this
+    # is the main throttle; on CUDA the model itself limits throughput.
+    min_period = 1.0 / max(YOLO_FPS_CAP, 0.1)
+    last_infer = 0.0
 
     while _yolo_running[0]:
         if _yolo_paused[0]:
             time.sleep(0.03)
             continue
+        # FPS cap — on CPU especially, we don't want YOLO to hammer the cores
+        # that Holosoma's 50 Hz RL policy also needs.
+        dt_since = time.time() - last_infer
+        if dt_since < min_period:
+            time.sleep(min(0.05, min_period - dt_since))
+            continue
         with frame_lock:
             frame = raw_frame_ref[0]
         if frame is None:
             time.sleep(0.05)
             continue
+        last_infer = time.time()
 
         try:
             results = model(
diff --git a/install_ollama_jetson.sh b/install_ollama_jetson.sh
index d76b3cb..7e39a09 100755
--- a/install_ollama_jetson.sh
+++ b/install_ollama_jetson.sh
@@ -61,6 +61,15 @@ Environment="OLLAMA_CONTEXT_LENGTH=1024"
 # nobody else wants it. Nice=10 = lowest normal priority.
IOSchedulingClass=idle
Nice=10
+# Make Ollama the preferred OOM-killer victim. If the 15 GiB Jetson iGPU +
+# system memory fills up (e.g. swap thrashing after a vision request), the
+# kernel picks Ollama first — it auto-restarts, the model cold-loads on the
+# next request, and Marcus and Holosoma keep running. Much better outcome
+# than killing Marcus (robot brain) or Holosoma (balance control).
+# Observed 2026-04-22: with oom_score_adj=0 across the board, the kernel
+# killed Marcus's python3 (22 GB virtual, swapped) instead of Ollama
+# (larger RSS but systemd-managed). This flips that.
+OOMScoreAdjust=500
 EOF
 
 chmod 644 "$DROPIN_FILE"
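The throttle added to `_inference_loop` is a small, reusable pattern: remember the last inference timestamp and sleep in short slices until the minimum period has elapsed, so shutdown/pause flags stay responsive. A standalone sketch (illustrative values; `time.sleep(0.05)` stands in for `model(frame)`):

```python
import time

FPS_CAP = 2.0                          # from config: yolo_fps_cap
min_period = 1.0 / max(FPS_CAP, 0.1)   # 0.5 s between inferences at 2 FPS
last_infer = 0.0
done = 0

t0 = time.time()
while time.time() - t0 < 3.0:          # run the demo for ~3 s
    dt_since = time.time() - last_infer
    if dt_since < min_period:
        # sleep in <=50 ms slices so pause/stop flags stay responsive
        time.sleep(min(0.05, min_period - dt_since))
        continue
    last_infer = time.time()
    time.sleep(0.05)                   # stand-in for model(frame) on CPU
    done += 1

print(f"{done} inferences in 3 s ≈ {done / 3:.1f} FPS (capped at {FPS_CAP})")
```

Stamping `last_infer` before the inference (not after) makes the cap a rate on inference starts, which matches the loop in `marcus_yolo.py`.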
diff --git a/run.sh b/run.sh
new file mode 100755
index 0000000..1337304
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,132 @@
+#!/usr/bin/env bash
+# run.sh — one-command launcher for Marcus on the Jetson.
+#
+# What it does (safely, in order):
+#   1. Ensures systemd Ollama is running (with drop-in flags).
+#   2. Pulls the VLM model if it's missing.
+#   3. Runs the VLM warmup (asks you to put the robot in squat first).
+#   4. Health-gates iGPU placement and system memory.
+#   5. Pauses for you to stand the robot up.
+#   6. Launches Marcus.
+#
+# Fail-fast: any health check failure aborts BEFORE the robot stands.
+#
+# Usage:
+#   ./run.sh                      normal launch
+#   ./run.sh --skip-warmup        skip warmup (only safe if already warm)
+#   ./run.sh --skip-stand-prompt  don't pause for stand-up confirmation
+#
+# First time on a fresh Jetson (one-time, not run here):
+#   sudo ./install_ollama_jetson.sh
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+cd "$SCRIPT_DIR"
+
+SKIP_WARMUP=0
+SKIP_STAND_PROMPT=0
+for arg in "$@"; do
+    case "$arg" in
+        --skip-warmup)       SKIP_WARMUP=1 ;;
+        --skip-stand-prompt) SKIP_STAND_PROMPT=1 ;;
+        -h|--help)
+            # print only the header block: skip the shebang, stop at the
+            # first blank line (not every '#' line in the whole script)
+            sed -n '2,/^$/{s/^# \{0,1\}//p}' "$0"
+            exit 0
+            ;;
+        *)
+            echo "Unknown argument: $arg" >&2
+            echo "Use --help for usage." >&2
+            exit 2
+            ;;
+    esac
+done
+
+# ── tiny color helpers ──────────────────────────────────────────────
+BOLD='\033[1m'; GREEN='\033[32m'; YELLOW='\033[33m'; RED='\033[31m'; DIM='\033[2m'; OFF='\033[0m'
+step() { echo -e "\n${BOLD}▶ $1${OFF}"; }
+ok()   { echo -e "  ${GREEN}✓${OFF} $1"; }
+warn() { echo -e "  ${YELLOW}⚠${OFF} $1"; }
+die()  { echo -e "  ${RED}✗${OFF} $1"; exit 1; }
+
+MODEL=$(python3 -c "import json; print(json.load(open('Config/config_Brain.json'))['ollama_model'])")
+HOST=$(python3 -c "import json; print(json.load(open('Config/config_Brain.json'))['ollama_host'])")
+
+# ── 1. Ensure Ollama service is up ─────────────────────────────────
+step "1/5 Ollama systemd service"
+if ! systemctl is-active --quiet ollama; then
+    warn "service is not active — starting it"
+    sudo systemctl start ollama
+    sleep 2
+fi
+if ! systemctl is-active --quiet ollama; then
+    die "failed to start Ollama; check: systemctl status ollama"
+fi
+ADJUST=$(systemctl show ollama -p OOMScoreAdjust --value)
+if [[ "$ADJUST" != "500" ]]; then
+    warn "OOMScoreAdjust is '$ADJUST' (expected 500)"
+    warn "drop-in may not be installed — run: sudo ./install_ollama_jetson.sh"
+fi
+ok "service active (OOMScoreAdjust=$ADJUST)"
+
+# ── 2. Ensure the model is pulled ──────────────────────────────────
+step "2/5 VLM model present"
+if ! ollama list | awk 'NR>1 {print $1}' | grep -qx "$MODEL"; then
+    warn "$MODEL not in store — pulling (~2.2 GB, one-time)"
+    ollama pull "$MODEL"
+fi
+ok "$MODEL available in Ollama store"
+
+# ── 3. Warmup (robot-in-squat safety banner lives inside the script) ──
+step "3/5 VLM warmup"
+if [[ $SKIP_WARMUP -eq 1 ]]; then
+    warn "skipped by --skip-warmup"
+else
+    ./warmup_vlm.sh "$MODEL"
+fi
+
+# ── 4. Health gates ────────────────────────────────────────────────
+step "4/5 Health check"
+
+# 4a. PROCESSOR placement
+PROCESSOR=$(ollama ps | awk -v m="$MODEL" 'NR>1 && $1==m {
+    for(i=4;i<=NF;i++) if($i=="GPU" || $i=="CPU/GPU" || $i=="CPU") {print $(i-1) " " $i; break}
+}')
+if [[ -z "$PROCESSOR" ]]; then
+    warn "model not loaded — running without prewarm; first vision call will cold-load"
+elif echo "$PROCESSOR" | grep -qE "^100% GPU"; then
+    ok "placement: $PROCESSOR (ideal)"
+elif echo "$PROCESSOR" | grep -qE "CPU/GPU"; then
+    warn "placement: $PROCESSOR — partial CPU offload will slow vision queries"
+    warn "if unacceptable, lower OLLAMA_GPU_OVERHEAD in install_ollama_jetson.sh"
+else
+    warn "placement: $PROCESSOR"
+fi
+
+# 4b. system memory
+read -r MEM_USED MEM_AVAIL SWAP_USED <<< "$(free -m | awk '
+    /^Mem:/  { used=$3; avail=$7 }
+    /^Swap:/ { sused=$3 }
+    END      { print used, avail, sused }
+')"
+echo -e "  ${DIM}mem used: ${MEM_USED} MiB   available: ${MEM_AVAIL} MiB   swap used: ${SWAP_USED} MiB${OFF}"
+if (( SWAP_USED > 500 )); then
+    die "swap in use (${SWAP_USED} MiB) — memory is already tight, would thrash under load"
+fi
+if (( MEM_AVAIL < 1000 )); then
+    die "only ${MEM_AVAIL} MiB available — Holosoma/Marcus/camera won't have room"
+fi
+ok "memory healthy"
+
+# ── 5. Stand the robot, then launch ────────────────────────────────
+step "5/5 Launch Marcus"
+if [[ $SKIP_STAND_PROMPT -eq 0 ]]; then
+    echo -e "  ${YELLOW}Robot can now stand up.${OFF}"
+    echo -e "  ${DIM}Press ENTER when the robot is standing, or Ctrl-C to abort.${OFF}"
+    read -r
+fi
+echo
+
+exec python3 run_marcus.py
diff --git a/run_marcus.py b/run_marcus.py
index c1e9df4..70b5e98 100644
--- a/run_marcus.py
+++ b/run_marcus.py
@@ -19,16 +19,13 @@ PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
 if PROJECT_ROOT not in sys.path:
     sys.path.insert(0, PROJECT_ROOT)
 
-# Tell the Linux OOM killer to pick Ollama (easily restarted) or any other
-# process before Marcus. -900 is almost-but-not-quite OOM-immune; we don't
-# use -1000 because that disables OOM handling entirely, which is risky if
-# Marcus ever had a runaway allocation. Writing oom_score_adj doesn't need
-# root — a process can always lower its own score.
-try:
-    with open(f"/proc/{os.getpid()}/oom_score_adj", "w") as _f:
-        _f.write("-900")
-except OSError:
-    pass  # not fatal — running under a restrictive sandbox
+# NOTE: we intentionally do NOT try to lower Marcus's oom_score_adj here.
+# The Linux kernel requires CAP_SYS_RESOURCE to set a negative oom_score_adj,
+# and Marcus typically runs as an unprivileged user, so any write fails with
+# PermissionError — a silent no-op. Instead, install_ollama_jetson.sh adds
+# OOMScoreAdjust=500 to the Ollama systemd unit. Under memory pressure the
+# kernel then kills Ollama (auto-restarts, model cold-loads again on next
+# vision query) rather than Marcus (robot brain).
 
 from Brain.marcus_brain import run_terminal
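The kernel exposes both the adjustment and the final "badness" score per process, so the resulting victim ordering can be verified on a live system. A diagnostic sketch (not part of the patch):

```python
import os

def oom_badness(pid):
    """Return (oom_score, oom_score_adj) for a PID, as the kernel sees them."""
    with open(f"/proc/{pid}/oom_score") as f:
        score = int(f.read())
    with open(f"/proc/{pid}/oom_score_adj") as f:
        adj = int(f.read())
    return score, adj

# With the drop-in installed, Ollama's adj should read 500 and its score
# should be the highest on the box, i.e. it is the first OOM victim.
# Compare against Ollama's PID (e.g. from `pgrep -x ollama`).
print("this process:", oom_badness(os.getpid()))
```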
diff --git a/start_ollama.sh b/start_ollama.sh
index 39bede4..4de9f50 100755
--- a/start_ollama.sh
+++ b/start_ollama.sh
@@ -1,22 +1,34 @@
 #!/usr/bin/env bash
-# start_ollama.sh — launch Ollama with Jetson-friendly memory settings
+# start_ollama.sh — DEPRECATED, do not use.
 #
-# The Jetson Orin NX has 16 GB unified CPU+GPU memory. When Marcus + YOLO +
-# Whisper + Holosoma + the camera + audio all run alongside Qwen2.5-VL,
-# the compute-graph OOMs the llama runner and Linux kills the biggest
-# process (often Holosoma — which is a safety problem for locomotion).
+# This script launches Ollama as the current (unitree) user and completely
+# BYPASSES the systemd drop-in at /etc/systemd/system/ollama.service.d/,
+# which is where all the safety-critical flags live:
 #
-# These env vars cut Ollama's memory footprint:
-#   OLLAMA_FLASH_ATTENTION=1    ~30 % less memory for attention tensors
-#   OLLAMA_KV_CACHE_TYPE=q8_0   quantize KV cache (halves it)
-#   OLLAMA_KEEP_ALIVE=2m        keep the model warm for 2 min then evict
-#                               (adjust if cold-load lag matters more
-#                               than idle memory)
-#   OLLAMA_MAX_LOADED_MODELS=1  never hold two VL models at once
+#   OOMScoreAdjust=500           (makes Ollama the preferred OOM victim)
+#   IOSchedulingClass=idle       (so disk reads don't starve Holosoma)
+#   Nice=10                      (lowest normal scheduler priority)
+#   OLLAMA_GPU_OVERHEAD=4 GiB    (reserves iGPU for YOLO/Holosoma/camera)
+#   OLLAMA_KEEP_ALIVE=24h        (model stays resident all day)
+#   OLLAMA_CONTEXT_LENGTH=1024   (shrinks compute graph)
+#   OLLAMA_FLASH_ATTENTION=1
+#   OLLAMA_KV_CACHE_TYPE=q8_0
+#   OLLAMA_MAX_LOADED_MODELS=1
 #
-# Usage:
-#   ./start_ollama.sh        # starts server in background, logs to /tmp/ollama.log
-#   ./start_ollama.sh --fg   # runs in foreground (for debugging)
+# Running this script instead of systemd means NONE of those protections
+# are active, and the robot WILL fall the next time a vision query runs.
+#
+# CORRECT WAY:
+#   sudo systemctl start ollama    # uses the drop-in
+#   ./warmup_vlm.sh                # then warm up
+#   python3 run_marcus.py          # then start Marcus
+#
+# ═════════════════════════════════════════════════════════════════════
+#  REFUSING TO RUN. Delete this block only if you know what you're doing.
+# ═════════════════════════════════════════════════════════════════════
+echo "start_ollama.sh is DEPRECATED. Use: sudo systemctl start ollama" >&2
+echo "   (see comments at the top of this file for why)" >&2
+exit 1
 
 pkill -f "ollama (runner|serve)" 2>/dev/null
 sleep 1
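The `ollama ps` parsing that run.sh does with awk also has a programmatic equivalent, useful for checking that the keep-alive really holds the model resident. A sketch against Ollama's `/api/ps` endpoint (field names per the current Ollama REST API; verify against your installed version):

```python
import json
import urllib.request

# Query Ollama's "loaded models" endpoint, the programmatic `ollama ps`.
with urllib.request.urlopen("http://127.0.0.1:11434/api/ps") as resp:
    models = json.load(resp).get("models", [])

for m in models:
    # size_vram == size  -> 100% GPU (the ideal placement)
    # size_vram <  size  -> partial CPU offload; vision queries will crawl
    gpu_frac = m["size_vram"] / m["size"] if m.get("size") else 0.0
    print(f"{m['name']}: {gpu_frac:.0%} GPU, expires {m.get('expires_at')}")
```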
diff --git a/warmup_vlm.sh b/warmup_vlm.sh
index 5a67597..dbd5b48 100755
--- a/warmup_vlm.sh
+++ b/warmup_vlm.sh
@@ -24,6 +24,16 @@ set -e
 MODEL="${1:-qwen2.5vl:3b}"
 HOST="http://127.0.0.1:11434"
 
+# Pull num_batch / num_ctx from config_Brain.json. CRITICAL: these must match
+# what Marcus will send at runtime, otherwise Ollama evicts the cached runner
+# on the FIRST real vision command and re-instantiates it — the overlap of
+# old-runner-unloading + new-runner-loading triggers an iGPU OOM and the
+# runner dies with status 500. Symptom: warmup succeeds, first `what do you
+# see` crashes 10-30 s later with "llama runner process has terminated".
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+NUM_BATCH=$(python3 -c "import json; print(json.load(open('$SCRIPT_DIR/Config/config_Brain.json'))['num_batch'])" 2>/dev/null || echo 32)
+NUM_CTX=$(python3 -c "import json; print(json.load(open('$SCRIPT_DIR/Config/config_Brain.json'))['num_ctx'])" 2>/dev/null || echo 1024)
+
 # Sanity: is ollama reachable?
 if ! curl -sf "$HOST/api/version" > /dev/null; then
     echo "✗ Ollama is not running on $HOST"
@@ -64,6 +74,7 @@ else
 BANNER
     read -r
     echo "→ Cold-loading $MODEL into iGPU (60-90 s, do not interrupt)..."
+    echo "   num_batch=$NUM_BATCH  num_ctx=$NUM_CTX  (matching Marcus's runtime config)"
     echo "   Watching tegrastats in a second window is useful — GPU should spike."
     echo
 fi
@@ -78,7 +89,7 @@ START=$(date +%s)
 # unloading the model after the default 5 min.
 RESPONSE=$(curl -s "$HOST/api/generate" \
   -H 'Content-Type: application/json' \
-  -d "{\"model\":\"$MODEL\",\"prompt\":\"ok\",\"stream\":false,\"keep_alive\":\"24h\",\"options\":{\"num_predict\":1,\"num_batch\":128,\"num_ctx\":1024}}")
+  -d "{\"model\":\"$MODEL\",\"prompt\":\"ok\",\"stream\":false,\"keep_alive\":\"24h\",\"options\":{\"num_predict\":1,\"num_batch\":$NUM_BATCH,\"num_ctx\":$NUM_CTX}}")
 END=$(date +%s)
 ELAPSED=$((END - START))
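The same match-the-warmup rule applies on the Python side: every generate call Marcus issues should pull `num_batch`/`num_ctx` from the one config the warmup script reads. A sketch (assumes the `requests` package; `warm_options` is a hypothetical helper, not an existing function in this repo):

```python
import json
import requests

cfg = json.load(open("Config/config_Brain.json"))

def warm_options():
    # Route all Ollama options through one place so warmup and runtime
    # can never drift apart (drift forces a runner rebuild mid-mission).
    return {"num_batch": cfg["num_batch"], "num_ctx": cfg["num_ctx"]}

resp = requests.post(
    f"{cfg['ollama_host']}/api/generate",
    json={
        "model": cfg["ollama_model"],
        "prompt": "ok",
        "stream": False,
        "keep_alive": "24h",
        "options": {"num_predict": 1, **warm_options()},
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json().get("response", ""))
```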