Update 2026-04-22 14:44:13
This commit is contained in:
parent
f4ff5c27fa
commit
e9fb18eea1
@ -112,3 +112,29 @@ def get_frame():
|
|||||||
def get_frame_age() -> float:
    """Return the number of seconds elapsed since the last camera frame.

    Returns the sentinel 999.0 when the stored frame timestamp is not
    positive (i.e. no frame time has been recorded).
    """
    last_ts = _cam_last_frame_time[0]
    if last_ts > 0:
        return time.time() - last_ts
    return 999.0
|
||||||
|
|
||||||
|
|
||||||
|
def get_fresh_frame(max_age_s: float = 0.3, timeout_s: float = 1.0):
    """
    Return a camera frame newer than `max_age_s` seconds.

    If the buffer already holds a fresh frame, returns immediately;
    otherwise sleeps in short increments (sleeping releases the GIL so the
    camera thread can run) until a fresh frame arrives or `timeout_s`
    elapses.

    Reason this exists: during a VLM request + TTS + executor cycle, the
    main Python thread can monopolize the GIL long enough that the camera
    thread hasn't written a new frame by the time the next user query
    arrives. Calling `get_frame()` then returns the SAME bytes as last
    call → the VLM with temperature=0 returns the SAME answer → it looks
    like Marcus isn't actually looking at the current scene. Forcing a
    frame < 300 ms old makes each vision query see real current data.

    Args:
        max_age_s: Maximum acceptable frame age, in seconds.
        timeout_s: Maximum time to wait for a fresh frame, in seconds.

    Returns:
        The latest buffered frame. After a timeout this may be a stale
        frame, and it is None if the buffer has never been filled.
    """
    # Monotonic clock for the deadline: a wall-clock step (NTP, manual set)
    # must not shorten or stretch the wait.
    deadline = time.monotonic() + timeout_s
    while True:
        if get_frame_age() < max_age_s:
            # Read the buffer exactly once, under the lock, and test that
            # same value — avoids checking one read and returning another.
            with camera_lock:
                frame = latest_frame_b64[0]
            if frame is not None:
                return frame
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        time.sleep(min(0.03, remaining))
    # Timed out — return whatever we have (may be stale, or None if no
    # frame was ever captured).
    with camera_lock:
        return latest_frame_b64[0]
|
||||||
|
|||||||
@ -18,7 +18,7 @@ if PROJECT_DIR not in sys.path:
|
|||||||
sys.path.insert(0, PROJECT_DIR)
|
sys.path.insert(0, PROJECT_DIR)
|
||||||
|
|
||||||
from API.zmq_api import init_zmq, send_vel, gradual_stop, send_cmd
|
from API.zmq_api import init_zmq, send_vel, gradual_stop, send_cmd
|
||||||
from API.camera_api import start_camera, stop_camera, get_frame
|
from API.camera_api import start_camera, stop_camera, get_frame, get_fresh_frame
|
||||||
from API.yolo_api import (
|
from API.yolo_api import (
|
||||||
init_yolo, yolo_summary, yolo_fps,
|
init_yolo, yolo_summary, yolo_fps,
|
||||||
yolo_all_classes, yolo_closest, yolo_sees,
|
yolo_all_classes, yolo_closest, yolo_sees,
|
||||||
@ -78,7 +78,15 @@ def init_brain():
|
|||||||
init_zmq()
|
init_zmq()
|
||||||
|
|
||||||
raw_frame, raw_lock = start_camera()
|
raw_frame, raw_lock = start_camera()
|
||||||
init_yolo(raw_frame, raw_lock)
|
# YOLO is optional on the Jetson: with Qwen2.5-VL loaded, YOLO's ~2 GiB
|
||||||
|
# of iGPU pushes Ollama into 30/70 CPU/GPU split and inference crawls.
|
||||||
|
# Set subsystems.yolo=false in config_Brain.json to skip it entirely;
|
||||||
|
# the VLM can describe the scene directly. Autonomous/patrol features
|
||||||
|
# that rely on YOLO degrade gracefully via the yolo_api stubs.
|
||||||
|
if subsys.get("yolo", True):
|
||||||
|
init_yolo(raw_frame, raw_lock)
|
||||||
|
else:
|
||||||
|
print(" [YOLO] disabled by config (subsystems.yolo=false) — saves ~2 GiB iGPU for VLM")
|
||||||
|
|
||||||
from API.zmq_api import get_socket
|
from API.zmq_api import get_socket
|
||||||
init_odometry(zmq_sock=get_socket())
|
init_odometry(zmq_sock=get_socket())
|
||||||
@ -353,7 +361,7 @@ def _handle_search(cmd):
|
|||||||
def _handle_talk(cmd):
|
def _handle_talk(cmd):
|
||||||
print("Thinking...")
|
print("Thinking...")
|
||||||
try:
|
try:
|
||||||
img = get_frame()
|
img = get_fresh_frame()
|
||||||
facts_str = ""
|
facts_str = ""
|
||||||
try:
|
try:
|
||||||
from API.llava_api import _facts
|
from API.llava_api import _facts
|
||||||
@ -374,7 +382,10 @@ def _handle_talk(cmd):
|
|||||||
def _handle_llava(cmd):
|
def _handle_llava(cmd):
|
||||||
print("Thinking...")
|
print("Thinking...")
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
img = get_frame()
|
# get_fresh_frame() blocks up to 1 s waiting for a frame newer than
|
||||||
|
# 300 ms old. Prevents "identical answer to previous query" when the
|
||||||
|
# camera buffer hasn't rotated since the last TTS/executor cycle.
|
||||||
|
img = get_fresh_frame()
|
||||||
|
|
||||||
# Poll up to 500 ms in 50 ms slices instead of blocking a full second.
|
# Poll up to 500 ms in 50 ms slices instead of blocking a full second.
|
||||||
# Returns the moment a frame is available — most drops recover in <100 ms.
|
# Returns the moment a frame is available — most drops recover in <100 ms.
|
||||||
|
|||||||
@ -2,18 +2,19 @@
|
|||||||
"ollama_model": "qwen2.5vl:3b",
|
"ollama_model": "qwen2.5vl:3b",
|
||||||
"ollama_host": "http://127.0.0.1:11434",
|
"ollama_host": "http://127.0.0.1:11434",
|
||||||
"max_history": 6,
|
"max_history": 6,
|
||||||
"num_batch": 64,
|
"num_batch": 32,
|
||||||
"num_ctx": 1024,
|
"num_ctx": 1024,
|
||||||
"subsystems": {
|
"subsystems": {
|
||||||
"vlm": true,
|
"vlm": true,
|
||||||
"lidar": true,
|
"yolo": true,
|
||||||
|
"lidar": false,
|
||||||
"voice": true,
|
"voice": true,
|
||||||
"imgsearch": false,
|
"imgsearch": false,
|
||||||
"autonomous": true
|
"autonomous": true
|
||||||
},
|
},
|
||||||
"num_predict_main": 120,
|
"num_predict_main": 50,
|
||||||
"num_predict_goal": 80,
|
"num_predict_goal": 40,
|
||||||
"num_predict_patrol": 100,
|
"num_predict_patrol": 50,
|
||||||
"num_predict_talk": 80,
|
"num_predict_talk": 50,
|
||||||
"num_predict_verify": 10
|
"num_predict_verify": 10
|
||||||
}
|
}
|
||||||
|
|||||||
@ -2,9 +2,10 @@
|
|||||||
"yolo_model_path": "Models/yolov8m.pt",
|
"yolo_model_path": "Models/yolov8m.pt",
|
||||||
"yolo_confidence": 0.45,
|
"yolo_confidence": 0.45,
|
||||||
"yolo_iou": 0.45,
|
"yolo_iou": 0.45,
|
||||||
"yolo_device": "cuda",
|
"yolo_device": "cpu",
|
||||||
"yolo_half": true,
|
"yolo_half": true,
|
||||||
"yolo_img_size": 320,
|
"yolo_img_size": 320,
|
||||||
|
"yolo_fps_cap": 2.0,
|
||||||
"tracked_classes": [
|
"tracked_classes": [
|
||||||
"person", "chair", "couch", "bed", "dining table",
|
"person", "chair", "couch", "bed", "dining table",
|
||||||
"bottle", "cup", "laptop", "keyboard", "mouse",
|
"bottle", "cup", "laptop", "keyboard", "mouse",
|
||||||
|
|||||||
@ -37,39 +37,50 @@ except Exception as _e:
|
|||||||
YOLO_MODEL_PATH = os.path.join(_PROJECT_ROOT, _cfg.get("yolo_model_path", "Models/yolov8m.pt"))
|
YOLO_MODEL_PATH = os.path.join(_PROJECT_ROOT, _cfg.get("yolo_model_path", "Models/yolov8m.pt"))
|
||||||
YOLO_CONFIDENCE = float(_cfg.get("yolo_confidence", 0.45))
|
YOLO_CONFIDENCE = float(_cfg.get("yolo_confidence", 0.45))
|
||||||
YOLO_IOU = float(_cfg.get("yolo_iou", 0.45))
|
YOLO_IOU = float(_cfg.get("yolo_iou", 0.45))
|
||||||
YOLO_DEVICE = _cfg.get("yolo_device", "cuda") # "cuda" | "0" | "cuda:N"
|
YOLO_DEVICE = _cfg.get("yolo_device", "cpu") # "cpu" | "cuda" | "0" | "cuda:N"
|
||||||
YOLO_IMG_SIZE = int(_cfg.get("yolo_img_size", 320))
|
YOLO_IMG_SIZE = int(_cfg.get("yolo_img_size", 320))
|
||||||
YOLO_HALF = bool(_cfg.get("yolo_half", True)) # FP16 on GPU
|
YOLO_HALF = bool(_cfg.get("yolo_half", True)) # FP16 on GPU (ignored on CPU)
|
||||||
|
# FPS cap. On CPU, Orin NX manages ~2-3 FPS of YOLOv8m @ 320px. We throttle
|
||||||
|
# lower so CPU inference doesn't compete with Holosoma for cycles. NOTE: the
|
||||||
|
# inference loop applies this cap on every device, so on CUDA it also limits
|
||||||
|
# throughput — raise yolo_fps_cap there if you want more inferences per second.
|
||||||
|
YOLO_FPS_CAP = float(_cfg.get("yolo_fps_cap", 2.0))
|
||||||
|
|
||||||
|
|
||||||
def _resolve_device(requested: str) -> tuple:
|
def _resolve_device(requested: str) -> tuple:
|
||||||
"""
|
"""
|
||||||
Resolve the inference device. GPU is required — no CPU fallback.
|
Resolve the YOLO inference device. Both GPU and CPU are supported.
|
||||||
|
|
||||||
Returns (device_str, use_half). Raises RuntimeError if CUDA is unavailable
|
On Jetson Orin NX with Qwen2.5-VL loaded, YOLO on "cuda" takes ~2 GiB of
|
||||||
or if the config requests CPU. Marcus must run on the Jetson Orin NX GPU.
|
iGPU memory and forces Ollama into a 30/70 CPU/GPU split that crawls
|
||||||
|
vision queries. Matching Marcus_v1's working architecture, the default
|
||||||
|
is now "cpu" — YOLO gets ~1-3 FPS on Orin CPU which is plenty for
|
||||||
|
"is there a person" queries, and Qwen keeps the whole iGPU. Set
|
||||||
|
yolo_device="cuda" only if VLM is disabled (subsystems.vlm=false).
|
||||||
|
|
||||||
|
Returns (device_str, use_half). Never raises for CPU; raises for a
|
||||||
|
CUDA request only when CUDA is genuinely unavailable.
|
||||||
"""
|
"""
|
||||||
req = (requested or "cuda").lower()
|
req = (requested or "cpu").lower()
|
||||||
|
|
||||||
if req == "cpu":
|
if req == "cpu":
|
||||||
raise RuntimeError(
|
# half-precision only makes sense on GPU; force fp32 on CPU
|
||||||
"[YOLO] yolo_device='cpu' in config — Marcus requires GPU. "
|
return "cpu", False
|
||||||
"Set yolo_device to 'cuda' in Config/config_Vision.json."
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import torch
|
import torch
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
"[YOLO] PyTorch not installed — cannot run on GPU. "
|
"[YOLO] PyTorch not installed — cannot run on CUDA. "
|
||||||
"Install CUDA-enabled torch on the Jetson."
|
"Either install CUDA-enabled torch, or set "
|
||||||
|
"yolo_device='cpu' in Config/config_Vision.json."
|
||||||
) from e
|
) from e
|
||||||
|
|
||||||
if not torch.cuda.is_available():
|
if not torch.cuda.is_available():
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
"[YOLO] CUDA not available — torch.cuda.is_available() == False. "
|
"[YOLO] yolo_device='cuda' but torch.cuda.is_available()==False. "
|
||||||
"Check nvidia driver / JetPack CUDA runtime on the Jetson "
|
"Either fix CUDA (tegrastats, nvcc --version) or set "
|
||||||
"(try `nvidia-smi` or `tegrastats`)."
|
"yolo_device='cpu' in Config/config_Vision.json."
|
||||||
)
|
)
|
||||||
|
|
||||||
dev = req if (req.startswith("cuda") or req == "0") else "cuda"
|
dev = req if (req.startswith("cuda") or req == "0") else "cuda"
|
||||||
@ -321,16 +332,27 @@ def _inference_loop(model, is_ppe: bool, raw_frame_ref, frame_lock,
|
|||||||
"""Background inference loop. Reads frames, updates _latest_detections."""
|
"""Background inference loop. Reads frames, updates _latest_detections."""
|
||||||
frame_count = 0
|
frame_count = 0
|
||||||
t_fps = time.time()
|
t_fps = time.time()
|
||||||
|
# Minimum wall-clock interval between inferences, in seconds. On CPU this
|
||||||
|
# is the main throttle; on CUDA the model itself limits throughput.
|
||||||
|
min_period = 1.0 / max(YOLO_FPS_CAP, 0.1)
|
||||||
|
last_infer = 0.0
|
||||||
|
|
||||||
while _yolo_running[0]:
|
while _yolo_running[0]:
|
||||||
if _yolo_paused[0]:
|
if _yolo_paused[0]:
|
||||||
time.sleep(0.03)
|
time.sleep(0.03)
|
||||||
continue
|
continue
|
||||||
|
# FPS cap — on CPU especially, we don't want YOLO to hammer the cores
|
||||||
|
# that Holosoma's 50 Hz RL policy also needs.
|
||||||
|
dt_since = time.time() - last_infer
|
||||||
|
if dt_since < min_period:
|
||||||
|
time.sleep(min(0.05, min_period - dt_since))
|
||||||
|
continue
|
||||||
with frame_lock:
|
with frame_lock:
|
||||||
frame = raw_frame_ref[0]
|
frame = raw_frame_ref[0]
|
||||||
if frame is None:
|
if frame is None:
|
||||||
time.sleep(0.05)
|
time.sleep(0.05)
|
||||||
continue
|
continue
|
||||||
|
last_infer = time.time()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
results = model(
|
results = model(
|
||||||
|
|||||||
@ -61,6 +61,15 @@ Environment="OLLAMA_CONTEXT_LENGTH=1024"
|
|||||||
# nobody else wants it. Nice=10 = lowest normal priority.
|
# nobody else wants it. Nice=10 = lowest normal priority.
|
||||||
IOSchedulingClass=idle
|
IOSchedulingClass=idle
|
||||||
Nice=10
|
Nice=10
|
||||||
|
# Make Ollama the preferred OOM-killer victim. If the 15 GiB Jetson iGPU +
|
||||||
|
# system memory fills up (e.g. swap thrashing after a vision request),
|
||||||
|
# kernel picks Ollama first — it auto-restarts, the model cold-loads on
|
||||||
|
# next request, Marcus and Holosoma keep running. Much better outcome than
|
||||||
|
# killing Marcus (robot brain) or Holosoma (balance control).
|
||||||
|
# Observed 2026-04-22: with oom_score_adj=0 across the board, the kernel
|
||||||
|
# killed Marcus's python3 (22 GB virtual, swapped) instead of Ollama
|
||||||
|
# (larger RSS but systemd-managed). This flips that.
|
||||||
|
OOMScoreAdjust=500
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
chmod 644 "$DROPIN_FILE"
|
chmod 644 "$DROPIN_FILE"
|
||||||
|
|||||||
132
run.sh
Executable file
132
run.sh
Executable file
@ -0,0 +1,132 @@
|
|||||||
|
#!/usr/bin/env bash
# run.sh — one-command launcher for Marcus on the Jetson.
#
# What it does (safely, in order):
#   1. Ensures systemd Ollama is running (with drop-in flags).
#   2. Pulls the VLM model if it's missing.
#   3. Runs the VLM warmup (asks you to put the robot in squat first).
#   4. Health-gates iGPU placement and system memory.
#   5. Pauses for you to stand the robot up, then launches Marcus.
#
# Fail-fast: a failed memory check aborts BEFORE the robot stands.
# Sub-optimal iGPU placement only prints a warning.
#
# Usage:
#   ./run.sh                       normal launch
#   ./run.sh --skip-warmup         skip warmup (only safe if already warm)
#   ./run.sh --skip-stand-prompt   don't pause for stand-up confirmation
#
# First time on a fresh Jetson (one-time, not run here):
#   sudo ./install_ollama_jetson.sh

set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
cd "$SCRIPT_DIR"

SKIP_WARMUP=0
SKIP_STAND_PROMPT=0
for arg in "$@"; do
    case "$arg" in
        --skip-warmup)       SKIP_WARMUP=1 ;;
        --skip-stand-prompt) SKIP_STAND_PROMPT=1 ;;
        -h|--help)
            # Print only the header comment block (everything after the
            # shebang up to the first non-comment line), not every
            # column-0 comment in the script.
            awk 'NR == 1 { next } /^#/ { sub(/^# ?/, ""); print; next } { exit }' "$0"
            exit 0
            ;;
        *)
            echo "Unknown argument: $arg" >&2
            echo "Use --help for usage." >&2
            exit 2
            ;;
    esac
done

# ── tiny color helpers ──────────────────────────────────────────────
# The escapes are stored as literal backslash sequences, so every line that
# uses them must be printed with `echo -e`.
BOLD='\033[1m'; GREEN='\033[32m'; YELLOW='\033[33m'; RED='\033[31m'; DIM='\033[2m'; OFF='\033[0m'
step() { echo -e "\n${BOLD}▶ $1${OFF}"; }
ok()   { echo -e "  ${GREEN}✓${OFF} $1"; }
warn() { echo -e "  ${YELLOW}⚠${OFF} $1"; }
die()  { echo -e "  ${RED}✗${OFF} $1"; exit 1; }

# Model name comes from the same config file Marcus reads at runtime.
MODEL=$(python3 -c "import json; print(json.load(open('Config/config_Brain.json'))['ollama_model'])")

# ── 1. Ensure Ollama service is up ─────────────────────────────────
step "1/5  Ollama systemd service"
if ! systemctl is-active --quiet ollama; then
    warn "service is not active — starting it"
    sudo systemctl start ollama
    sleep 2
fi
if ! systemctl is-active --quiet ollama; then
    die "failed to start Ollama; check: systemctl status ollama"
fi
ADJUST=$(systemctl show ollama -p OOMScoreAdjust --value)
if [[ "$ADJUST" != "500" ]]; then
    warn "OOMScoreAdjust is '$ADJUST' (expected 500)"
    warn "drop-in may not be installed — run: sudo ./install_ollama_jetson.sh"
fi
ok "service active (OOMScoreAdjust=$ADJUST)"

# ── 2. Ensure the model is pulled ──────────────────────────────────
step "2/5  VLM model present"
# -F: match the model name literally; names such as "qwen2.5vl:3b" contain
# '.', which would otherwise act as a regex wildcard.
if ! ollama list | awk 'NR>1 {print $1}' | grep -qxF -- "$MODEL"; then
    warn "$MODEL not in store — pulling (~2.2 GB, one-time)"
    ollama pull "$MODEL"
fi
ok "$MODEL available in Ollama store"

# ── 3. Warmup (robot-in-squat safety banner lives inside the script) ──
step "3/5  VLM warmup"
if [[ $SKIP_WARMUP -eq 1 ]]; then
    warn "skipped by --skip-warmup"
else
    ./warmup_vlm.sh "$MODEL"
fi

# ── 4. Health gates ────────────────────────────────────────────────
step "4/5  Health check"

# 4a. PROCESSOR placement: find the model's row in `ollama ps` and print the
#     percentage token together with the GPU / CPU / CPU/GPU token after it.
PROCESSOR=$(ollama ps | awk -v m="$MODEL" 'NR>1 && $1==m {
    for(i=4;i<=NF;i++) if($i=="GPU" || $i=="CPU/GPU" || $i=="CPU") {print $(i-1) " " $i; break}
}')
if [[ -z "$PROCESSOR" ]]; then
    warn "model not loaded — running without prewarm; first vision call will cold-load"
elif echo "$PROCESSOR" | grep -qE "^100% GPU"; then
    ok "placement: $PROCESSOR (ideal)"
elif echo "$PROCESSOR" | grep -qE "CPU/GPU"; then
    warn "placement: $PROCESSOR — partial CPU offload will slow vision queries"
    warn "if unacceptable, lower OLLAMA_GPU_OVERHEAD in install_ollama_jetson.sh"
else
    warn "placement: $PROCESSOR"
fi

# 4b. system memory (values in MiB, taken from `free -m`)
read -r MEM_USED MEM_AVAIL SWAP_USED <<< "$(free -m | awk '
    /^Mem:/  { used=$3; avail=$7 }
    /^Swap:/ { sused=$3 }
    END      { print used, avail, sused }
')"
echo -e "  ${DIM}mem used: ${MEM_USED} MiB   available: ${MEM_AVAIL} MiB   swap used: ${SWAP_USED} MiB${OFF}"
# Refuse to continue on unparseable output instead of letting empty values
# silently evaluate as 0 in the arithmetic tests below.
if ! [[ "$MEM_AVAIL" =~ ^[0-9]+$ && "$SWAP_USED" =~ ^[0-9]+$ ]]; then
    die "could not parse 'free -m' output — refusing to launch without a memory check"
fi
if (( SWAP_USED > 500 )); then
    die "swap in use (${SWAP_USED} MiB) — memory is already tight, would thrash under load"
fi
if (( MEM_AVAIL < 1000 )); then
    die "only ${MEM_AVAIL} MiB available — Holosoma/Marcus/camera won't have room"
fi
ok "memory healthy"

# ── 5. Stand the robot, then launch ────────────────────────────────
step "5/5  Launch Marcus"
if [[ $SKIP_STAND_PROMPT -eq 0 ]]; then
    echo -e "  ${YELLOW}Robot can now stand up.${OFF}"
    echo -e "  ${DIM}Press ENTER when the robot is standing, or Ctrl-C to abort.${OFF}"
    read -r
fi
echo

# exec replaces this shell with the Marcus process.
exec python3 run_marcus.py
|
||||||
@ -19,16 +19,13 @@ PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
|
|||||||
if PROJECT_ROOT not in sys.path:
|
if PROJECT_ROOT not in sys.path:
|
||||||
sys.path.insert(0, PROJECT_ROOT)
|
sys.path.insert(0, PROJECT_ROOT)
|
||||||
|
|
||||||
# Tell the Linux OOM killer to pick Ollama (easily restarted) or any other
|
# NOTE: we intentionally do NOT try to lower Marcus's oom_score_adj here.
|
||||||
# process before Marcus. -900 is almost-but-not-quite OOM-immune; we don't
|
# The Linux kernel requires CAP_SYS_RESOURCE to set a negative oom_score_adj,
|
||||||
# use -1000 because that disables OOM handling entirely, which is risky if
|
# and Marcus typically runs as an unprivileged user, so any write fails with
|
||||||
# Marcus ever had a runaway allocation. Writing oom_score_adj doesn't need
|
# PermissionError — silent no-op. Instead, install_ollama_jetson.sh adds
|
||||||
# root — a process can always lower its own score.
|
# OOMScoreAdjust=500 to the Ollama systemd unit. Under memory pressure the
|
||||||
try:
|
# kernel then kills Ollama (auto-restarts, model cold-loads again on next
|
||||||
with open(f"/proc/{os.getpid()}/oom_score_adj", "w") as _f:
|
# vision query) rather than Marcus (robot brain).
|
||||||
_f.write("-900")
|
|
||||||
except OSError:
|
|
||||||
pass # not fatal — running under a restrictive sandbox
|
|
||||||
|
|
||||||
from Brain.marcus_brain import run_terminal
|
from Brain.marcus_brain import run_terminal
|
||||||
|
|
||||||
|
|||||||
@ -1,22 +1,34 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
# start_ollama.sh — launch Ollama with Jetson-friendly memory settings
|
# start_ollama.sh — DEPRECATED, do not use.
|
||||||
#
|
#
|
||||||
# The Jetson Orin NX has 16 GB unified CPU+GPU memory. When Marcus + YOLO +
|
# This script launches Ollama as the current (unitree) user and completely
|
||||||
# Whisper + Holosoma + the camera + audio all run alongside Qwen2.5-VL,
|
# BYPASSES the systemd drop-in at /etc/systemd/system/ollama.service.d/,
|
||||||
# the compute-graph OOMs the llama runner and Linux kills the biggest
|
# which is where all the safety-critical flags live:
|
||||||
# process (often Holosoma — which is a safety problem for locomotion).
|
|
||||||
#
|
#
|
||||||
# These env vars cut Ollama's memory footprint:
|
# OOMScoreAdjust=500 (makes Ollama the preferred OOM victim)
|
||||||
# OLLAMA_FLASH_ATTENTION=1 ~30 % less memory for attention tensors
|
# IOSchedulingClass=idle (so disk reads don't starve Holosoma)
|
||||||
# OLLAMA_KV_CACHE_TYPE=q8_0 quantize KV cache (halves it)
|
# Nice=10 (lowest normal scheduler priority)
|
||||||
# OLLAMA_KEEP_ALIVE=2m keep the model warm for 2 min then evict
|
# OLLAMA_GPU_OVERHEAD=4 GiB (reserves iGPU for YOLO/Holosoma/camera)
|
||||||
# (adjust if cold-load lag matters more
|
# OLLAMA_KEEP_ALIVE=24h (model stays resident all day)
|
||||||
# than idle memory)
|
# OLLAMA_CONTEXT_LENGTH=1024 (shrinks compute graph)
|
||||||
# OLLAMA_MAX_LOADED_MODELS=1 never hold two VL models at once
|
# OLLAMA_FLASH_ATTENTION=1
|
||||||
|
# OLLAMA_KV_CACHE_TYPE=q8_0
|
||||||
|
# OLLAMA_MAX_LOADED_MODELS=1
|
||||||
#
|
#
|
||||||
# Usage:
|
# Running this script instead of systemd means NONE of those protections
|
||||||
# ./start_ollama.sh # starts server in background, logs to /tmp/ollama.log
|
# are active, and the robot WILL fall the next time a vision query runs.
|
||||||
# ./start_ollama.sh --fg # runs in foreground (for debugging)
|
#
|
||||||
|
# CORRECT WAY:
|
||||||
|
# sudo systemctl start ollama # uses the drop-in
|
||||||
|
# ./warmup_vlm.sh # then warm up
|
||||||
|
# python3 run_marcus.py # then start Marcus
|
||||||
|
#
|
||||||
|
# ═════════════════════════════════════════════════════════════════════
|
||||||
|
# REFUSING TO RUN. Delete this block only if you know what you're doing.
|
||||||
|
# ═════════════════════════════════════════════════════════════════════
|
||||||
|
echo "start_ollama.sh is DEPRECATED. Use: sudo systemctl start ollama" >&2
|
||||||
|
echo " (see comments at the top of this file for why)" >&2
|
||||||
|
exit 1
|
||||||
|
|
||||||
pkill -f "ollama (runner|serve)" 2>/dev/null
|
pkill -f "ollama (runner|serve)" 2>/dev/null
|
||||||
sleep 1
|
sleep 1
|
||||||
|
|||||||
@ -24,6 +24,16 @@ set -e
|
|||||||
MODEL="${1:-qwen2.5vl:3b}"
|
MODEL="${1:-qwen2.5vl:3b}"
|
||||||
HOST="http://127.0.0.1:11434"
|
HOST="http://127.0.0.1:11434"
|
||||||
|
|
||||||
|
# Pull num_batch / num_ctx from config_Brain.json. CRITICAL: these must match
|
||||||
|
# what Marcus will send at runtime, otherwise Ollama evicts the cached runner
|
||||||
|
# on the FIRST real vision command and re-instantiates it — the overlap of
|
||||||
|
# old-runner-unloading + new-runner-loading triggers an iGPU OOM and the
|
||||||
|
# runner dies with status 500. Symptom: warmup succeeds, first `what do you
|
||||||
|
# see` crashes 10-30 s later with "llama runner process has terminated".
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||||
|
NUM_BATCH=$(python3 -c "import json; print(json.load(open('$SCRIPT_DIR/Config/config_Brain.json'))['num_batch'])" 2>/dev/null || echo 64)
|
||||||
|
NUM_CTX=$(python3 -c "import json; print(json.load(open('$SCRIPT_DIR/Config/config_Brain.json'))['num_ctx'])" 2>/dev/null || echo 1024)
|
||||||
|
|
||||||
# Sanity: is ollama reachable?
|
# Sanity: is ollama reachable?
|
||||||
if ! curl -sf "$HOST/api/version" > /dev/null; then
|
if ! curl -sf "$HOST/api/version" > /dev/null; then
|
||||||
echo "✗ Ollama is not running on $HOST"
|
echo "✗ Ollama is not running on $HOST"
|
||||||
@ -64,6 +74,7 @@ else
|
|||||||
BANNER
|
BANNER
|
||||||
read -r
|
read -r
|
||||||
echo "→ Cold-loading $MODEL into iGPU (60-90 s, do not interrupt)..."
|
echo "→ Cold-loading $MODEL into iGPU (60-90 s, do not interrupt)..."
|
||||||
|
echo " num_batch=$NUM_BATCH num_ctx=$NUM_CTX (matching Marcus's runtime config)"
|
||||||
echo " Watching tegrastats in a second window is useful — GPU should spike."
|
echo " Watching tegrastats in a second window is useful — GPU should spike."
|
||||||
echo
|
echo
|
||||||
fi
|
fi
|
||||||
@ -78,7 +89,7 @@ START=$(date +%s)
|
|||||||
# unloading the model after the default 5 min.
|
# unloading the model after the default 5 min.
|
||||||
RESPONSE=$(curl -s "$HOST/api/generate" \
|
RESPONSE=$(curl -s "$HOST/api/generate" \
|
||||||
-H 'Content-Type: application/json' \
|
-H 'Content-Type: application/json' \
|
||||||
-d "{\"model\":\"$MODEL\",\"prompt\":\"ok\",\"stream\":false,\"keep_alive\":\"24h\",\"options\":{\"num_predict\":1,\"num_batch\":128,\"num_ctx\":1024}}")
|
-d "{\"model\":\"$MODEL\",\"prompt\":\"ok\",\"stream\":false,\"keep_alive\":\"24h\",\"options\":{\"num_predict\":1,\"num_batch\":$NUM_BATCH,\"num_ctx\":$NUM_CTX}}")
|
||||||
|
|
||||||
END=$(date +%s)
|
END=$(date +%s)
|
||||||
ELAPSED=$((END - START))
|
ELAPSED=$((END - START))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user