diff --git a/API/camera_api.py b/API/camera_api.py
index 15644a8..b0c77d4 100644
--- a/API/camera_api.py
+++ b/API/camera_api.py
@@ -112,3 +112,29 @@ def get_frame():
 def get_frame_age() -> float:
     """Return seconds since last camera frame."""
     return time.time() - _cam_last_frame_time[0] if _cam_last_frame_time[0] > 0 else 999.0
+
+
+def get_fresh_frame(max_age_s: float = 0.3, timeout_s: float = 1.0):
+    """
+    Return a camera frame newer than `max_age_s` seconds. If the buffer
+    already has a fresh frame, returns immediately; otherwise sleeps in
+    short increments (keeping the GIL free for the camera thread) until
+    one arrives or `timeout_s` elapses.
+
+    Reason this exists: during a VLM request + TTS + executor cycle, the
+    main Python thread can monopolize the GIL long enough that the camera
+    thread hasn't written a new frame by the time the next user query
+    arrives. Calling `get_frame()` then returns the SAME bytes as last
+    call → the VLM with temperature=0 returns the SAME answer → it looks
+    like Marcus isn't actually looking at the current scene. Forcing a
+    frame < 300 ms old makes each vision query see real current data.
+    """
+    deadline = time.time() + timeout_s
+    while time.time() < deadline:
+        if get_frame_age() < max_age_s and latest_frame_b64[0] is not None:
+            with camera_lock:
+                return latest_frame_b64[0]
+        time.sleep(0.03)
+    # Timed out — return whatever we have (may be stale, better than None)
+    with camera_lock:
+        return latest_frame_b64[0]
diff --git a/Brain/marcus_brain.py b/Brain/marcus_brain.py
index c88ed4f..1676b01 100644
--- a/Brain/marcus_brain.py
+++ b/Brain/marcus_brain.py
@@ -18,7 +18,7 @@ if PROJECT_DIR not in sys.path:
     sys.path.insert(0, PROJECT_DIR)
 
 from API.zmq_api import init_zmq, send_vel, gradual_stop, send_cmd
-from API.camera_api import start_camera, stop_camera, get_frame
+from API.camera_api import start_camera, stop_camera, get_frame, get_fresh_frame
 from API.yolo_api import (
     init_yolo, yolo_summary, yolo_fps,
     yolo_all_classes, yolo_closest, yolo_sees,
@@ -78,7 +78,15 @@ def init_brain():
     init_zmq()
 
     raw_frame, raw_lock = start_camera()
-    init_yolo(raw_frame, raw_lock)
+    # YOLO is optional on the Jetson: with Qwen2.5-VL loaded, YOLO's ~2 GiB
+    # of iGPU memory pushes Ollama into a 30/70 CPU/GPU split and inference
+    # crawls. Set subsystems.yolo=false in config_Brain.json to skip it
+    # entirely; the VLM can describe the scene directly. Autonomous/patrol
+    # features that rely on YOLO degrade gracefully via the yolo_api stubs.
+    if subsys.get("yolo", True):
+        init_yolo(raw_frame, raw_lock)
+    else:
+        print("  [YOLO] disabled by config (subsystems.yolo=false) — saves ~2 GiB iGPU for VLM")
 
     from API.zmq_api import get_socket
     init_odometry(zmq_sock=get_socket())
@@ -353,7 +361,7 @@ def _handle_search(cmd):
 def _handle_talk(cmd):
     print("Thinking...")
     try:
-        img = get_frame()
+        img = get_fresh_frame()
         facts_str = ""
         try:
             from API.llava_api import _facts
@@ -374,7 +382,10 @@ def _handle_llava(cmd):
     print("Thinking...")
     t0 = time.time()
-    img = get_frame()
+    # get_fresh_frame() blocks up to 1 s waiting for a frame newer than
+    # 300 ms old. Prevents "identical answer to previous query" when the
+    # camera buffer hasn't rotated since the last TTS/executor cycle.
+    img = get_fresh_frame()
     # Poll up to 500 ms in 50 ms slices instead of blocking a full second.
     # Returns the moment a frame is available — most drops recover in <100 ms.
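The freshness contract above is easier to see in isolation. A minimal, runnable sketch (not part of the patch: the fake writer thread stands in for the real camera thread, and the shared names mirror `camera_api.py`):

```python
import threading
import time

# Shared state mirroring camera_api.py: a one-slot frame buffer plus the
# wall-clock time of the last write, both guarded by camera_lock.
camera_lock = threading.Lock()
latest_frame_b64 = [None]
_cam_last_frame_time = [0.0]

def get_frame_age():
    t = _cam_last_frame_time[0]
    return time.time() - t if t > 0 else 999.0

def get_fresh_frame(max_age_s=0.3, timeout_s=1.0):
    # Same loop as the patch: spin in 30 ms sleeps until a fresh frame lands.
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        if get_frame_age() < max_age_s and latest_frame_b64[0] is not None:
            with camera_lock:
                return latest_frame_b64[0]
        time.sleep(0.03)
    with camera_lock:
        return latest_frame_b64[0]   # possibly stale; better than None

def fake_camera_thread():
    # Publishes a "frame" after 0.5 s, simulating a camera thread that was
    # starved of the GIL during a VLM + TTS cycle.
    time.sleep(0.5)
    with camera_lock:
        latest_frame_b64[0] = "aGVsbG8="   # placeholder base64 payload
        _cam_last_frame_time[0] = time.time()

threading.Thread(target=fake_camera_thread, daemon=True).start()
print(get_fresh_frame())   # blocks ~0.5 s, then prints the fresh payload
```

Where `get_frame()` would have returned the previous query's bytes (or `None`) immediately, the call above waits out the gap and answers with current data.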
diff --git a/Config/config_Brain.json b/Config/config_Brain.json
index 57d90f2..7d3f538 100644
--- a/Config/config_Brain.json
+++ b/Config/config_Brain.json
@@ -2,18 +2,19 @@
     "ollama_model": "qwen2.5vl:3b",
     "ollama_host": "http://127.0.0.1:11434",
     "max_history": 6,
-    "num_batch": 64,
+    "num_batch": 32,
     "num_ctx": 1024,
     "subsystems": {
         "vlm": true,
-        "lidar": true,
+        "yolo": true,
+        "lidar": false,
         "voice": true,
         "imgsearch": false,
         "autonomous": true
     },
-    "num_predict_main": 120,
-    "num_predict_goal": 80,
-    "num_predict_patrol": 100,
-    "num_predict_talk": 80,
+    "num_predict_main": 50,
+    "num_predict_goal": 40,
+    "num_predict_patrol": 50,
+    "num_predict_talk": 50,
     "num_predict_verify": 10
 }
diff --git a/Config/config_Vision.json b/Config/config_Vision.json
index dc83d33..af6a035 100644
--- a/Config/config_Vision.json
+++ b/Config/config_Vision.json
@@ -2,9 +2,10 @@
     "yolo_model_path": "Models/yolov8m.pt",
     "yolo_confidence": 0.45,
     "yolo_iou": 0.45,
-    "yolo_device": "cuda",
+    "yolo_device": "cpu",
     "yolo_half": true,
     "yolo_img_size": 320,
+    "yolo_fps_cap": 2.0,
     "tracked_classes": [
         "person", "chair", "couch", "bed", "dining table",
         "bottle", "cup", "laptop", "keyboard", "mouse",
diff --git a/Vision/marcus_yolo.py b/Vision/marcus_yolo.py
index 6d6a9e4..d646454 100644
--- a/Vision/marcus_yolo.py
+++ b/Vision/marcus_yolo.py
@@ -37,39 +37,50 @@ except Exception as _e:
 YOLO_MODEL_PATH = os.path.join(_PROJECT_ROOT, _cfg.get("yolo_model_path", "Models/yolov8m.pt"))
 YOLO_CONFIDENCE = float(_cfg.get("yolo_confidence", 0.45))
 YOLO_IOU = float(_cfg.get("yolo_iou", 0.45))
-YOLO_DEVICE = _cfg.get("yolo_device", "cuda")  # "cuda" | "0" | "cuda:N"
+YOLO_DEVICE = _cfg.get("yolo_device", "cpu")   # "cpu" | "cuda" | "0" | "cuda:N"
 YOLO_IMG_SIZE = int(_cfg.get("yolo_img_size", 320))
-YOLO_HALF = bool(_cfg.get("yolo_half", True))  # FP16 on GPU
+YOLO_HALF = bool(_cfg.get("yolo_half", True))  # FP16 on GPU (ignored on CPU)
+# FPS cap. On CPU, the Orin NX manages ~2-3 FPS of YOLOv8m @ 320 px; we
+# throttle lower so CPU inference doesn't compete with Holosoma for cycles.
+# On CUDA the cap is effectively moot (GPU inference plus the existing
+# 0.02 s sleep already tops out around ~21 FPS).
+YOLO_FPS_CAP = float(_cfg.get("yolo_fps_cap", 2.0))
 
 
 def _resolve_device(requested: str) -> tuple:
     """
-    Resolve the inference device. GPU is required — no CPU fallback.
+    Resolve the YOLO inference device. Both GPU and CPU are supported.
 
-    Returns (device_str, use_half). Raises RuntimeError if CUDA is unavailable
-    or if the config requests CPU. Marcus must run on the Jetson Orin NX GPU.
+    On Jetson Orin NX with Qwen2.5-VL loaded, YOLO on "cuda" takes ~2 GiB of
+    iGPU memory and forces Ollama into a 30/70 CPU/GPU split that crawls
+    vision queries. Matching Marcus_v1's working architecture, the default
+    is now "cpu" — YOLO gets ~2-3 FPS on the Orin CPU, which is plenty for
+    "is there a person" queries, and Qwen keeps the whole iGPU. Set
+    yolo_device="cuda" only if the VLM is disabled (subsystems.vlm=false).
+
+    Returns (device_str, use_half). Never raises for CPU; raises for a
+    CUDA request only when CUDA is genuinely unavailable.
     """
-    req = (requested or "cuda").lower()
+    req = (requested or "cpu").lower()
     if req == "cpu":
-        raise RuntimeError(
-            "[YOLO] yolo_device='cpu' in config — Marcus requires GPU. "
-            "Set yolo_device to 'cuda' in Config/config_Vision.json."
-        )
+        # half-precision only makes sense on GPU; force fp32 on CPU
+        return "cpu", False
 
     try:
         import torch
     except ImportError as e:
         raise RuntimeError(
-            "[YOLO] PyTorch not installed — cannot run on GPU. "
-            "Install CUDA-enabled torch on the Jetson."
+            "[YOLO] PyTorch not installed — cannot run on CUDA. "
+            "Either install CUDA-enabled torch, or set "
+            "yolo_device='cpu' in Config/config_Vision.json."
         ) from e
 
     if not torch.cuda.is_available():
         raise RuntimeError(
-            "[YOLO] CUDA not available — torch.cuda.is_available() == False. "
-            "Check nvidia driver / JetPack CUDA runtime on the Jetson "
-            "(try `nvidia-smi` or `tegrastats`)."
+            "[YOLO] yolo_device='cuda' but torch.cuda.is_available()==False. "
+            "Either fix CUDA (tegrastats, nvcc --version) or set "
+            "yolo_device='cpu' in Config/config_Vision.json."
         )
 
     dev = req if (req.startswith("cuda") or req == "0") else "cuda"
@@ -321,16 +332,27 @@ def _inference_loop(model, is_ppe: bool, raw_frame_ref, frame_lock,
     """Background inference loop. Reads frames, updates _latest_detections."""
     frame_count = 0
     t_fps = time.time()
+    # Minimum wall-clock interval between inferences, in seconds. On CPU this
+    # is the main throttle; on CUDA the model itself limits throughput.
+    min_period = 1.0 / max(YOLO_FPS_CAP, 0.1)
+    last_infer = 0.0
 
     while _yolo_running[0]:
         if _yolo_paused[0]:
             time.sleep(0.03)
             continue
+        # FPS cap — on CPU especially, we don't want YOLO to hammer the cores
+        # that Holosoma's 50 Hz RL policy also needs.
+        dt_since = time.time() - last_infer
+        if dt_since < min_period:
+            time.sleep(min(0.05, min_period - dt_since))
+            continue
         with frame_lock:
             frame = raw_frame_ref[0]
         if frame is None:
             time.sleep(0.05)
             continue
+        last_infer = time.time()
 
         try:
             results = model(
diff --git a/install_ollama_jetson.sh b/install_ollama_jetson.sh
index d76b3cb..7e39a09 100755
--- a/install_ollama_jetson.sh
+++ b/install_ollama_jetson.sh
@@ -61,6 +61,15 @@ Environment="OLLAMA_CONTEXT_LENGTH=1024"
 # nobody else wants it. Nice=10 = lowest normal priority.
IOSchedulingClass=idle
Nice=10
+# Make Ollama the preferred OOM-killer victim. If the 15 GiB Jetson iGPU +
+# system memory fills up (e.g. swap thrashing after a vision request), the
+# kernel picks Ollama first — it auto-restarts, the model cold-loads on the
+# next request, and Marcus and Holosoma keep running. Much better outcome
+# than killing Marcus (robot brain) or Holosoma (balance control).
+# Observed 2026-04-22: with oom_score_adj=0 across the board, the kernel
+# killed Marcus's python3 (22 GB virtual, swapped) instead of Ollama
+# (larger RSS but systemd-managed). This flips that.
+OOMScoreAdjust=500
 EOF
 
 chmod 644 "$DROPIN_FILE"
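The throttle added to `_inference_loop` is a small, reusable pattern: remember the last inference timestamp and sleep in short slices until the minimum period has elapsed, so shutdown/pause flags stay responsive. A standalone sketch (illustrative values; `time.sleep(0.05)` stands in for `model(frame)`):

```python
import time

FPS_CAP = 2.0                          # from config: yolo_fps_cap
min_period = 1.0 / max(FPS_CAP, 0.1)   # 0.5 s between inferences at 2 FPS
last_infer = 0.0
done = 0

t0 = time.time()
while time.time() - t0 < 3.0:          # run the demo for ~3 s
    dt_since = time.time() - last_infer
    if dt_since < min_period:
        # sleep in <=50 ms slices so pause/stop flags stay responsive
        time.sleep(min(0.05, min_period - dt_since))
        continue
    last_infer = time.time()
    time.sleep(0.05)                   # stand-in for model(frame) on CPU
    done += 1

print(f"{done} inferences in 3 s ≈ {done / 3:.1f} FPS (capped at {FPS_CAP})")
```

Stamping `last_infer` before the inference (not after) makes the cap a rate on inference starts, which matches the loop in `marcus_yolo.py`.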
diff --git a/run.sh b/run.sh
new file mode 100755
index 0000000..1337304
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,132 @@
+#!/usr/bin/env bash
+# run.sh — one-command launcher for Marcus on the Jetson.
+#
+# What it does (safely, in order):
+#   1. Ensures systemd Ollama is running (with drop-in flags).
+#   2. Pulls the VLM model if it's missing.
+#   3. Runs the VLM warmup (asks you to put the robot in squat first).
+#   4. Health-gates iGPU placement and system memory.
+#   5. Pauses for you to stand the robot up.
+#   6. Launches Marcus.
+#
+# Fail-fast: any health check failure aborts BEFORE the robot stands.
+#
+# Usage:
+#   ./run.sh                      normal launch
+#   ./run.sh --skip-warmup        skip warmup (only safe if already warm)
+#   ./run.sh --skip-stand-prompt  don't pause for stand-up confirmation
+#
+# First time on a fresh Jetson (one-time, not run here):
+#   sudo ./install_ollama_jetson.sh
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+cd "$SCRIPT_DIR"
+
+SKIP_WARMUP=0
+SKIP_STAND_PROMPT=0
+for arg in "$@"; do
+    case "$arg" in
+        --skip-warmup)       SKIP_WARMUP=1 ;;
+        --skip-stand-prompt) SKIP_STAND_PROMPT=1 ;;
+        -h|--help)
+            # print only the header block: skip the shebang, stop at the
+            # first blank line (not every '#' line in the whole script)
+            sed -n '2,/^$/{s/^# \{0,1\}//p}' "$0"
+            exit 0
+            ;;
+        *)
+            echo "Unknown argument: $arg" >&2
+            echo "Use --help for usage." >&2
+            exit 2
+            ;;
+    esac
+done
+
+# ── tiny color helpers ──────────────────────────────────────────────
+BOLD='\033[1m'; GREEN='\033[32m'; YELLOW='\033[33m'; RED='\033[31m'; DIM='\033[2m'; OFF='\033[0m'
+step() { echo -e "\n${BOLD}▶ $1${OFF}"; }
+ok()   { echo -e "  ${GREEN}✓${OFF} $1"; }
+warn() { echo -e "  ${YELLOW}⚠${OFF} $1"; }
+die()  { echo -e "  ${RED}✗${OFF} $1"; exit 1; }
+
+MODEL=$(python3 -c "import json; print(json.load(open('Config/config_Brain.json'))['ollama_model'])")
+HOST=$(python3 -c "import json; print(json.load(open('Config/config_Brain.json'))['ollama_host'])")
+
+# ── 1. Ensure Ollama service is up ─────────────────────────────────
+step "1/5 Ollama systemd service"
+if ! systemctl is-active --quiet ollama; then
+    warn "service is not active — starting it"
+    sudo systemctl start ollama
+    sleep 2
+fi
+if ! systemctl is-active --quiet ollama; then
+    die "failed to start Ollama; check: systemctl status ollama"
+fi
+ADJUST=$(systemctl show ollama -p OOMScoreAdjust --value)
+if [[ "$ADJUST" != "500" ]]; then
+    warn "OOMScoreAdjust is '$ADJUST' (expected 500)"
+    warn "drop-in may not be installed — run: sudo ./install_ollama_jetson.sh"
+fi
+ok "service active (OOMScoreAdjust=$ADJUST)"
+
+# ── 2. Ensure the model is pulled ──────────────────────────────────
+step "2/5 VLM model present"
+if ! ollama list | awk 'NR>1 {print $1}' | grep -qx "$MODEL"; then
+    warn "$MODEL not in store — pulling (~2.2 GB, one-time)"
+    ollama pull "$MODEL"
+fi
+ok "$MODEL available in Ollama store"
+
+# ── 3. Warmup (robot-in-squat safety banner lives inside the script) ──
+step "3/5 VLM warmup"
+if [[ $SKIP_WARMUP -eq 1 ]]; then
+    warn "skipped by --skip-warmup"
+else
+    ./warmup_vlm.sh "$MODEL"
+fi
+
+# ── 4. Health gates ────────────────────────────────────────────────
+step "4/5 Health check"
+
+# 4a. PROCESSOR placement
+PROCESSOR=$(ollama ps | awk -v m="$MODEL" 'NR>1 && $1==m {
+    for(i=4;i<=NF;i++) if($i=="GPU" || $i=="CPU/GPU" || $i=="CPU") {print $(i-1) " " $i; break}
+}')
+if [[ -z "$PROCESSOR" ]]; then
+    warn "model not loaded — running without prewarm; first vision call will cold-load"
+elif echo "$PROCESSOR" | grep -qE "^100% GPU"; then
+    ok "placement: $PROCESSOR (ideal)"
+elif echo "$PROCESSOR" | grep -qE "CPU/GPU"; then
+    warn "placement: $PROCESSOR — partial CPU offload will slow vision queries"
+    warn "if unacceptable, lower OLLAMA_GPU_OVERHEAD in install_ollama_jetson.sh"
+else
+    warn "placement: $PROCESSOR"
+fi
+
+# 4b. system memory
+read -r MEM_USED MEM_AVAIL SWAP_USED <<< "$(free -m | awk '
+    /^Mem:/  { used=$3; avail=$7 }
+    /^Swap:/ { sused=$3 }
+    END      { print used, avail, sused }
+')"
+echo -e "  ${DIM}mem used: ${MEM_USED} MiB   available: ${MEM_AVAIL} MiB   swap used: ${SWAP_USED} MiB${OFF}"
+if (( SWAP_USED > 500 )); then
+    die "swap in use (${SWAP_USED} MiB) — memory is already tight, would thrash under load"
+fi
+if (( MEM_AVAIL < 1000 )); then
+    die "only ${MEM_AVAIL} MiB available — Holosoma/Marcus/camera won't have room"
+fi
+ok "memory healthy"
+
+# ── 5. Stand the robot, then launch ────────────────────────────────
+step "5/5 Launch Marcus"
+if [[ $SKIP_STAND_PROMPT -eq 0 ]]; then
+    echo -e "  ${YELLOW}Robot can now stand up.${OFF}"
+    echo -e "  ${DIM}Press ENTER when the robot is standing, or Ctrl-C to abort.${OFF}"
+    read -r
+fi
+echo
+
+exec python3 run_marcus.py
diff --git a/run_marcus.py b/run_marcus.py
index c1e9df4..70b5e98 100644
--- a/run_marcus.py
+++ b/run_marcus.py
@@ -19,16 +19,13 @@ PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
 if PROJECT_ROOT not in sys.path:
     sys.path.insert(0, PROJECT_ROOT)
 
-# Tell the Linux OOM killer to pick Ollama (easily restarted) or any other
-# process before Marcus. -900 is almost-but-not-quite OOM-immune; we don't
-# use -1000 because that disables OOM handling entirely, which is risky if
-# Marcus ever had a runaway allocation. Writing oom_score_adj doesn't need
-# root — a process can always lower its own score.
-try:
-    with open(f"/proc/{os.getpid()}/oom_score_adj", "w") as _f:
-        _f.write("-900")
-except OSError:
-    pass  # not fatal — running under a restrictive sandbox
+# NOTE: we intentionally do NOT try to lower Marcus's oom_score_adj here.
+# The Linux kernel requires CAP_SYS_RESOURCE to set a negative oom_score_adj,
+# and Marcus typically runs as an unprivileged user, so any write fails with
+# PermissionError — a silent no-op. Instead, install_ollama_jetson.sh adds
+# OOMScoreAdjust=500 to the Ollama systemd unit. Under memory pressure the
+# kernel then kills Ollama (auto-restarts, model cold-loads again on next
+# vision query) rather than Marcus (robot brain).
 
 from Brain.marcus_brain import run_terminal
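The kernel exposes both the adjustment and the final "badness" score per process, so the resulting victim ordering can be verified on a live system. A diagnostic sketch (not part of the patch):

```python
import os

def oom_badness(pid):
    """Return (oom_score, oom_score_adj) for a PID, as the kernel sees them."""
    with open(f"/proc/{pid}/oom_score") as f:
        score = int(f.read())
    with open(f"/proc/{pid}/oom_score_adj") as f:
        adj = int(f.read())
    return score, adj

# With the drop-in installed, Ollama's adj should read 500 and its score
# should be the highest on the box, i.e. it is the first OOM victim.
# Compare against Ollama's PID (e.g. from `pgrep -x ollama`).
print("this process:", oom_badness(os.getpid()))
```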
diff --git a/start_ollama.sh b/start_ollama.sh
index 39bede4..4de9f50 100755
--- a/start_ollama.sh
+++ b/start_ollama.sh
@@ -1,22 +1,34 @@
 #!/usr/bin/env bash
-# start_ollama.sh — launch Ollama with Jetson-friendly memory settings
+# start_ollama.sh — DEPRECATED, do not use.
 #
-# The Jetson Orin NX has 16 GB unified CPU+GPU memory. When Marcus + YOLO +
-# Whisper + Holosoma + the camera + audio all run alongside Qwen2.5-VL,
-# the compute-graph OOMs the llama runner and Linux kills the biggest
-# process (often Holosoma — which is a safety problem for locomotion).
+# This script launches Ollama as the current (unitree) user and completely
+# BYPASSES the systemd drop-in at /etc/systemd/system/ollama.service.d/,
+# which is where all the safety-critical flags live:
 #
-# These env vars cut Ollama's memory footprint:
-#   OLLAMA_FLASH_ATTENTION=1    ~30 % less memory for attention tensors
-#   OLLAMA_KV_CACHE_TYPE=q8_0   quantize KV cache (halves it)
-#   OLLAMA_KEEP_ALIVE=2m        keep the model warm for 2 min then evict
-#                               (adjust if cold-load lag matters more
-#                               than idle memory)
-#   OLLAMA_MAX_LOADED_MODELS=1  never hold two VL models at once
+#   OOMScoreAdjust=500           (makes Ollama the preferred OOM victim)
+#   IOSchedulingClass=idle       (so disk reads don't starve Holosoma)
+#   Nice=10                      (lowest normal scheduler priority)
+#   OLLAMA_GPU_OVERHEAD=4 GiB    (reserves iGPU for YOLO/Holosoma/camera)
+#   OLLAMA_KEEP_ALIVE=24h        (model stays resident all day)
+#   OLLAMA_CONTEXT_LENGTH=1024   (shrinks compute graph)
+#   OLLAMA_FLASH_ATTENTION=1
+#   OLLAMA_KV_CACHE_TYPE=q8_0
+#   OLLAMA_MAX_LOADED_MODELS=1
 #
-# Usage:
-#   ./start_ollama.sh        # starts server in background, logs to /tmp/ollama.log
-#   ./start_ollama.sh --fg   # runs in foreground (for debugging)
+# Running this script instead of systemd means NONE of those protections
+# are active, and the robot WILL fall the next time a vision query runs.
+#
+# CORRECT WAY:
+#   sudo systemctl start ollama    # uses the drop-in
+#   ./warmup_vlm.sh                # then warm up
+#   python3 run_marcus.py          # then start Marcus
+#
+# ═════════════════════════════════════════════════════════════════════
+#  REFUSING TO RUN. Delete this block only if you know what you're doing.
+# ═════════════════════════════════════════════════════════════════════
+echo "start_ollama.sh is DEPRECATED. Use: sudo systemctl start ollama" >&2
+echo "   (see comments at the top of this file for why)" >&2
+exit 1
 
 pkill -f "ollama (runner|serve)" 2>/dev/null
 sleep 1
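The `ollama ps` parsing that run.sh does with awk also has a programmatic equivalent, useful for checking that the keep-alive really holds the model resident. A sketch against Ollama's `/api/ps` endpoint (field names per the current Ollama REST API; verify against your installed version):

```python
import json
import urllib.request

# Query Ollama's "loaded models" endpoint, the programmatic `ollama ps`.
with urllib.request.urlopen("http://127.0.0.1:11434/api/ps") as resp:
    models = json.load(resp).get("models", [])

for m in models:
    # size_vram == size  -> 100% GPU (the ideal placement)
    # size_vram <  size  -> partial CPU offload; vision queries will crawl
    gpu_frac = m["size_vram"] / m["size"] if m.get("size") else 0.0
    print(f"{m['name']}: {gpu_frac:.0%} GPU, expires {m.get('expires_at')}")
```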
diff --git a/warmup_vlm.sh b/warmup_vlm.sh
index 5a67597..dbd5b48 100755
--- a/warmup_vlm.sh
+++ b/warmup_vlm.sh
@@ -24,6 +24,16 @@ set -e
 MODEL="${1:-qwen2.5vl:3b}"
 HOST="http://127.0.0.1:11434"
 
+# Pull num_batch / num_ctx from config_Brain.json. CRITICAL: these must match
+# what Marcus will send at runtime, otherwise Ollama evicts the cached runner
+# on the FIRST real vision command and re-instantiates it — the overlap of
+# old-runner-unloading + new-runner-loading triggers an iGPU OOM and the
+# runner dies with status 500. Symptom: warmup succeeds, first `what do you
+# see` crashes 10-30 s later with "llama runner process has terminated".
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+NUM_BATCH=$(python3 -c "import json; print(json.load(open('$SCRIPT_DIR/Config/config_Brain.json'))['num_batch'])" 2>/dev/null || echo 32)
+NUM_CTX=$(python3 -c "import json; print(json.load(open('$SCRIPT_DIR/Config/config_Brain.json'))['num_ctx'])" 2>/dev/null || echo 1024)
+
 # Sanity: is ollama reachable?
 if ! curl -sf "$HOST/api/version" > /dev/null; then
     echo "✗ Ollama is not running on $HOST"
@@ -64,6 +74,7 @@ else
 BANNER
     read -r
     echo "→ Cold-loading $MODEL into iGPU (60-90 s, do not interrupt)..."
+    echo "   num_batch=$NUM_BATCH  num_ctx=$NUM_CTX  (matching Marcus's runtime config)"
     echo "   Watching tegrastats in a second window is useful — GPU should spike."
     echo
 fi
@@ -78,7 +89,7 @@ START=$(date +%s)
 # unloading the model after the default 5 min.
 RESPONSE=$(curl -s "$HOST/api/generate" \
   -H 'Content-Type: application/json' \
-  -d "{\"model\":\"$MODEL\",\"prompt\":\"ok\",\"stream\":false,\"keep_alive\":\"24h\",\"options\":{\"num_predict\":1,\"num_batch\":128,\"num_ctx\":1024}}")
+  -d "{\"model\":\"$MODEL\",\"prompt\":\"ok\",\"stream\":false,\"keep_alive\":\"24h\",\"options\":{\"num_predict\":1,\"num_batch\":$NUM_BATCH,\"num_ctx\":$NUM_CTX}}")
 END=$(date +%s)
 ELAPSED=$((END - START))
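The same match-the-warmup rule applies on the Python side: every generate call Marcus issues should pull `num_batch`/`num_ctx` from the one config the warmup script reads. A sketch (assumes the `requests` package; `warm_options` is a hypothetical helper, not an existing function in this repo):

```python
import json
import requests

cfg = json.load(open("Config/config_Brain.json"))

def warm_options():
    # Route all Ollama options through one place so warmup and runtime
    # can never drift apart (drift forces a runner rebuild mid-mission).
    return {"num_batch": cfg["num_batch"], "num_ctx": cfg["num_ctx"]}

resp = requests.post(
    f"{cfg['ollama_host']}/api/generate",
    json={
        "model": cfg["ollama_model"],
        "prompt": "ok",
        "stream": False,
        "keep_alive": "24h",
        "options": {"num_predict": 1, **warm_options()},
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json().get("response", ""))
```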