Compare commits

...

10 Commits

Author SHA1 Message Date
bcb2fbbcdf Update 2026-04-28 15:55:43 2026-04-28 15:55:45 +04:00
c6c847e3ab Update 2026-04-27 13:48:17 2026-04-27 13:48:18 +04:00
2129cadb11 Update 2026-04-27 11:16:57 2026-04-27 11:16:57 +04:00
211d4f52ab Update 2026-04-27 09:39:12 2026-04-27 09:39:13 +04:00
9485601e18 Update 2026-04-24 15:23:19 2026-04-24 15:23:19 +04:00
5d839d4f4e Voice: finalise on faster-whisper + energy wake, remove Vosk
Full-day voice-stack refactor. Experiments run and reverted:
- Gemini Live HTTP microservice (Python 3.8 env incompat, latency)
- Vosk grammar STT (English lexicon can't decode 'Sanad'; big model
  cold-load too slow on Jetson CPU)

Kept architecture:
- Voice/wake_detector.py — pure-numpy energy state machine with
  adaptive baseline, burst-audio capture for post-hoc verify.
- Voice/marcus_voice.py — orchestrator with 3 modes
  (wake_and_command / always_on / always_on_gated), hysteretic VAD,
  pre-silence trim (300 ms pre-roll), DSP pipeline (DC remove,
  80 Hz HPF, 0.97 pre-emphasis, peak-normalize), faster-whisper
  base.en int8 with beam=8 + temperature fallback [0,0.2,0.4],
  fuzzy-match canonicalisation, GARBAGE_PATTERNS + length filter,
  /s-/ phonetic wake-verify, full-turn debug WAV recording.
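
  For reference, the DSP chain above reduces to a few numpy lines. A
  minimal sketch, assuming 16 kHz int16 input and a one-pole high-pass
  (the real filter in marcus_voice.py may be implemented differently):

    import numpy as np

    def preprocess(pcm: np.ndarray, sr: int = 16000) -> np.ndarray:
        """Illustrative restatement of the DSP pipeline above."""
        x = pcm.astype(np.float32)
        x -= x.mean()                                # DC remove
        alpha = sr / (sr + 2 * np.pi * 80)           # one-pole 80 Hz HPF
        y = np.empty_like(x)
        prev_x = prev_y = 0.0
        for i, xi in enumerate(x):
            prev_y = alpha * (prev_y + xi - prev_x)
            prev_x = xi
            y[i] = prev_y
        y = np.append(y[0], y[1:] - 0.97 * y[:-1])   # 0.97 pre-emphasis
        peak = float(np.abs(y).max()) or 1.0
        return y / peak                              # peak-normalize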

Config-driven vocab (zero hardcoded strings in Python):
- stt.wake_words (33 variants of 'Sanad')
- stt.command_vocab (68 canonical phrases)
- stt.garbage_patterns (17 Whisper noise outputs)
- stt.min_transcription_length, stt.command_vocab_cutoff
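
Illustrative shape of the stt block these keys imply (entries invented
for the example; the real lists hold 33/68/17 items as noted above):

    stt = {
        "wake_words": ["sanad", "sanaad", "son said"],           # 33 variants in the real file
        "command_vocab": ["move forward", "turn left", "stop"],  # 68 canonical phrases
        "garbage_patterns": ["thanks for watching", "[BLANK_AUDIO]"],  # 17 noise outputs
        "min_transcription_length": 3,
        "command_vocab_cutoff": 80,   # fuzzy-match score threshold (assumed semantics)
    }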

Command parser widened (Brain/command_parser.py):
- _RE_SIMPLE_DIR — bare direction + verb+direction combos
  ('left', 'go back', 'move forward', 'step right', ...)
- _RE_STOP_SIMPLE — bare stop/halt/wait/pause/freeze/hold
- All motion constants sourced from config_Navigation.json
  (move_map + step_duration_sec) via API/zmq_api.py; no more
  hardcoded 0.3 / 2.0 magic numbers.

API/audio_api.py — _play_pcm now uses AudioClient.PlayStream with
automatic resampling to 16 kHz (matches Sanad's proven pattern).
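
The resampling step can be as simple as linear interpolation; a sketch
(the actual helper in audio_api.py is not shown in this diff and may
differ):

    import numpy as np

    def resample_to_16k(audio: np.ndarray, src_rate: int) -> np.ndarray:
        """Linear-interpolation resample to 16 kHz (illustrative fallback)."""
        if src_rate == 16000:
            return audio
        n_out = int(len(audio) * 16000 / src_rate)
        t_out = np.linspace(0, len(audio) - 1, n_out)
        return np.interp(t_out, np.arange(len(audio)),
                         audio.astype(np.float32)).astype(np.int16)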

Removed:
- Voice/vosk_stt.py (and all Vosk references in marcus_voice.py)
- Models/vosk-model-small-en-us-0.15/ (40 MB model + zip)
- All Vosk keys from Config/config_Voice.json

Documentation synced across README, Doc/architecture.md,
Doc/pipeline.md, Doc/functions.md, Doc/controlling.md,
Doc/MARCUS_API.md, Doc/environment.md changelog.

Known limitation: faster-whisper base.en on Jetson CPU + G1
far-field mic yields ~50% command-transcription accuracy due
to model capacity and mic reverberation. Wake + ack + recording
+ trim + Whisper + fuzzy + brain + motion all verified working
end-to-end. Future improvement path (unused): close-talking USB
mic via pactl_parec, or Gemini Live via HTTP microservice.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-24 14:32:28 +04:00
f45e12fae5 Update 2026-04-23 09:54:45 2026-04-23 09:54:45 +04:00
ce09b6920a Update 2026-04-22 17:54:49 2026-04-22 17:54:50 +04:00
00e52496a9 Update 2026-04-22 17:01:46 2026-04-22 17:01:48 +04:00
78a5b0b408 Update 2026-04-22 15:37:55 2026-04-22 15:37:56 +04:00
176 changed files with 8511 additions and 3042 deletions

View File

@ -211,56 +211,58 @@ class AudioAPI:
     # ─── G1 SPEAKER PLAYBACK (raw PCM, kept for future backends) ─────────
-    def _play_pcm(self, audio_16k: np.ndarray) -> float:
-        """Play 16kHz mono int16 on G1 speaker. Returns duration."""
+    def _play_pcm(self, audio: np.ndarray, rate: int = None) -> float:
+        """
+        Play mono int16 PCM on the G1 speaker.
+        `rate` is the sample rate of the incoming `audio`; we always
+        resample to self._target_rate (16 kHz) before sending because the
+        G1 speaker hardware only honors that rate — if you hand it 24 kHz
+        PCM, it plays ~1.5x too fast. This matches the Sanad pattern.
+        Uses AudioClient.PlayStream (the high-level API) with a fresh
+        stream_id + STOP_PLAY bracket on either side so a prior stream
+        can't blend into this one.
+        """
         if not self._sdk_available:
             log.warning("SDK not available, cannot play audio")
             return 0.0
-        from unitree_sdk2py.g1.audio.g1_audio_api import (
-            ROBOT_API_ID_AUDIO_START_PLAY,
-            ROBOT_API_ID_AUDIO_STOP_PLAY,
-        )
+        src_rate = int(rate) if rate else self._target_rate
+        audio = self._resample(audio, src_rate)  # → self._target_rate
+        if audio.size == 0:
+            return 0.0
+        from unitree_sdk2py.g1.audio.g1_audio_api import ROBOT_API_ID_AUDIO_STOP_PLAY
         app_name = self._spk["app_name"]
-        # Stop previous stream
+        # Stop any prior stream before opening a new one.
         self._client._Call(
             ROBOT_API_ID_AUDIO_STOP_PLAY,
             json.dumps({"app_name": app_name}),
         )
-        time.sleep(0.3)
-        # Build params — unique stream_id every call
-        pcm = audio_16k.tobytes()
+        time.sleep(0.15)
         sid = f"s_{int(time.time() * 1000)}"
-        param = json.dumps({
-            "app_name": app_name,
-            "stream_id": sid,
-            "sample_rate": self._target_rate,
-            "channels": 1,
-            "bits_per_sample": 16,
-        })
-        # Single call — full buffer
-        self._client._CallRequestWithParamAndBin(
-            ROBOT_API_ID_AUDIO_START_PLAY, param, list(pcm)
-        )
-        duration = len(audio_16k) / self._target_rate
-        time.sleep(duration + 0.5)
+        self._client.PlayStream(app_name, sid, audio.tobytes())
+        duration = len(audio) / self._target_rate
+        # Margin covers DDS buffer drain before STOP cuts playback short.
+        time.sleep(duration + 0.3)
         self._client._Call(
             ROBOT_API_ID_AUDIO_STOP_PLAY,
             json.dumps({"app_name": app_name}),
         )
-        log.info("Played: %.1fs", duration)
+        log.info("Played: %.1fs (src=%d Hz → hw=%d Hz)",
+                 duration, src_rate, self._target_rate)
         return duration

-    def play_pcm(self, audio_16k: np.ndarray) -> float:
+    def play_pcm(self, audio: np.ndarray, rate: int = None) -> float:
         """Public wrapper for playing PCM audio."""
-        return self._play_pcm(audio_16k)
+        return self._play_pcm(audio, rate=rate)

     # ─── MIC RECORDING ───────────────────────────────────
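# (Illustration, not part of the diff.) Hypothetical call site showing the
# new `rate` parameter; synth_greeting_24k is an invented stand-in for any
# 24 kHz int16 mono source:
#
#     api = AudioAPI()
#     tts_pcm = synth_greeting_24k()             # hypothetical 24 kHz source
#     dur = api.play_pcm(tts_pcm, rate=24000)    # resampled to 16 kHz internally
#     print(f"spoke for {dur:.1f}s")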

View File

@ -84,6 +84,15 @@ def add_to_history(user_msg: str, assistant_msg: str):
 def call_llava(prompt: str, img_b64, num_predict: int = 200, use_history: bool = False) -> str:
+    """
+    Single synchronous VLM call, same mechanism as Marcus_v1's _call_llava.
+    With YOLO on CPU (config_Vision.json::yolo_device="cpu"), there is no
+    iGPU contention to guard against, so the v1-style plain call is the
+    right shape. num_batch and num_ctx are still passed per-request because
+    Ollama's compute-graph pre-allocation pays attention to them (default
+    batch=512/ctx=4096 would OOM on the Jetson).
+    """
     if not VLM_ENABLED:
         return ""  # safe-mode — caller must handle empty string
     messages = []
@ -93,38 +102,14 @@ def call_llava(prompt: str, img_b64, num_predict: int = 200, use_history: bool =
     if img_b64:
         msg["images"] = [img_b64]
     messages.append(msg)
-    # When an image is attached, pause YOLO to free iGPU memory for the
-    # vision-encoder activations (~1.5 GiB). Without this, concurrent YOLO
-    # inference + Qwen vision-encode exceeds the 15 GiB Jetson iGPU budget
-    # and the llama runner is reaped by the OOM killer (status code: 500).
-    # Text-only calls skip the pause — they fit easily and YOLO stays hot.
-    _paused = False
-    if img_b64:
-        try:
-            from API.yolo_api import yolo_pause, yolo_resume, YOLO_AVAILABLE
-            if YOLO_AVAILABLE:
-                yolo_pause()
-                _paused = True
-        except Exception:
-            pass
-    try:
-        r = _client.chat(model=OLLAMA_MODEL, messages=messages,
-                         options={
-                             "temperature": 0.0,
-                             "num_predict": num_predict,
-                             "num_batch": NUM_BATCH,
-                             "num_ctx": NUM_CTX,
-                         })
-        return r["message"]["content"].strip()
-    finally:
-        if _paused:
-            try:
-                from API.yolo_api import yolo_resume
-                yolo_resume()
-            except Exception:
-                pass
+    r = _client.chat(model=OLLAMA_MODEL, messages=messages,
+                     options={
+                         "temperature": 0.0,
+                         "num_predict": num_predict,
+                         "num_batch": NUM_BATCH,
+                         "num_ctx": NUM_CTX,
+                     })
+    return r["message"]["content"].strip()

 def parse_json(raw: str):
@ -197,13 +182,21 @@ def ask_goal(goal: str, img_b64) -> dict:
 def ask_talk(command: str, img_b64, facts: str = "") -> dict:
-    """Handle talk-only commands using the YAML talk_prompt."""
+    """
+    Handle talk-only commands using the YAML talk_prompt.
+    NOTE: use_history is off. Accumulated turns push the prompt past
+    qwen2.5vl's 2048-token KV cache; Ollama then truncates and tries to
+    RoPE-shift the cache, which triggers an Ollama/ggml bug
+    (GGML_ASSERT(a->ne[2] * 4 == b->ne[0])); the runner SIGABRTs with
+    status 500. Keeping each call stateless avoids the ceiling entirely.
+    """
     if not VLM_ENABLED:
         return dict(_VLM_OFF_EMPTY)
     try:
         prompt = TALK_PROMPT.format(command=command, facts=facts)
         raw = call_llava(prompt, img_b64, num_predict=_cfg["num_predict_talk"],
-                         use_history=True)
+                         use_history=False)
         print(f" Raw: {raw}")
         d = parse_json(raw)
         if d is None:
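# (Illustration, not part of the diff.) A standalone stateless call with
# per-request options via the public ollama-python client; host and model
# name are assumed:
#
#     import ollama
#     client = ollama.Client(host="http://localhost:11434")
#     r = client.chat(
#         model="qwen2.5vl",
#         messages=[{"role": "user", "content": "Describe the scene."}],
#         options={"temperature": 0.0, "num_predict": 120,
#                  "num_batch": 16, "num_ctx": 1024},  # per-request, as in the diff
#     )
#     print(r["message"]["content"].strip())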

View File

@ -80,6 +80,11 @@ def send_cmd(cmd: str):
     _ensure_sock().send_string(json.dumps({"cmd": cmd}))

-# Load MOVE_MAP from navigation config (pure data, safe at import time)
+# Load navigation constants from config (pure data, safe at import time).
+# MOVE_MAP[direction] = (vx, vy, vyaw). STEP_DURATION_SEC is how long one
+# "step" of a bare directional command lasts (2 s at default velocities
+# ≈ 60 cm forward or 34° turn). Both live in config_Navigation.json so
+# motion can be retuned without editing Python.
 _nav = load_config("Navigation")
 MOVE_MAP = {k: tuple(v) for k, v in _nav["move_map"].items()}
+STEP_DURATION_SEC = float(_nav.get("step_duration_sec", 2.0))
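# (Illustration, not part of the diff.) Worked example of the two knobs;
# values invented — the real ones live in config_Navigation.json:
#
#     import json
#     nav = json.loads('{"move_map": {"forward": [0.3, 0.0, 0.0], '
#                      '"left": [0.0, 0.0, 0.3]}, "step_duration_sec": 2.0}')
#     vx = nav["move_map"]["forward"][0]
#     vyaw = nav["move_map"]["left"][2]
#     print(vx * nav["step_duration_sec"], "m per bare 'forward'")     # 0.6 m
#     print(vyaw * nav["step_duration_sec"] * 57.3, "deg per 'left'")  # ≈ 34°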

View File

@ -61,6 +61,25 @@ import base64
 from datetime import datetime
 from pathlib import Path

+# Persist autonomous-mode events to logs/autonomous.log so an unattended
+# explore/patrol run leaves an audit trail. Terminal output is preserved
+# (the existing " [Auto] ..." indented lines still print) but every
+# event is also written to the dedicated log for post-mortem review.
+try:
+    from Core.logger import log as _core_log
+except Exception:
+    _core_log = None
+
+def _alog(msg: str, level: str = "info") -> None:
+    """Print and persist to logs/autonomous.log."""
+    print(f" [Auto] {msg}")
+    if _core_log is not None:
+        try:
+            _core_log(f"[Auto] {msg}", level, "autonomous")
+        except Exception:
+            pass
 # ══════════════════════════════════════════════════════════════════════════════
 # CONFIGURATION
@ -134,7 +153,7 @@ class AutonomousMode:
"""Start autonomous exploration.""" """Start autonomous exploration."""
with self._lock: with self._lock:
if self._enabled: if self._enabled:
print(" [Auto] Already running — use 'auto off' to stop first") _alog("Already running — use 'auto off' to stop first")
return return
self._enabled = True self._enabled = True
@ -153,19 +172,19 @@ class AutonomousMode:
         )
         self._thread.start()
-        print(f"\n [Auto] Exploration started")
-        print(f" [Auto] Map folder: {self._map_dir}")
-        print(f" [Auto] Type 'auto off' to stop\n")
+        _alog(f"Exploration started")
+        _alog(f"Map folder: {self._map_dir}")
+        _alog("Type 'auto off' to stop")

     def disable(self):
         """Stop autonomous exploration and save results."""
         with self._lock:
             if not self._enabled:
-                print(" [Auto] Not running")
+                _alog("Not running")
                 return
             self._enabled = False
-        print("\n [Auto] Stopping exploration...")
+        _alog("Stopping exploration...")

         # Wait for thread to finish
         if self._thread and self._thread.is_alive():
@ -183,29 +202,29 @@ class AutonomousMode:
         obs = len(self._observations)
         if not running:
-            print(" [Auto] Status: IDLE")
+            _alog("Status: IDLE")
             if self._map_dir:
-                print(f" [Auto] Last map: {self._map_dir}")
+                _alog(f"Last map: {self._map_dir}")
             return
         elapsed = time.time() - (self._start_time or time.time())
         mins = int(elapsed // 60)
         secs = int(elapsed % 60)
-        print(f" [Auto] Status: EXPLORING")
-        print(f" [Auto] Duration: {mins}m {secs}s")
-        print(f" [Auto] Steps: {step} | Observations: {obs}")
+        _alog(f"Status: EXPLORING")
+        _alog(f"Duration: {mins}m {secs}s")
+        _alog(f"Steps: {step} | Observations: {obs}")
         if self._area_counts:
             areas = ", ".join(f"{k}:{v}" for k, v in sorted(self._area_counts.items()))
-            print(f" [Auto] Areas seen: {areas}")
+            _alog(f"Areas seen: {areas}")
         if self._all_objects:
-            print(f" [Auto] Objects found: {', '.join(sorted(self._all_objects))}")
+            _alog(f"Objects found: {', '.join(sorted(self._all_objects))}")
         pos = self._odom_pos()
         if pos:
-            print(f" [Auto] Position: x={pos['x']:.2f} y={pos['y']:.2f} heading={pos['heading']:.1f}°")
+            _alog(f"Position: x={pos['x']:.2f} y={pos['y']:.2f} heading={pos['heading']:.1f}°")

     def is_enabled(self) -> bool:
         with self._lock:
@ -215,7 +234,7 @@ class AutonomousMode:
"""Save current state to disk without stopping.""" """Save current state to disk without stopping."""
self._save_observations() self._save_observations()
self._save_path() self._save_path()
print(f" [Auto] Snapshot saved to {self._map_dir}") _alog(f"Snapshot saved to {self._map_dir}")
# ── EXPLORATION LOOP ──────────────────────────────────────────────────────── # ── EXPLORATION LOOP ────────────────────────────────────────────────────────
@ -240,7 +259,7 @@ class AutonomousMode:
             if self._yolo_sees("person"):
                 closest = self._yolo_closest("person")
                 if closest and closest.distance_estimate == "very close":
-                    print(f" [Auto] Person very close — pausing 2s")
+                    _alog(f"Person very close — pausing 2s")
                     self._gradual_stop()
                     time.sleep(2.0)
                     continue
@ -268,7 +287,7 @@ class AutonomousMode:
             # ── Movement decision ─────────────────────────────────────────────
             if consecutive_blocks >= 3:
                 # Stuck — turn more aggressively
-                print(f" [Auto] Stuck — turning {self._last_turn} 180°")
+                _alog(f"Stuck — turning {self._last_turn} 180°")
                 self._turn(self._last_turn, TURN_DURATION * 2)
                 consecutive_blocks = 0
                 continue
@ -281,14 +300,14 @@ class AutonomousMode:
                 # Alternate left/right turns to explore both directions
                 turn_dir = "left" if self._last_turn == "right" else "right"
                 self._last_turn = turn_dir
-                print(f" [Auto] Obstacle — turning {turn_dir}")
+                _alog(f"Obstacle — turning {turn_dir}")
                 self._turn(turn_dir, TURN_DURATION)
             else:
                 consecutive_blocks = 0

             # ── Max observations check ────────────────────────────────────────
             if len(self._observations) >= MAX_OBSERVATIONS:
-                print(f" [Auto] Max observations ({MAX_OBSERVATIONS}) reached — stopping")
+                _alog(f"Max observations ({MAX_OBSERVATIONS}) reached — stopping")
                 self._enabled = False
                 break
@ -375,9 +394,9 @@ class AutonomousMode:
             }
             self._observations.append(obs)
-            print(f" [Auto] Step {self._step} | {area_type} | {observation[:60]}")
+            _alog(f"Step {self._step} | {area_type} | {observation[:60]}")
             if objects:
-                print(f" [Auto] Objects: {', '.join(objects)}")
+                _alog(f"Objects: {', '.join(objects)}")

             # Save frame if interesting
             if interesting and SAVE_FRAMES:
@ -396,7 +415,7 @@ class AutonomousMode:
                 self._save_path()
         except Exception as e:
-            print(f" [Auto] LLaVA assess error: {e}")
+            _alog(f"LLaVA assess error: {e}")

     # ── FILE I/O ────────────────────────────────────────────────────────────────
@ -429,7 +448,7 @@ class AutonomousMode:
                 json.dump(self._observations, f, indent=2, ensure_ascii=False)
             tmp.replace(path)
         except Exception as e:
-            print(f" [Auto] Save observations error: {e}")
+            _alog(f"Save observations error: {e}")

     def _save_path(self):
         if not self._map_dir or not self._path:
@ -441,7 +460,7 @@ class AutonomousMode:
                 json.dump(self._path, f, indent=2)
             tmp.replace(path)
         except Exception as e:
-            print(f" [Auto] Save path error: {e}")
+            _alog(f"Save path error: {e}")

     def _save_frame(self, img_b64: str, step: int):
         """Save a camera frame as JPEG."""
@ -452,7 +471,7 @@ class AutonomousMode:
             with open(frame_path, "wb") as f:
                 f.write(__import__("base64").b64decode(img_b64))
         except Exception as e:
-            print(f" [Auto] Save frame error: {e}")
+            _alog(f"Save frame error: {e}")

     def _generate_summary(self) -> str:
         """Generate a text summary of the exploration session."""
@ -500,7 +519,7 @@ class AutonomousMode:
             with open(self._map_dir / "summary.txt", "w", encoding="utf-8") as f:
                 f.write(summary)
         except Exception as e:
-            print(f" [Auto] Save summary error: {e}")
+            _alog(f"Save summary error: {e}")

     def _print_summary(self):
         """Print exploration summary to terminal."""
@ -508,15 +527,15 @@ class AutonomousMode:
         mins = int(elapsed // 60)
         secs = int(elapsed % 60)
-        print(f"\n [Auto] Exploration complete")
-        print(f" [Auto] Duration: {mins}m {secs}s | Steps: {self._step}")
-        print(f" [Auto] Observations: {len(self._observations)}")
+        _alog(f"Exploration complete")
+        _alog(f"Duration: {mins}m {secs}s | Steps: {self._step}")
+        _alog(f"Observations: {len(self._observations)}")
         if self._area_counts:
-            print(f" [Auto] Areas: {dict(sorted(self._area_counts.items()))}")
+            _alog(f"Areas: {dict(sorted(self._area_counts.items()))}")
         if self._all_objects:
-            print(f" [Auto] Objects: {', '.join(sorted(self._all_objects))}")
+            _alog(f"Objects: {', '.join(sorted(self._all_objects))}")
         if self._map_dir:
-            print(f" [Auto] Saved to: {self._map_dir}\n")
+            _alog(f"Saved to: {self._map_dir}\n")

View File

@ -3,13 +3,16 @@ command_parser.py — Local command regex patterns + dispatcher
 Handles place memory, odometry, session recall, help, examples
 """
 import re
+import threading
 import time
-from API.zmq_api import send_vel, gradual_stop
+from API.zmq_api import send_vel, gradual_stop, MOVE_MAP, STEP_DURATION_SEC
 from API.memory_api import mem, place_save, place_goto, places_list_str
 from API.odometry_api import odom, ODOM_AVAILABLE
 from API.camera_api import get_frame
-from API.llava_api import ask
+from API.llava_api import ask, call_llava, parse_json
 from Brain.executor import execute
+from Core.motion_state import motion_abort, wait_while_paused
+from Core.motion_log import log_motion, print_motion

 # ── Compiled patterns ────────────────────────────────────────────────────────
@ -33,9 +36,33 @@ _RE_TURN_DEG = re.compile(
 # that for a trivial two-second motion. One step = 2 s of motion at the default
 # velocity, matching the undo-loop duration already used below.
 _RE_WALK_STEP = re.compile(
-    r"^(?:walk|go|move|step)(?:\s+(forward|back(?:ward)?))?\s+(\d+)\s*steps?$", re.I)
+    r"^(?:walk|go|move|step)(?:\s+(forward|back(?:ward)?))?\s+(\d+(?:\.\d+)?)\s*steps?$", re.I)
 _RE_TURN_STEP = re.compile(
-    r"^turn\s+(left|right)(?:\s+(\d+)\s*steps?)?$", re.I)
+    r"^turn\s+(left|right)(?:\s+(\d+(?:\.\d+)?)\s*steps?)?$", re.I)
# Simple one-shot motion — one word or verb+direction, no counts/units.
# All default to a ~2 s motion at the normal velocity. Kept local so the
# user doesn't eat a 5 s Qwen round-trip for a trivial "go back".
#
# Matches:
# "left" / "right" / "forward" / "back" / "backward"
# "go back" / "step back" / "move back" / "walk back" / "run back"
# "go forward" / "step forward" / "move forward" / "walk forward"
# "go left" / "move right" / "step left" / etc.
# "head forward" / "head back"
# Does NOT match multi-word phrases like "walk to the chair" — those
# still fall through to Qwen where they belong.
_RE_SIMPLE_DIR = re.compile(
r"^(?:(?:walk|go|move|step|run|head)\s+)?"
r"(forward|back(?:ward)?|left|right)$",
re.I,
)
# Bare stop / pause words — no need to ask Qwen what "stop" means.
_RE_STOP_SIMPLE = re.compile(
r"^(?:stop|halt|wait|pause|stay|freeze|hold|stand\s+still|don'?t\s+move)$",
re.I,
)
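# (Illustration, not part of the diff.) A quick sanity check of what the
# two fast paths accept; "walk to the chair" intentionally falls through:
#
#     assert _RE_SIMPLE_DIR.match("left")
#     assert _RE_SIMPLE_DIR.match("go back")
#     assert _RE_SIMPLE_DIR.match("step right")
#     assert not _RE_SIMPLE_DIR.match("walk to the chair")  # still goes to Qwen
#     assert _RE_STOP_SIMPLE.match("stand still")
#     assert _RE_STOP_SIMPLE.match("don't move")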
 _RE_PATROL_RT = re.compile(
     r"^patrol[/:]\s*(.+)$", re.I)
 _RE_LAST_CMD = re.compile(
@ -55,6 +82,106 @@ _RE_SESSION_SUMMARY = re.compile(
 _RE_AUTO = re.compile(
     r"^auto(?:nomous)?\s+(on|off|status|save|summary)$", re.I)
# ── DIRECT-MOTION CANONICALS ─────────────────────────────────────────────
# Canonicals that the dispatcher emits but were previously falling through
# to the LLaVA/Qwen fallback (530s blocking call on Jetson). Each maps
# to a fast, deterministic motion or arm gesture. See Bug #3 in the
# 14-bug audit.
_RE_TURN_AROUND = re.compile(r"^turn[\s_-]+around$", re.I)
_RE_SIT_DOWN = re.compile(r"^sit[\s_-]+down$", re.I)
_RE_STAND_UP = re.compile(r"^stand[\s_-]+up$", re.I)
_RE_WAVE_HELLO = re.compile(r"^wave(?:[\s_-]+(?:hello|hi))?$", re.I)
_RE_RAISE_ARM = re.compile(r"^raise[\s_-]+(?:right[\s_-]+)?arm$", re.I)
_RE_LOWER_ARM = re.compile(r"^lower[\s_-]+(?:right[\s_-]+)?arm$", re.I)
_RE_STAY_HERE = re.compile(r"^stay[\s_-]+(?:here|in[\s_-]+place)$", re.I)
_RE_POINT_GESTURE = re.compile(r"^point(?:[\s_-]+(?:at|to)[\s_-]+.*)?$", re.I)
# Note: 'move left' / 'move right' canonicals are handled by the existing
# _RE_SIMPLE_DIR (verb=move, direction=left/right via MOVE_MAP). MOVE_MAP
# in config_Navigation.json defines 'left'/'right' as ROTATIONS, not
# strafes — historical decision; if real strafes are wanted later add
# vy axis to MOVE_MAP and a separate strafe handler.
# ── HIGHER-LEVEL MOTIONS ─────────────────────────────────────────────────────
# 'look around' = scan-left + return + scan-right + return; uses execute_action
# so motion_abort halts it cleanly.
_RE_LOOK_AROUND = re.compile(
r"^(?:look[\s_-]+around|scan[\s_-]+(?:the\s+)?(?:room|area)|survey[\s_-]+(?:the\s+)?surroundings?)$",
re.I,
)
# 'follow me' = walk forward when YOLO sees a person and they aren't too close,
# else hold. Loop bounded by FOLLOW_ME_MAX_SEC and motion_abort.
_RE_FOLLOW_ME = re.compile(
r"^(?:follow[\s_-]+me|come[\s_-]+with[\s_-]+me|stick[\s_-]+with[\s_-]+me)$",
re.I,
)
# Spatial planner: 'walk to/towards the <X>'. Distinct from _RE_GOTO
# above because that one is reserved for place-memory names ('go to
# kitchen' looks up the saved 'kitchen' position). This planner uses
# LLaVA scene grounding to actually steer toward an arbitrary visible
# object — used for unsaved targets like 'the door' or 'the red chair'.
# Verbs: walk / step + to/towards/over to. We do NOT include 'go to'
# here to avoid colliding with place memory.
_RE_WALK_TO = re.compile(
r"^(?:(?:walk|step)\s+(?:over\s+)?(?:to|towards?|up\s+to)|approach)\s+"
r"(?:the\s+|a\s+|an\s+|that\s+|this\s+)?(.+?)$",
re.I,
)
# Tunable durations for higher-level motions. Kept inline (not config) so a
# hot-edit doesn't risk an import-time crash; tweak if the demo asks for
# longer scan / follow runs.
LOOK_AROUND_TURN_SEC = 1.0 # ~45° at default vyaw
LOOK_AROUND_HOLD_SEC = 0.4 # pause at each extreme so VLM can see
FOLLOW_ME_MAX_SEC = 30.0 # hard upper bound on a 'follow me' session
FOLLOW_ME_STEP_SEC = 0.5 # one forward burst per yolo-poll cycle
FOLLOW_ME_POLL_SEC = 0.2 # how often to re-check yolo when no person
# Spatial planner tunables. The planner asks LLaVA where the target is
# (left/center/right + near/medium/far) every WALK_TO_REPLAN_SEC, then
# turns to centre + walks forward in WALK_TO_STEP_SEC bursts. Total
# session bounded by WALK_TO_MAX_SEC.
WALK_TO_MAX_SEC = 45.0
WALK_TO_REPLAN_SEC = 2.5 # cadence of LLaVA scene queries
WALK_TO_STEP_SEC = 1.0 # forward burst length between re-plans
WALK_TO_TURN_SHORT_SEC = 0.4 # small correction (~18° at default vyaw)
WALK_TO_TURN_WIDE_SEC = 1.0 # larger correction when target is far off
WALK_TO_LLAVA_TIMEOUT_SEC = 15.0 # per-call deadline on the VLM scene query.
# Without this, ollama-python's default
# 5-min HTTP timeout would let a stalled
# VLM block the entire motion worker
# — and motion_abort can't break out of
# an in-flight HTTP call.
def _call_llava_with_timeout(prompt, img_b64, num_predict=120,
timeout_sec=WALK_TO_LLAVA_TIMEOUT_SEC):
"""Run call_llava in a worker thread with a hard deadline.
Returns the raw string response, or None on timeout/error.
We cannot truly cancel the HTTP request (ollama-python doesn't
expose a cancel handle), so the thread is left to finish on its
own but the planner moves on. The orphaned thread will exit
when the HTTP eventually returns; we rely on it being a daemon
so it doesn't block process exit.
"""
result = {"raw": None, "err": None}
def _runner():
try:
result["raw"] = call_llava(prompt, img_b64, num_predict=num_predict)
except Exception as e:
result["err"] = str(e)
th = threading.Thread(target=_runner, daemon=True)
th.start()
th.join(timeout=timeout_sec)
if th.is_alive():
return None # timeout
if result["err"]:
return None # error
return result["raw"]
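# (Illustration, not part of the diff.) Usage sketch for the helper above;
# prompt and handling invented for the example:
#
#     raw = _call_llava_with_timeout("Is the door visible? Reply in JSON.",
#                                    img_b64, num_predict=80, timeout_sec=15.0)
#     if raw is None:
#         # timed out or errored — log and retry on the next planning cycle
#         print(" [WalkTo] LLaVA timeout/error — retrying")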
# Autonomous mode instance — set by init_autonomous()
_auto = None
@ -84,9 +211,22 @@ def try_local_command(cmd: str) -> bool:
                 odom.return_to_start()
             else:
                 print(" [Places] Odometry not running — cannot return to start")
-        else:
-            place_goto(name)
-        return True
+            return True
+        # Try saved-place memory first. If place_goto returns False
+        # (no such place), fall through to the spatial planner — the
+        # user probably meant a visible object like 'the door' that
+        # was never saved as a place. Do this by rewriting the cmd
+        # to the planner's input shape and re-dispatching through
+        # try_local_command via _RE_WALK_TO below; we DON'T return
+        # True here so the rest of try_local_command runs.
+        if place_goto(name):
+            return True
+        # Re-target this same call through the spatial planner. We
+        # mutate the local 'cmd' string, then fall through to the
+        # _RE_WALK_TO branch later in this function (see further down).
+        print(f" [Places] '{name}' not saved — trying spatial planner")
+        cmd = "walk to " + name
+        # Note: don't return — let control fall through to _RE_WALK_TO.
     m = _RE_FORGET.match(cmd)
     if m:
@ -115,9 +255,11 @@ def try_local_command(cmd: str) -> bool:
         if odom:
             odom.walk_distance(meters)
         else:
+            vx, _, _ = MOVE_MAP["forward"]
             t0 = time.time()
-            while time.time() - t0 < meters / 0.3:
-                send_vel(vx=0.3)
+            while time.time() - t0 < meters / abs(vx):
+                if motion_abort.is_set(): break
+                send_vel(vx=vx)
                 time.sleep(0.05)
             gradual_stop()
         return True
@ -128,9 +270,11 @@ def try_local_command(cmd: str) -> bool:
         if odom:
             odom.walk_distance(meters, direction="backward")
         else:
+            vx, _, _ = MOVE_MAP["backward"]
             t0 = time.time()
-            while time.time() - t0 < meters / 0.2:
-                send_vel(vx=-0.2)
+            while time.time() - t0 < meters / abs(vx):
+                if motion_abort.is_set(): break
+                send_vel(vx=vx)
                 time.sleep(0.05)
             gradual_stop()
         return True
@ -144,10 +288,15 @@ def try_local_command(cmd: str) -> bool:
         if odom:
             odom.turn_degrees(degrees)
         else:
+            # vyaw magnitude comes from MOVE_MAP["left"]; duration is
+            # abs(degrees)/(vyaw_deg_per_sec). vyaw in config is rad/s.
+            _, _, vyaw_mag = MOVE_MAP["left"]
+            vyaw_deg_per_sec = abs(vyaw_mag) * 180.0 / 3.14159265
+            vyaw = vyaw_mag if degrees > 0 else -vyaw_mag
+            duration = abs(degrees) / vyaw_deg_per_sec
             t0 = time.time()
-            vyaw = 0.3 if degrees > 0 else -0.3
-            duration = abs(degrees) / 17.2
             while time.time() - t0 < duration:
+                if motion_abort.is_set(): break
                 send_vel(vyaw=vyaw)
                 time.sleep(0.05)
             gradual_stop()
@ -156,11 +305,19 @@ def try_local_command(cmd: str) -> bool:
     m = _RE_WALK_STEP.match(cmd)
     if m:
         direction = (m.group(1) or "forward").lower()
-        steps = int(m.group(2))
-        vx = -0.2 if direction.startswith("back") else 0.3
-        duration = 2.0 * steps
+        if direction.startswith("back"):
+            direction = "backward"
+        # Step count may be fractional ('walk 3.5 steps' / 'walk half
+        # a step' after fraction normalization). Convert to float for
+        # the duration calc; STEP_DURATION_SEC × 3.5 = 7s of motion.
+        steps = float(m.group(2))
+        vx, _, _ = MOVE_MAP[direction]
+        duration = STEP_DURATION_SEC * steps
         t0 = time.time()
         while time.time() - t0 < duration:
+            if motion_abort.is_set(): break
+            wait_while_paused()
+            if motion_abort.is_set(): break
             send_vel(vx=vx)
             time.sleep(0.05)
         gradual_stop()
@ -169,16 +326,335 @@ def try_local_command(cmd: str) -> bool:
     m = _RE_TURN_STEP.match(cmd)
     if m:
         direction = m.group(1).lower()
-        steps = int(m.group(2)) if m.group(2) else 1
-        vyaw = 0.3 if direction == "left" else -0.3
-        duration = 2.0 * steps
+        # Fractional step count for sub-step turns (e.g. 'turn left 0.5
+        # steps' from 'turn left half a step').
+        steps = float(m.group(2)) if m.group(2) else 1.0
+        _, _, vyaw = MOVE_MAP[direction]
+        duration = STEP_DURATION_SEC * steps
         t0 = time.time()
         while time.time() - t0 < duration:
+            if motion_abort.is_set(): break
+            wait_while_paused()
+            if motion_abort.is_set(): break
             send_vel(vyaw=vyaw)
             time.sleep(0.05)
         gradual_stop()
         return True
# ── BARE / SIMPLE DIRECTIONAL COMMANDS ───────────────────────────────
# "left", "right", "forward", "back", "go back", "move forward",
# "step right", "walk back", "head forward" — any one-word direction
# or verb+direction with no explicit count. Duration and velocities
# come entirely from config_Navigation.json (MOVE_MAP +
# step_duration_sec) — no magic numbers here.
m = _RE_SIMPLE_DIR.match(cmd)
if m:
direction = m.group(1).lower()
if direction.startswith("back"):
direction = "backward"
vx, vy, vyaw = MOVE_MAP[direction]
t0 = time.time()
while time.time() - t0 < STEP_DURATION_SEC:
if motion_abort.is_set(): break
wait_while_paused()
if motion_abort.is_set(): break
send_vel(vx=vx, vy=vy, vyaw=vyaw)
time.sleep(0.05)
gradual_stop()
return True
# ── BARE STOP / PAUSE ─────────────────────────────────────────────────
m = _RE_STOP_SIMPLE.match(cmd)
if m:
gradual_stop()
return True
# ── DIRECT-MOTION CANONICALS (no LLaVA fallback) ────────────────────
# These canonicals were previously falling through to ask(cmd, img) →
# Ollama → 530s blocking call. We handle them deterministically here.
# Bug #3 in the 14-bug audit; see README of that fix for rationale.
# 'turn around' = 180° turn. Synthesize the brain call as the
# parametric form so we reuse the existing _RE_TURN_DEG path
# (which has motion_abort / motion_pause / odom support).
if _RE_TURN_AROUND.match(cmd):
return try_local_command("turn 180 degrees")
# 'wave hello' / 'wave' → arm gesture via do_arm(). The arm_api
# is currently a stub (arm_available=false) so this prints a clear
# message rather than calling Ollama. Once GR00T is wired the gesture
# will fire automatically — no change here needed.
if _RE_WAVE_HELLO.match(cmd):
from API.arm_api import do_arm
do_arm("wave")
return True
if _RE_RAISE_ARM.match(cmd):
from API.arm_api import do_arm
do_arm("raise_right") # default to right; alias is in config_Arm.json
return True
if _RE_LOWER_ARM.match(cmd):
from API.arm_api import do_arm
do_arm("lower")
return True
if _RE_POINT_GESTURE.match(cmd):
from API.arm_api import do_arm
# 'point at X' has no canonical gesture in the arm vocabulary —
# closest match is 'right_up' which extends the right arm forward.
# Use 'high_wave' (alias=wave) only if 'right_up' isn't registered.
from API.arm_api import ARM_ALIASES, ARM_ACTIONS
for choice in ("right_up", "raise_right", "high_wave", "wave"):
if choice in ARM_ALIASES or choice in ARM_ACTIONS:
do_arm(choice)
break
else:
print(" [Arm] no point/raise gesture configured — skipping")
return True
# 'sit down' / 'stand up' need posture commands the SDK doesn't
# currently expose through marcus's ZMQ velocity bridge. Print a
# clear log line and gradual_stop (safety) instead of taking 20s
# to have Qwen invent something. When G1 SportClient.StandUp() /
# Sit() are wired into a posture_api, replace these with real calls.
if _RE_SIT_DOWN.match(cmd):
gradual_stop()
print(" [Posture] sit_down not yet wired — gradual_stop only")
return True
if _RE_STAND_UP.match(cmd):
gradual_stop()
print(" [Posture] stand_up not yet wired — gradual_stop only")
return True
# 'stay here' / 'stay in place' — explicit hold. Just stops; the
# robot maintains its standing posture by SDK default.
if _RE_STAY_HERE.match(cmd):
gradual_stop()
print(" [Posture] holding position")
return True
# ── LOOK AROUND ───────────────────────────────────────────────────────
# Scan left ~45° → centre → right ~90° → centre. Each leg goes through
# execute_action which polls motion_abort, so 'stop' mid-scan halts
# cleanly. No-op fallback if MOVE_MAP doesn't define left/right (shouldn't
# happen on a real config but keep us defensive).
if _RE_LOOK_AROUND.match(cmd):
from Brain.executor import execute_action # late import — avoids
# circular at module load
try:
execute_action("left", LOOK_AROUND_TURN_SEC)
if motion_abort.is_set(): return True
time.sleep(LOOK_AROUND_HOLD_SEC)
if motion_abort.is_set(): return True
execute_action("right", LOOK_AROUND_TURN_SEC * 2)
if motion_abort.is_set(): return True
time.sleep(LOOK_AROUND_HOLD_SEC)
if motion_abort.is_set(): return True
execute_action("left", LOOK_AROUND_TURN_SEC)
except Exception as e:
print(f" [LookAround] failed: {e}")
return True
# ── FOLLOW ME ─────────────────────────────────────────────────────────
# Watch for a person in the camera frame; when one is detected and not
# already too close, advance forward in a short burst. If the person is
# too close, hold. If no person seen, hold and re-poll. Bounded by
# FOLLOW_ME_MAX_SEC and motion_abort. yolo_api uses fallback stubs when
# YOLO isn't loaded — in that case we give up immediately and tell the
# operator (no person tracking = no point walking blindly forward).
if _RE_FOLLOW_ME.match(cmd):
from Brain.executor import move_step
try:
from API.yolo_api import yolo_sees, yolo_person_too_close
except Exception as e:
print(f" [FollowMe] yolo_api unavailable: {e}")
return True
# Probe for actual YOLO availability — both the stub `yolo_sees`
# and the stub `yolo_person_too_close` always return False, so a
# follow_me run with no real YOLO would just hold forever.
try:
from API.yolo_api import _stub_sees as _stub_sees_ref # type: ignore
stubbed = (yolo_sees is _stub_sees_ref)
except Exception:
stubbed = False
if stubbed:
print(" [FollowMe] YOLO not loaded — cannot track persons. "
"Skipping. (init_yolo() at startup to enable.)")
return True
print(f" [FollowMe] tracking up to {FOLLOW_ME_MAX_SEC:.0f}s — "
"say 'stop' to end")
deadline = time.time() + FOLLOW_ME_MAX_SEC
last_log = 0.0
while time.time() < deadline:
if motion_abort.is_set():
print(" [FollowMe] motion_abort set — ending")
gradual_stop()
break
try:
see_person = bool(yolo_sees("person"))
except Exception:
see_person = False
try:
too_close = bool(yolo_person_too_close())
except Exception:
too_close = False
now = time.time()
if now - last_log > 2.0:
state = ("person too close" if (see_person and too_close)
else "tracking" if see_person
else "no person")
print(f" [FollowMe] {state}")
last_log = now
if see_person and not too_close:
move_step("forward", FOLLOW_ME_STEP_SEC)
else:
gradual_stop()
time.sleep(FOLLOW_ME_POLL_SEC)
gradual_stop()
print(" [FollowMe] done")
return True
# ── WALK TO TARGET (spatial planner) ─────────────────────────────────
# LLaVA-grounded approach to an arbitrary visible object. This is a
# best-effort planner — accuracy depends entirely on the VLM. Loop:
# 1. Capture a frame, ask LLaVA: "Is <target> visible? bearing
# (left/center/right)? distance (near/medium/far)?"
# 2. If not visible: scan left/right small turn, retry up to N times.
# 3. If visible: turn-toward (small or wide based on bearing) then
# walk forward for WALK_TO_STEP_SEC.
# 4. Done when distance == 'near' OR motion_abort OR timeout.
# All motion goes through move_step / execute_action which already
# poll motion_abort and motion_pause, so 'stop'/'pause' work normally.
m = _RE_WALK_TO.match(cmd)
if m:
target = m.group(1).strip().rstrip(".!?,").strip()
if not target:
return True
# If the target is a known place name, defer to place_goto —
# the user probably meant the saved place, not the visible
# object with that name. (Belt-and-braces with _RE_GOTO above.)
if mem and target.lower() in {p.lower() for p in (mem.list_places() or [])}:
place_goto(target)
return True
from Brain.executor import move_step, execute_action
print(f" [WalkTo] target = {target!r}; max {WALK_TO_MAX_SEC:.0f}s")
deadline = time.time() + WALK_TO_MAX_SEC
scan_attempts_left = 4 # how many turns before giving up if unseen
last_replan = 0.0
# Kill-switch on the LLaVA retry loop. Without this, an unresponsive
# Ollama (server crashed / model OOM / network wedged) makes the
# planner grind for the full WALK_TO_MAX_SEC, hammering with retries
# every 0.5 s and producing nothing but noise. Bail after a small
# consecutive-timeout count so the user gets a fast 'unable to plan'
# rather than a 45 s silent wait. Field-tested cap.
consecutive_llava_timeouts = 0
LLAVA_TIMEOUT_GIVE_UP = 3
while time.time() < deadline:
if motion_abort.is_set():
print(" [WalkTo] motion_abort set — ending")
gradual_stop()
break
# Re-plan via LLaVA at each cycle. Caching one query per
# WALK_TO_REPLAN_SEC keeps token cost in check.
now = time.time()
if now - last_replan < WALK_TO_REPLAN_SEC and last_replan > 0:
time.sleep(0.1)
continue
last_replan = now
img_b64 = get_frame()
if not img_b64:
print(" [WalkTo] no camera frame — pausing")
time.sleep(0.5)
continue
prompt = (
f"You are guiding a robot toward the {target!r}. Look at "
f"the image. Answer ONLY in JSON, no other text:\n"
f"{{\"visible\": true|false, \"bearing\": \"left|center|right\", "
f"\"distance\": \"near|medium|far\", \"reason\": \"<brief>\"}}\n"
f"- 'visible' = whether the {target!r} is in the frame\n"
f"- 'bearing' = which side of the frame it's on\n"
f"- 'distance' = rough estimate based on apparent size\n"
f"- 'near' means within ~1 metre (almost stopping distance)"
)
# Wrap LLaVA in a thread-timeout: ollama-python's default
# HTTP timeout is ~5 minutes, which would let a stalled VLM
# block the entire motion worker far longer than the watchdog
# would tolerate. WALK_TO_LLAVA_TIMEOUT_SEC = 15s gives the
# VLM enough room to answer at idle while keeping abort
# responsive (next iteration re-checks motion_abort).
if motion_abort.is_set(): break
raw = _call_llava_with_timeout(prompt, img_b64, num_predict=120)
if raw is None:
consecutive_llava_timeouts += 1
print(f" [WalkTo] LLaVA timeout/error "
f"({consecutive_llava_timeouts}/{LLAVA_TIMEOUT_GIVE_UP}) "
f"— retrying")
if consecutive_llava_timeouts >= LLAVA_TIMEOUT_GIVE_UP:
print(f" [WalkTo] LLaVA unresponsive after "
f"{LLAVA_TIMEOUT_GIVE_UP} timeouts — giving up. "
f"Check Ollama (curl http://localhost:11434/api/version).")
gradual_stop()
return True
time.sleep(0.5)
continue
consecutive_llava_timeouts = 0
obs = parse_json(raw) or {}
visible = bool(obs.get("visible"))
bearing = (obs.get("bearing") or "").lower()
distance = (obs.get("distance") or "").lower()
reason = obs.get("reason", "")
print(f" [WalkTo] visible={visible} bearing={bearing} "
f"distance={distance} ({reason[:60]})")
if motion_abort.is_set(): break
if not visible:
if scan_attempts_left <= 0:
print(f" [WalkTo] {target!r} not found after scans — "
"giving up")
break
scan_attempts_left -= 1
# Small turn left to widen the search arc; next iteration
# the LLaVA call will see the new view.
execute_action("left", WALK_TO_TURN_SHORT_SEC)
continue
scan_attempts_left = 4 # reset once we see it again
# Reached: stop and report
if distance == "near":
gradual_stop()
print(f" [WalkTo] arrived at {target!r}")
return True
# Steer toward target
if bearing == "left":
turn_sec = (WALK_TO_TURN_WIDE_SEC if distance == "far"
else WALK_TO_TURN_SHORT_SEC)
execute_action("left", turn_sec)
elif bearing == "right":
turn_sec = (WALK_TO_TURN_WIDE_SEC if distance == "far"
else WALK_TO_TURN_SHORT_SEC)
execute_action("right", turn_sec)
# bearing == 'center' or unknown → just walk forward
if motion_abort.is_set(): break
# Forward burst
move_step("forward", WALK_TO_STEP_SEC)
gradual_stop()
return True
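# (Illustration, not part of the diff.) The reply shape the planner expects
# from the VLM, and the decision it implies; values invented:
#
#     import json
#     raw = '{"visible": true, "bearing": "left", "distance": "far", "reason": "door at left edge"}'
#     obs = json.loads(raw)
#     if obs["visible"] and obs["distance"] != "near":
#         # far + off-centre → wide left correction, then a WALK_TO_STEP_SEC burst
#         print("turn left,", "wide" if obs["distance"] == "far" else "short")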
     # ── NAMED PATROL ROUTE ───────────────────────────────
     m = _RE_PATROL_RT.match(cmd)
     if m:
@ -234,17 +710,11 @@ def try_local_command(cmd: str) -> bool:
         for phrase, (reverse_dir, _) in move_words.items():
             if phrase in cl:
                 print(f" Undoing: '{c}' → reversing with '{reverse_dir}'")
-                dur, t0 = 2.0, time.time()
-                if reverse_dir in ("left", "right"):
-                    vyaw = 0.3 if reverse_dir == "left" else -0.3
-                    while time.time() - t0 < dur:
-                        send_vel(vyaw=vyaw)
-                        time.sleep(0.05)
-                else:
-                    vx = 0.3 if reverse_dir == "forward" else -0.2
-                    while time.time() - t0 < dur:
-                        send_vel(vx=vx)
-                        time.sleep(0.05)
+                vx, vy, vyaw = MOVE_MAP[reverse_dir]
+                t0 = time.time()
+                while time.time() - t0 < STEP_DURATION_SEC:
+                    send_vel(vx=vx, vy=vy, vyaw=vyaw)
+                    time.sleep(0.05)
                 gradual_stop()
                 return True
         print(" No movement command to undo")

View File

@ -6,6 +6,7 @@ import time
 import threading
 from API.zmq_api import send_vel, gradual_stop, MOVE_MAP, STEP_PAUSE
 from API.arm_api import ALL_ARM_NAMES, do_arm
+from Core.motion_state import motion_abort, wait_while_paused

 def _obstacle_check():
@ -29,6 +30,18 @@ def execute_action(move: str, duration: float):
     vx, vy, vyaw = MOVE_MAP[move]
     t0 = time.time()
     while time.time() - t0 < duration:
+        if motion_abort.is_set():
+            gradual_stop()
+            print(" [executor] motion_abort set — stopping")
+            return
+        # Honour a soft pause: hold zero velocity and wait until
+        # the user resumes (or aborts). wait_while_paused returns
+        # immediately if not paused. Re-check abort right after.
+        if motion_pause_check():
+            send_vel(0.0, 0.0, 0.0)
+            wait_while_paused()
+            if motion_abort.is_set():
+                gradual_stop(); return
         if _obstacle_check():
             gradual_stop()
             print(" [Safety] LiDAR obstacle — stopping")
@ -47,6 +60,15 @@ def move_step(move: str, duration: float):
     vx, vy, vyaw = MOVE_MAP[move]
     t0 = time.time()
     while time.time() - t0 < duration:
+        if motion_abort.is_set():
+            send_vel(0.0, 0.0, 0.0)
+            print(" [executor] motion_abort set — pausing step")
+            return
+        if motion_pause_check():
+            send_vel(0.0, 0.0, 0.0)
+            wait_while_paused()
+            if motion_abort.is_set():
+                send_vel(0.0, 0.0, 0.0); return
         if _obstacle_check():
             send_vel(0.0, 0.0, 0.0)
             print(" [Safety] LiDAR obstacle — pausing step")
@ -57,6 +79,13 @@ def move_step(move: str, duration: float):
         time.sleep(0.1)

+def motion_pause_check() -> bool:
+    """Returns True if a soft-pause is active. Tiny indirection so we
+    don't have to import motion_pause directly into every motion site."""
+    from Core.motion_state import motion_pause
+    return motion_pause.is_set()

 def merge_actions(actions: list) -> list:
     """Merge consecutive same-direction steps into one smooth movement."""
     if not actions:

View File

@ -18,7 +18,7 @@ if PROJECT_DIR not in sys.path:
     sys.path.insert(0, PROJECT_DIR)

 from API.zmq_api import init_zmq, send_vel, gradual_stop, send_cmd
-from API.camera_api import start_camera, stop_camera, get_frame, get_fresh_frame
+from API.camera_api import start_camera, stop_camera, get_frame
 from API.yolo_api import (
     init_yolo, yolo_summary, yolo_fps,
     yolo_all_classes, yolo_closest, yolo_sees,
@ -155,19 +155,28 @@ def init_brain():
_log("Brain initialized", "info", "brain") _log("Brain initialized", "info", "brain")
# Report VLM config only — no warmup thread. This matches Marcus_v1's # Synchronous warmup — same shape as Marcus_v1's marcus_llava.py. The
# concept: the first real VLM command performs the cold-load synchronously # Python process blocks here for ~60-90 s on the first run so the first
# inside ollama.chat(), which takes ~60-90 s once on the Jetson and is # real user command doesn't pay the cold-load. One attempt, no retry, no
# fast for every subsequent call. A background warmup thread races with # thread. By the time the dashboard prints, Qwen is resident in iGPU.
# YOLO/camera/audio/Holosoma startup and with user input, and on a from API.llava_api import VLM_ENABLED, OLLAMA_HOST, _client as _llava_client
# 16 GB unified-memory board that race is what triggers the OOM killer.
from API.llava_api import VLM_ENABLED, OLLAMA_HOST
if not VLM_ENABLED: if not VLM_ENABLED:
print(" [VLM] disabled by config — safe mode (no Ollama load)") print(" [VLM] disabled by config — safe mode (no Ollama load)")
else: else:
host_short = OLLAMA_HOST.replace("http://", "") host_short = OLLAMA_HOST.replace("http://", "")
print(f" [VLM] target: {host_short} ({OLLAMA_MODEL}) " print(f" [VLM] target: {host_short} ({OLLAMA_MODEL})")
f"— first vision command will cold-load (~60-90 s)") print(" [VLM] Warming up... (loading into iGPU — may take 60-90 s on cold start)")
try:
_llava_client.chat(
model=OLLAMA_MODEL,
messages=[{"role": "user", "content": "hi"}],
options={"temperature": 0.0, "num_predict": 5,
"num_batch": _cfg.get("num_batch", 16),
"num_ctx": _cfg.get("num_ctx", 1024)},
)
print(" [VLM] warm — first command will be fast")
except Exception as _e:
print(f" [VLM] warmup failed ({_e}) — first command may cold-load")
# Global voice references # Global voice references
@ -177,10 +186,12 @@ _voice_module = None
 def _init_voice():
     """
-    Initialize the voice subsystem: G1 built-in mic + Whisper STT + G1
-    built-in TtsMaker for replies. Every transcribed command flows through
-    process_command(), and the resulting `speak` string is sent to the G1
-    speaker.
+    Initialize the voice subsystem: G1 built-in mic + Gemini Live S2S
+    (subprocess in gemini_sdk env, runs Voice/gemini_runner.py). Every
+    transcribed user command flows through process_command(); the brain
+    decides motion side-effects, but the verbal reply itself comes from
+    Gemini's voice through the G1 speaker (NOT TtsMaker — TtsMaker stays
+    for non-voice subsystem callers but isn't invoked from this callback).
     """
     global _audio_api, _voice_module
     try:
@ -189,13 +200,42 @@ def _init_voice():
         _audio_api = AudioAPI()
# Heuristic filter for unusable Gemini transcripts. Gemini emits
# `<noise>` literally when audio is non-speech and `.` for empty
# bursts. These shouldn't pollute the terminal or trigger motion.
# Bilingual aware: short Arabic motion commands like "اجلس" /
# "قف" / "تعال" are LEGIT — only reject short non-ASCII-non-Arabic
# snippets (Korean / Thai / etc — echo/distortion garbage).
def _is_supported_lang_char(c: str) -> bool:
# ASCII letters/digits/punct OR Arabic block (U+0600..U+06FF)
# OR whitespace.
return c.isascii() or "؀" <= c <= "ۿ" or c.isspace()
def _is_garbage_transcript(t: str) -> bool:
stripped = t.strip().strip(".!?,").strip()
if not stripped:
return True
low = stripped.lower()
if low in ("<noise>", "noise", "yeah", "ok", "okay", "uh", "um", "hmm", "mm"):
return True
# Bare single character (often "." → ".") or all punctuation.
if all(not c.isalnum() for c in stripped):
return True
# Reject only when text is BOTH short AND uses chars from
# neither English nor Arabic — those are echo/distortion
# mistranscriptions (Korean/Thai/etc fragments).
if len(stripped) <= 6:
if not all(_is_supported_lang_char(c) for c in stripped):
return True
return False
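# (Illustration, not part of the diff.) Spot checks of the filter's intent;
# the function is local to _init_voice, so these are annotations, not tests:
#
#     "<noise>", ".", "ok"             → dropped (noise/filler list)
#     "اجلس"  (Arabic "sit")           → kept (Arabic block is whitelisted)
#     "go to the kitchen"              → kept (normal English command)
#     "아 네"  (short Korean fragment)  → dropped (short + unsupported script)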
         def _on_command(text, lang):
             text = (text or "").strip()
             if not text:
                 return
-            # One clean, distinctive line so the operator can see exactly
-            # what Whisper transcribed before the brain reacts. Everything
-            # else from the voice subsystem is file-only.
+            if _is_garbage_transcript(text):
+                # Skip silently — neither show nor dispatch to brain.
+                return
             print(f' [Sanad] heard: "{text}"')
             try:
                 result = process_command(text)
@ -204,9 +244,16 @@ def _init_voice():
                 return
             if isinstance(result, dict):
                 sp = (result.get("speak") or "").strip()
-                if sp and _audio_api:
-                    _audio_api.speak(sp)
-                # Redraw the Command: prompt that our print clobbered
+                act = (result.get("action") or "").strip()
+                # In the Gemini S2S architecture Gemini owns the voice; we
+                # do NOT call audio_api.speak(sp) here (would collide with
+                # Gemini's own audio reply). Just show the operator what
+                # the brain decided so they can correlate motion with
+                # Gemini's spoken acknowledgement.
+                if act and act not in ("NONE", "TALK"):
+                    print(f" [Sanad] doing: {act}{f' — {sp[:80]}' if sp else ''}")
+                elif sp:
+                    print(f" [Brain] {sp[:120]}")
             print("Command: ", end="", flush=True)

         _voice_module = VoiceModule(_audio_api, on_command=_on_command)
@ -299,14 +346,131 @@ def process_command(cmd: str) -> dict:
         log_cmd(cmd, response)
         return {"type": "greeting", "speak": response, "action": "GREETING", "elapsed": 0}

-    # ── "Come to me" shortcut ────────────────────────────────────────────
+    # ── "Come to me" — smart YOLO-tracked approach ──────────────────────
# Old behaviour: walk forward 2s blindly. New: if YOLO is loaded,
# locate the person in the camera frame, turn toward them if off-
# centre, walk forward in short bursts, stop when 'arrived' (person
# fills enough of the frame to be 'right in front'). Bounded by
# COME_TO_ME_MAX_SEC; falls back to a 2s blind walk if YOLO isn't
# available so the canonical still does *something*.
#
# Arrival threshold: yolo_person_too_close(threshold=...) returns
# True if the person's bbox size_ratio (bbox_area / frame_area)
# exceeds the threshold. Defaults are calibrated for safety
# (0.25 = ~1m away, used by patrol). For come_here we want the
# robot to come MUCH closer — arm's length, ~0.5m — which is
# roughly size_ratio > 0.32 (lookup in marcus_yolo.distance_estimate:
# 'very close' starts at 0.30). 0.32 is a safe 'arrived' marker;
# any higher and the robot might bump into the user.
if re.match(r"^(?:come(?:\s+back)?(?:\s+to\s+me)?|come\s+here|get\s+closer|approach|move\s+closer)\s*[!.]*$", cmd, re.IGNORECASE): if re.match(r"^(?:come(?:\s+back)?(?:\s+to\s+me)?|come\s+here|get\s+closer|approach|move\s+closer)\s*[!.]*$", cmd, re.IGNORECASE):
execute_action("forward", 2.0) from Brain.executor import move_step
try:
from API.yolo_api import yolo_sees, yolo_person_too_close, yolo_closest
from API.yolo_api import _stub_sees as _stub_sees_ref # type: ignore
yolo_active = (yolo_sees is not _stub_sees_ref)
except Exception:
yolo_active = False
COME_TO_ME_MAX_SEC = 25.0
COME_TO_ME_STEP_SEC = 0.6
COME_TO_ME_TURN_SEC = 0.3 # small correction (~14° at default vyaw)
COME_TO_ME_POLL_SEC = 0.3
# Arrival threshold: 0.32 size_ratio ≈ ~0.5m / arm's length.
# Patrol's 0.25 default is too lenient for come-to-me — it
# would treat a 1m-away user as 'already arrived'.
COME_TO_ME_ARRIVED_RATIO = 0.32
if not yolo_active:
# Fallback — same behaviour as before for hardware setups
# without YOLO loaded.
print(" [ComeToMe] YOLO not loaded — walking forward 2s")
execute_action("forward", 2.0)
resp = "Coming to you"
print(f"Sanad: {resp}")
add_to_history(cmd, resp)
log_cmd(cmd, resp)
return {"type": "move", "speak": resp, "action": "FORWARD 2.0s", "elapsed": 2.0}
print(f" [ComeToMe] tracking person up to {COME_TO_ME_MAX_SEC:.0f}s — say 'stop' to end")
deadline = time.time() + COME_TO_ME_MAX_SEC
last_log = 0.0
scan_attempts_left = 6 # how many turns before giving up if person never seen
while time.time() < deadline:
try:
from Core.motion_state import motion_abort
if motion_abort.is_set():
print(" [ComeToMe] motion_abort — stopping")
break
except Exception:
pass
try:
seen = bool(yolo_sees("person"))
except Exception:
seen = False
try:
# IMPORTANT: pass an explicit 'arrived' threshold (0.32)
# — without it, the default 0.25 (safety patrol's value)
# would call the user 'too close' when they're a metre
# away, and the robot would stop without walking. We
# want it to come arm's length close.
arrived = bool(yolo_person_too_close(
threshold=COME_TO_ME_ARRIVED_RATIO,
))
except Exception:
arrived = False
now = time.time()
if now - last_log > 1.5:
state = ("arrived (close to person)" if (seen and arrived)
else "tracking person" if seen
else "scanning for person")
print(f" [ComeToMe] {state}")
last_log = now
if seen and arrived:
# Made it — robot is close enough to stop in front
print(" [ComeToMe] arrived in front of person")
break
if seen and not arrived:
# Person is in view but not yet arm's length close —
# centre on them and walk forward in a burst.
bearing = "center"
try:
det = yolo_closest("person")
if det and getattr(det, "position", None):
# position is something like 'left' / 'center' / 'right'
bearing = str(det.position).lower()
except Exception:
pass
if bearing == "left":
execute_action("left", COME_TO_ME_TURN_SEC)
elif bearing == "right":
execute_action("right", COME_TO_ME_TURN_SEC)
# forward burst
move_step("forward", COME_TO_ME_STEP_SEC)
scan_attempts_left = 6 # reset
continue
# No person — try a small scan turn, then check again
if scan_attempts_left <= 0:
print(" [ComeToMe] no person found after scans — giving up")
break
scan_attempts_left -= 1
execute_action("left", COME_TO_ME_TURN_SEC)
time.sleep(COME_TO_ME_POLL_SEC)
try:
from API.zmq_api import gradual_stop as _gs
_gs()
except Exception:
pass
elapsed = time.time() - t0
resp = "Coming to you" resp = "Coming to you"
print(f"Sanad: {resp}") print(f"Sanad: {resp}")
add_to_history(cmd, resp) add_to_history(cmd, resp)
log_cmd(cmd, resp) log_cmd(cmd, resp)
return {"type": "move", "speak": resp, "action": "FORWARD 2.0s", "elapsed": 2.0} return {"type": "move", "speak": resp, "action": "COME_TO_ME", "elapsed": elapsed}
# ── Multi-step compound ──────────────────────────────────────────────
_multi = re.match(
@@ -361,7 +525,7 @@ def _handle_search(cmd):
def _handle_talk(cmd):
print("Thinking...")
try:
- img = get_fresh_frame()
+ img = get_frame()
facts_str = ""
try:
from API.llava_api import _facts
@@ -382,10 +546,7 @@ def _handle_talk(cmd):
def _handle_llava(cmd):
print("Thinking...")
t0 = time.time()
- # get_fresh_frame() blocks up to 1 s waiting for a frame newer than
- # 300 ms old. Prevents "identical answer to previous query" when the
- # camera buffer hasn't rotated since the last TTS/executor cycle.
- img = get_fresh_frame()
+ img = get_frame()
# Poll up to 500 ms in 50 ms slices instead of blocking a full second.
# Returns the moment a frame is available — most drops recover in <100 ms.
@@ -576,7 +737,23 @@ def run_terminal():
print(' Pick the ID that sounded English and set it in')
print(' Config/config_Voice.json :: tts.builtin_speaker_id')
continue
# Structured motion log for terminal-typed commands. The
# voice path already logs through Voice/marcus_voice's
# worker; here we cover the OTHER entry-point so every
# motion (text or voice) shows up in logs/motion.log with
# parsed direction/magnitude/unit + actual elapsed.
try:
from Core.motion_log import log_motion as _mlog
_mlog("start", cmd, source="text")
_t0 = time.time()
except Exception:
_t0 = time.time()
result = process_command(cmd)
try:
_mlog("complete", cmd, source="text",
actual_sec=time.time() - _t0)
except Exception:
pass
sp = result.get("speak", "") if isinstance(result, dict) else "" sp = result.get("speak", "") if isinstance(result, dict) else ""
if sp and _audio_api: if sp and _audio_api:
_audio_api.speak(sp) _audio_api.speak(sp)
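
The come-to-me loop above stands or falls on a single number — the bbox-area/frame-area ratio that yolo_person_too_close compares against its threshold. A minimal sketch of that geometry (the frame size and bbox values are illustrative assumptions; the real check lives in API/yolo_api.py):

def size_ratio(bbox_w: float, bbox_h: float,
               frame_w: float = 640.0, frame_h: float = 480.0) -> float:
    # bbox_area / frame_area — grows as the person fills more of the view.
    return (bbox_w * bbox_h) / (frame_w * frame_h)

# Patrol's 0.25 default: a person ~1 m away already counts as 'too close'.
print(size_ratio(320, 240))   # 0.25
# Come-to-me keeps walking until roughly arm's length (~0.5 m):
print(size_ratio(370, 265))   # ~0.32 — the COME_TO_ME_ARRIVED_RATIO mark

Passing the threshold per call, rather than adding a second API, keeps yolo_person_too_close single-purpose: each caller states what 'arrived' means for it.
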


@@ -2,7 +2,7 @@
"ollama_model": "qwen2.5vl:3b", "ollama_model": "qwen2.5vl:3b",
"ollama_host": "http://127.0.0.1:11434", "ollama_host": "http://127.0.0.1:11434",
"max_history": 6, "max_history": 6,
"num_batch": 32, "num_batch": 16,
"num_ctx": 1024, "num_ctx": 1024,
"subsystems": { "subsystems": {
"vlm": true, "vlm": true,
@@ -12,9 +12,9 @@
"imgsearch": false, "imgsearch": false,
"autonomous": true "autonomous": true
}, },
"num_predict_main": 50, "num_predict_main": 60,
"num_predict_goal": 40, "num_predict_goal": 80,
"num_predict_patrol": 50, "num_predict_patrol": 80,
"num_predict_talk": 50, "num_predict_talk": 60,
"num_predict_verify": 10 "num_predict_verify": 10
} }
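
The num_* keys here mirror Ollama's generation options by name. Assuming the brain forwards them unchanged (the plumbing isn't shown in this diff, and the config filename below is a guess), a request built from this file would look roughly like:

import json, urllib.request

cfg = json.load(open("Config/config_Brain.json"))   # assumed path

payload = {
    "model": cfg["ollama_model"],                   # qwen2.5vl:3b
    "prompt": "turn right",
    "stream": False,
    "options": {
        "num_ctx": cfg["num_ctx"],                  # 1024-token window
        "num_batch": cfg["num_batch"],              # 16 after this change
        "num_predict": cfg["num_predict_main"],     # 60-token reply cap
    },
}
req = urllib.request.Request(
    cfg["ollama_host"] + "/api/generate",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
print(json.loads(urllib.request.urlopen(req).read())["response"])

The per-mode budgets (main / goal / patrol / talk / verify) would then just swap which num_predict_* value lands in options.
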


@@ -3,6 +3,8 @@
"min_steps_before_check": 3, "min_steps_before_check": 3,
"scan_interval_s": 0.4, "scan_interval_s": 0.4,
"rotation_speed": 0.3, "rotation_speed": 0.3,
"_step_duration_comment": "Duration of one 'step' for bare directional commands ('go back', 'turn right', etc.). 2.0s at move_map velocities ≈ 60 cm forward, 40 cm back, 34° turn. Change here and every regex fast-path in command_parser.py uses the new value.",
"step_duration_sec": 2.0,
"move_map": { "move_map": {
"forward": [0.3, 0.0, 0.0], "forward": [0.3, 0.0, 0.0],
"backward": [-0.2, 0.0, 0.0], "backward": [-0.2, 0.0, 0.0],

File diff suppressed because one or more lines are too long

Config/instruction.json Normal file

@@ -0,0 +1,912 @@
{
"_description": "Bilingual voice command instructions — single source of truth for the voice dispatch tables. Loaded by Voice/marcus_voice.py at module level. Adding a new motion command, a new accent variant, a new Arabic phrasing, or fixing a misheard wake-word transcription is a JSON-only edit; no Python change required. Arabic phrase tables cover MSA (Fusha) plus Gulf/Khaleeji, Egyptian (Masri), Levantine (Shami), Iraqi, and Maghrebi dialects.",
"_format": "wake_words = whole-word substrings the dispatch gate looks for in the user's transcript. Any match (English or Arabic) opens motion for the current turn. actions = per-motion phrase tables. Each action has a `canonical` string (what marcus_brain receives), `user_phrases` (what the user might SAY when asking for the motion — used for fuzzy-match + Arabic-to-English translation after wake-word strip), and `bot_phrases` (what Gemini might SPEAK when acknowledging — used by the bot-side dispatcher to fire motion off Gemini's own confirmation). All matching is substring-based; English entries are matched case-insensitively, Arabic entries match as-is. Keep the canonical string consistent with Brain/command_parser.py vocabulary.",
"wake_words": {
"_comment": "All variants of the robot's name 'Sanad' — the gate that authorises motion. Add new mishearings here when you see them in logs/transcript.log under HEARD lines that should have triggered motion but didn't.",
"english": [
"sanad", "sannad", "sennad", "sunnad", "sinnad", "sonnad",
"sanat", "sunnat", "sonnat", "sinnat", "sennat",
"sanid", "sanud", "saned", "sanod", "sanaad",
"senad", "sinad", "sonad", "sunad",
"sanah", "sanath", "sanadh", "sonadh",
"samad", "somad", "sumad",
"thanad", "zanad",
"sa nad", "san ad", "san odd", "san add"
],
"arabic": [
"سند", "سنّاد", "ساند", "سنود", "سنَد", "سنّد", "سَند", "سنّود", "ساندي", "سنادي",
"يا سند", "يا سنّاد", "يا ساند", "يا سَند", "يا سنود", "ياسند", "ياسنود",
"السند", "السنّاد"
]
},
"actions": {
"turn_right": {
"canonical": "turn right",
"user_phrases": {
"english": ["turn right", "rotate right", "spin right", "go right", "face right", "right"],
"arabic": [
"استدر يميناً", "استدر يمينا", "ادر يميناً", "ادر يمينا", "لف يمين", "لف يميناً", "يمين",
"دور يمين", "دور لليمين", "دور على اليمين", "دور ع اليمين",
"خش يمين", "خش ع اليمين", "حوّد يمين", "حود يمين", "حوّد لليمين",
"دير يمين", "دير لليمين", "دير على ليمن", "دير ع اليمين",
"لف لليمين", "لف على اليمين", "لف عاليمين", "لف ع اليمين",
"روح يمين", "اتجه يمين", "اتجه لليمين", "تجه يمين"
]
},
"bot_phrases": {
"english": ["turning right"],
"arabic": [
"أستدير يميناً", "أستدير يمينا", "استدير يميناً", "استدير يمينا", "ألف يميناً", "ألف يمينا",
"بلف يمين", "بدور يمين", "أدور يمين", "أحوّد يمين", "أخش يمين"
]
}
},
"turn_left": {
"canonical": "turn left",
"user_phrases": {
"english": ["turn left", "rotate left", "spin left", "go left", "face left", "left"],
"arabic": [
"استدر يساراً", "استدر يسارا", "ادر يساراً", "ادر يسارا", "لف يسار", "لف يساراً", "يسار", "شمال",
"دور يسار", "دور شمال", "دور لليسار", "دور على اليسار", "دور ع الشمال",
"خش يسار", "خش شمال", "خش ع الشمال", "حوّد شمال", "حود شمال", "حوّد يسار", "حوّد لليسار",
"دير يسار", "دير شمال", "دير لليسار", "دير على ليسار", "دير ع الشمال",
"لف لليسار", "لف على اليسار", "لف عاشمال", "لف ع الشمال",
"روح يسار", "روح شمال", "اتجه يسار", "اتجه شمال", "اتجه لليسار", "تجه شمال"
]
},
"bot_phrases": {
"english": ["turning left"],
"arabic": [
"أستدير يساراً", "أستدير يسارا", "استدير يساراً", "استدير يسارا", "ألف يساراً", "ألف يسارا",
"بلف شمال", "بدور شمال", "أدور شمال", "أحوّد شمال", "أخش شمال"
]
}
},
"turn_around": {
"canonical": "turn around",
"user_phrases": {
"english": ["turn around", "turn back", "spin around", "about face", "face the other way", "spin in place", "spin yourself", "turn yourself around"],
"arabic": [
"استدر للخلف", "استدر إلى الوراء", "اتجه للخلف", "ادر للخلف", "ارجع وجهك",
"لف ورا", "لف للورا", "لف للخلف", "دور ورا", "دور للورا", "دور للخلف",
"اتلف", "اتلف ورا", "دير وراك", "دير للورا", "استدر ورا",
"اقلب وجهك", "اقلب", "اعكس اتجاهك", "غيّر اتجاهك",
"لف حول نفسك", "لف حوالين نفسك", "لف على نفسك",
"دور حول نفسك", "دور حوالين نفسك", "دور على نفسك",
"استدر حول نفسك", "استدر حوالين نفسك",
"حول نفسك", "حوالين نفسك",
"لف بنفسك", "دور بنفسك"
]
},
"bot_phrases": {
"english": ["turning around"],
"arabic": [
"أستدير للخلف", "أستدير إلى الوراء", "استدير للخلف",
"بلف ورا", "بدور للورا", "أدور للخلف",
"أستدير حول نفسي", "ألف حول نفسي", "أدور حول نفسي",
"بلف حول نفسي", "بدور حول نفسي"
]
}
},
"move_forward": {
"canonical": "move forward",
"user_phrases": {
"english": ["move forward", "go forward", "walk forward", "step forward", "forward", "keep going", "walk ahead", "move ahead"],
"arabic": [
"تحرك للأمام", "تحرك إلى الأمام", "اذهب للأمام", "امش للأمام", "تقدم", "للأمام", "أمام",
"روح قدام", "روح لقدام", "روح كدام",
"امشي قدام", "امشي لقدام", "امش قدام",
"اتقدم", "تقدم لقدام", "يلا قدام", "يالله قدام",
"فوت", "فوت لقدام", "فوت قدام",
"سير لقدام", "سير قدام", "اسير قدام",
"خش قدام", "تعال قدام", "للقدام", "قدام"
]
},
"bot_phrases": {
"_": "Direction-anchored only. We do NOT add destination-phrased needles like 'walking towards X' / 'أمشي نحو X' even though Gemini occasionally says them — those false-fire when Gemini DESCRIBES a third party ('I see a person walking towards us'). Persona RULE 4b forbids destination phrasing in motion confirmations; trust the prompt. If a real motion phrase ever slips through and we miss it, add the EXACT slip to this list — not generic destination patterns.",
"english": [
"moving forward", "walking forward", "stepping forward",
"going forward", "going ahead", "heading forward", "walking ahead"
],
"arabic": [
"أتحرك للأمام", "أتحرك إلى الأمام", "أتقدم", "أمشي للأمام", "أذهب للأمام",
"بمشي قدام", "براح قدام", "أروح قدام", "بتقدم", "بفوت لقدام"
]
}
},
"move_backward": {
"canonical": "move backward",
"user_phrases": {
"english": ["move back", "move backward", "go back", "go backward", "walk back", "walk backward", "step back", "backward", "back", "reverse"],
"arabic": [
"تحرك للخلف", "تحرك إلى الخلف", "اذهب للخلف", "امش للخلف", "ارجع", "ارجع للخلف", "للخلف", "خلف",
"روح ورا", "روح للورا", "ارجع ورا", "ارجع للورا",
"امشي ورا", "امشي للورا", "امش ورا",
"تأخر", "تراجع", "ورا", "للورا",
"رجع للخلف", "ارجع لورا", "رجع لورا",
"سير للورا", "سير ورا"
]
},
"bot_phrases": {
"english": ["moving backward", "moving back", "walking backward", "walking back", "stepping back", "going back"],
"arabic": [
"أتحرك للخلف", "أتحرك إلى الخلف", "أرجع", "أمشي للخلف", "أعود للخلف",
"بمشي ورا", "براح ورا", "أرجع ورا", "بتراجع"
]
}
},
"move_right": {
"canonical": "move right",
"user_phrases": {
"english": ["step right", "move right", "slide right", "strafe right", "sidestep right"],
"arabic": [
"تحرك يميناً", "تحرك يمينا", "خطوة يمين", "اتجه يميناً",
"زحلق يمين", "زحلق لليمين", "اتزحلق يمين",
"خطوة لليمين", "خطوة ع اليمين", "خطوة على اليمين",
"حرك يمين", "حرّك يمين", "اخطو يمين", "اخطو لليمين",
"زح يمين", "زح لليمين"
]
},
"bot_phrases": {
"english": ["moving right", "stepping right", "sliding right"],
"arabic": [
"أتحرك يميناً", "أتحرك يمينا", "أخطو يميناً",
"بزحلق يمين", "أزحلق يمين", "بخطو يمين"
]
}
},
"move_left": {
"canonical": "move left",
"user_phrases": {
"english": ["step left", "move left", "slide left", "strafe left", "sidestep left"],
"arabic": [
"تحرك يساراً", "تحرك يسارا", "خطوة يسار", "اتجه يساراً",
"زحلق يسار", "زحلق شمال", "زحلق لليسار", "اتزحلق شمال",
"خطوة شمال", "خطوة لليسار", "خطوة ع الشمال", "خطوة على اليسار",
"حرك يسار", "حرك شمال", "حرّك شمال", "اخطو يسار", "اخطو شمال", "اخطو لليسار",
"زح يسار", "زح شمال"
]
},
"bot_phrases": {
"english": ["moving left", "stepping left", "sliding left"],
"arabic": [
"أتحرك يساراً", "أتحرك يسارا", "أخطو يساراً",
"بزحلق شمال", "أزحلق شمال", "بخطو شمال"
]
}
},
"stop": {
"canonical": "stop",
"user_phrases": {
"english": ["stop", "halt", "wait", "pause", "freeze", "hold", "stop moving", "stand still", "don't move"],
"arabic": [
"توقف", "قف مكانك", "اوقف", "انتظر", "اثبت", "لا تتحرك",
"وقّف", "وقف", "وگف", "بطّل", "بطل حركة", "بطل",
"خلاص", "كفاية", "يكفي",
"استنى", "استنى شوية", "اصبر", "صبر", "اصبر شوية",
"تو", "ثبات", "اثبت مكانك", "ما تتحرك", "ما تمشي",
"هدّي", "هدّى", "روّق"
]
},
"bot_phrases": {
"_": "Removed 'أنتظر' from this list — it's also a pause_motion bot_phrase ('I wait' is more naturally a pause than a stop). Keeping it here would cause Gemini saying 'أنتظر' to fire BOTH stop and pause_motion in the same chunk.",
"english": ["stopping", "halting", "holding"],
"arabic": [
"أتوقف", "توقفت",
"بوقّف", "وقّفت", "بستنى", "أصبر", "بطّلت"
]
}
},
"sit_down": {
"canonical": "sit down",
"user_phrases": {
"english": ["sit down", "sit", "take a seat", "have a seat"],
"arabic": [
"اجلس", "ارتح", "اقعد",
"اقعد على الأرض", "اقعد ع الأرض", "اجلس على الأرض",
"اگعد", "گلس", "تفضل اقعد", "ارتاح",
"خد راحتك واقعد", "اقعد يا سند"
]
},
"bot_phrases": {
"english": ["sitting down", "sitting"],
"arabic": [
"أجلس", "أقعد", "جلست",
"بقعد", "قعدت", "بجلس", "أرتاح"
]
}
},
"stand_up": {
"canonical": "stand up",
"user_phrases": {
"english": ["stand up", "stand", "get up", "rise"],
"arabic": [
"قف", "انهض", "ارفع نفسك",
"قوم", "گوم", "نوض", "نض",
"قوم على رجلك", "قوم على رجليك", "انهض من مكانك",
"اوقف على رجلك"
]
},
"bot_phrases": {
"english": ["standing up", "getting up", "rising"],
"arabic": [
"أقف", "أنهض", "وقفت",
"بقوم", "قمت", "نضت", "بنهض"
]
}
},
"wave_hello": {
"canonical": "wave hello",
"user_phrases": {
"english": ["wave hello", "wave", "say hi", "greet", "wave to me", "wave at me"],
"arabic": [
"لوّح", "لوح", "لوّح بيدك", "حيّ", "سلّم",
"لوّح لي", "لوّحلي", "لوّح بيدك لي",
"سلّم بيدك", "سلّم علي", "حيّيني", "حيي",
"هز يدك", "هز إيدك", "هزّ ايدك",
"قول هاي", "قول مرحبا", "قول السلام",
"حييني", "سلم علي"
]
},
"bot_phrases": {
"english": ["waving hello", "waving", "saying hi", "greeting"],
"arabic": [
"ألوّح", "ألوح", "ألوّح بيدي", "أحيّ", "أسلّم",
"بلوّح", "بسلّم", "بهز إيدي", "أهز يدي"
]
}
},
"raise_arm": {
"canonical": "raise arm",
"user_phrases": {
"english": ["raise arm", "raise your arm", "lift your arm", "arm up", "hand up"],
"arabic": [
"ارفع يدك", "ارفع ذراعك", "اليد للأعلى",
"ارفع إيدك", "ارفع ايدك", "ارفع يديك",
"طلّع يدك", "طلع يدك", "طلّع إيدك", "طلع إيدك",
"علّي يدك", "علّي إيدك", "علي يدك",
"يدك فوق", "إيدك فوق", "اليد فوق", "حط يدك فوق",
"ارفع اليد", "ارفع ايديك"
]
},
"bot_phrases": {
"english": ["raising arm", "raising my arm", "lifting my arm", "arm up"],
"arabic": [
"أرفع يدي", "أرفع ذراعي",
"برفع إيدي", "بطلّع إيدي", "بعلّي إيدي", "رفعت يدي"
]
}
},
"lower_arm": {
"canonical": "lower arm",
"user_phrases": {
"english": ["lower arm", "lower your arm", "drop your arm", "arm down", "hand down", "rest your arm"],
"arabic": [
"اخفض يدك", "اخفض ذراعك", "اليد للأسفل", "نزل يدك",
"نزّل يدك", "نزّل إيدك", "نزل إيدك", "نزّل ايدك",
"حط يدك", "حط إيدك", "حط ايدك", "خلي يدك تحت",
"طيح يدك", "طيّح يدك", "طيح إيدك",
"يدك تحت", "إيدك تحت", "اليد تحت",
"ارح يدك", "ريّح يدك", "ريّح إيدك"
]
},
"bot_phrases": {
"english": ["lowering arm", "lowering my arm", "dropping my arm", "arm down"],
"arabic": [
"أخفض يدي", "أخفض ذراعي", "أنزل يدي",
"بنزّل إيدي", "نزّلت إيدي", "بريّح إيدي"
]
}
},
"point": {
"canonical": "point",
"user_phrases": {
"english": ["point", "point at it", "point to it", "point there"],
"arabic": [
"اشر", "أشِر", "اشر إلى", "اشر هناك",
"أشّر", "اشّر", "أشر بأصبعك", "أشر بصبعك", "اشر بصبعك",
"شاور", "شاورلي", "أشاور", "شاور هناك",
"ورّيني", "وريني", "ورجيني",
"بصبعك", "حدد", "حدد بصبعك"
]
},
"bot_phrases": {
"english": ["pointing"],
"arabic": [
"أشير",
"بشاور", "بأشر", "بشير", "أشّرت"
]
}
},
"come_here": {
"canonical": "come here",
"user_phrases": {
"english": ["come here", "come to me", "come closer", "approach", "get closer", "come over here", "come"],
"arabic": [
"تعال", "تعال هنا", "تعال إليّ", "اقترب", "تقرب",
"تعالى", "تعالى هنا", "تعالى لي", "تعالى عندي",
"تعا", "تعا لهون", "تعا عندي",
"أجي", "أرواح", "ارواحلي",
"قرّب", "قرب", "قرّب لي", "قرّب مني", "قرب مني",
"گرّب", "گرب لي",
"يلا تعال", "يا الله تعال", "تعال جنبي", "خش جنبي",
"تعال يا سند", "تعال بسرعة", "هلم"
]
},
"bot_phrases": {
"_": "Expanded Arabic forms because Gemini sometimes emits non-canonical conjugations (e.g. 'أتعال' is grammatically wrong but phonetically what it produces for 'I come'). Persona Rule 5d gives Gemini correct example phrases; these extra needles catch slip cases like 'أتعال'/'بجي لك'.",
"english": [
"coming over", "coming to you", "approaching",
"coming to find you", "coming your way",
"i'm coming", "on my way to you", "heading to you"
],
"arabic": [
"آتي إليك", "أتي إليك", "أتي", "آتي", "أتعال",
"أنا قادم", "قادم إليك", "قادم لك", "أقترب منك",
"أقترب", "أتقرّب", "بجي", "بجي لك", "جاي", "جاي لك",
"بقرّب", "أقرّب منك", "رايح لك", "أتجه إليك"
]
}
},
"follow_me": {
"canonical": "follow me",
"user_phrases": {
"english": ["follow me", "come with me", "walk with me"],
"arabic": [
"اتبعني", "تعال معي", "امش معي",
"تعال ورايا", "تعال ورائي", "تعا ورايي",
"تعال ويايا", "تعال وياي",
"خش ورايا", "خليك ورايا",
"يلا معي", "يلا معاي", "يا الله معي",
"روح ويايا", "امشي معاي", "امشي معي",
"خليك معي", "ضل معي", "ضل ورايا",
"تعال خلفي", "خش خلفي", "اتبعني يا سند"
]
},
"bot_phrases": {
"english": ["following you", "following", "coming with you"],
"arabic": [
"أتبعك", "آتي معك", "أمشي معك",
"بتبعك", "جاي ورايك", "بمشي معاك", "ماشي معاك"
]
}
},
"stay_here": {
"canonical": "stay here",
"user_phrases": {
"english": ["stay here", "stay", "wait here", "hold position", "don't follow me"],
"arabic": [
"ابق هنا", "اثبت هنا", "انتظر هنا", "لا تتبعني",
"خليك هنا", "خليك هون", "خلك هنا", "خل هنا",
"فضل هنا", "ضل هنا", "ضل هون", "ابقى هنا",
"اصبر هنا", "استنى هنا", "تو هنا",
"ثبات هنا", "اثبت بمكانك", "ابق بمكانك",
"ما تتبعني", "ما تجي ورايا", "ما تمشي معي",
"خليك بمحلك", "ضل بمحلك"
]
},
"bot_phrases": {
"english": ["staying here", "staying", "waiting here"],
"arabic": [
"أبقى هنا", "أنتظر هنا", "أثبت هنا",
"باقي هنا", "بضل هنا", "بستنى هنا", "بقعد هنا"
]
}
},
"go_home": {
"canonical": "go home",
"user_phrases": {
"english": ["go home", "return home", "head home", "go back home"],
"arabic": [
"اذهب للبيت", "اذهب إلى البيت", "ارجع للبيت", "عُد للبيت",
"روح البيت", "روح للبيت", "روح ع البيت", "روح عالبيت", "روح بيت",
"ارجع البيت", "ارجاع البيت", "ارجع عالبيت", "ارجع ع البيت",
"خش البيت", "اطلع البيت",
"ارجع للقاعدة", "روح للقاعدة", "اتجه للبيت", "اتجه للقاعدة",
"عد للبيت", "عود للبيت"
]
},
"bot_phrases": {
"english": ["going home", "heading home", "returning home"],
"arabic": [
"أعود للبيت", "أذهب للبيت", "أتجه للبيت",
"براح البيت", "راجع البيت", "بروح للبيت", "أرجع للبيت"
]
}
},
"patrol": {
"canonical": "patrol",
"user_phrases": {
"english": ["patrol", "start patrol", "begin patrol", "patrol the area", "walk the route"],
"arabic": [
"طوف", "ابدأ الدورية", "ابدأ التطواف",
"دور بالمكان", "دور حواليك", "دور حوالينا", "لف بالمكان", "لف في المكان",
"تجول", "ابدأ التجوال", "تجول بالمكان",
"حراسة", "ابدأ الحراسة", "احرس المكان",
"طوف بالمكان", "تطوّف", "روح طوف"
]
},
"bot_phrases": {
"english": ["patrolling", "starting patrol", "beginning patrol"],
"arabic": [
"أطوف", "أبدأ الدورية", "أبدأ التطواف",
"بطوف", "بدور بالمكان", "ببدأ الدورية", "أتجول"
]
}
},
"look_around": {
"canonical": "look around",
"user_phrases": {
"english": ["look around", "scan the room", "scan around", "survey the area", "have a look around"],
"arabic": [
"انظر حولك", "تفحص المكان", "افحص المكان", "تطلع حولك",
"شوف حولك", "شوف حواليك", "شوف الغرفة", "شوف اللي حواليك",
"بص حواليك", "بص حولك", "بص في الغرفة",
"طلّ حواليك", "طلّ حولك", "تطلّع حواليك", "تطلّع حولك",
"افحص الغرفة", "افحص الموقع", "افحص اللي حولك",
"اسكان المكان", "امسح المكان", "امسح بنظرك"
]
},
"bot_phrases": {
"english": ["looking around", "scanning around", "surveying the area"],
"arabic": [
"أنظر حولي", "أتفحص المكان", "أتطلع حولي",
"بشوف حواليّ", "بطلّ حواليّ", "بفحص المكان", "بمسح المكان"
]
}
},
"what_do_you_see": {
"canonical": "what do you see",
"user_phrases": {
"english": ["what do you see", "what can you see", "describe this", "describe what you see", "tell me what you see"],
"arabic": [
"ماذا ترى", "ماذا تشاهد", "صف ما تراه", "أخبرني ماذا ترى",
"شو شايف", "شو بتشوف", "شو عم تشوف", "شو تشوف",
"شنو تشوف", "شنو شايف", "شنو هذا",
"إيش تشوف", "ايش تشوف", "ايش شايف", "إيش شايف",
"وش تشوف", "وش شايف", "وش هذا",
"شايف إيه", "بتشوف إيه", "إيه اللي قدامك", "إيه اللي شايفه", "إيه ده",
"اشنو كاتشوف", "اشنو كاتشوف قدامك",
"صف لي اللي قدامك", "قول لي شو شايف", "وصف لي اللي قدامك", "وصفلي اللي قدامك",
"احكي لي اللي شايفه", "احكيلي شو شايف", "قولي شو شايف"
]
},
"bot_phrases": {
"english": [],
"arabic": []
}
},
"repeat_last": {
"_": "Memory action: re-fire the most recent dispatched command. Marcus VoiceModule keeps a deque of recent commands and intercepts this canonical — it is NEVER passed to the brain, the resolved (real) command is dispatched instead. bot_phrases use distinctive multi-word forms ('repeating that', 'doing that again') to avoid colliding with the bare 'again' / 'تاني' that users might say in a different context.",
"canonical": "repeat last",
"user_phrases": {
"english": [
"do that again", "do it again", "again", "repeat", "redo",
"one more time", "once more", "do the same again",
"say that again", "do it once more", "repeat that",
"another time", "another one"
],
"arabic": [
"كرر", "كرّر", "كررها", "أعد", "أعدها", "أعد العملية",
"تاني", "مرة ثانية", "مرة تانية", "مرة كمان", "مرة أخرى",
"كرر اللي سويته", "كرر الحركة", "أعد الحركة", "مرة ثانية لو سمحت",
"أعد الكرة", "كمان مرة"
]
},
"bot_phrases": {
"english": [
"repeating that", "doing that again", "doing it again",
"one more time", "doing the same again", "repeating the last action"
],
"arabic": [
"أعيد الحركة", "أكرر الحركة", "مرة ثانية", "أعيدها",
"أكررها", "أعيد آخر حركة", "أكرر آخر حركة"
]
}
},
"pause_motion": {
"_": "Soft hold: brain motion loops freeze in place (zero velocity) and wait. Resume continues the SAME motion from where it paused — different from stop, which terminates the chain. Marcus VoiceModule intercepts this canonical and sets motion_pause directly; brain never sees it. Persona Rule 10 instructs Gemini to confirm with 'Pausing.' / 'أتوقف مؤقتاً.'.",
"canonical": "pause motion",
"user_phrases": {
"english": [
"pause", "pause motion", "hold on", "hold position", "hold up",
"wait a moment", "wait a sec", "freeze for a moment", "one second",
"give me a sec", "pause for a moment", "stay there"
],
"arabic": [
"توقف لحظة", "انتظر لحظة", "ثبت", "ثبت مكانك",
"استنى شوي", "استنى ثانية", "ثانية واحدة", "لحظة", "هدّي شوي",
"ايقاف مؤقت", "وقفة قصيرة", "انتظرني", "خليك مكانك"
]
},
"bot_phrases": {
"english": [
"pausing", "holding position", "holding for a moment",
"freezing in place", "pausing for a moment"
],
"arabic": [
"أتوقف مؤقتاً", "أتوقف لحظة", "أنتظر", "أثبت مكاني",
"إيقاف مؤقت", "انتظار قصير", "أبقى مكاني"
]
}
},
"resume_motion": {
"_": "Clears motion_pause so the previously-running brain loop continues. If nothing was actually paused, this is a quiet no-op. Persona Rule 10 instructs Gemini to confirm with 'Resuming.' / 'أكمل.'.",
"canonical": "resume motion",
"user_phrases": {
"english": [
"resume", "continue", "go on", "carry on", "keep going",
"proceed", "resume motion", "continue what you were doing",
"you can continue", "go ahead now"
],
"arabic": [
"أكمل", "استكمل", "كمّل", "كمل", "تابع", "استمر",
"كمّل الحركة", "كمّل اللي كنت فيه", "تابع اللي كنت تسوي",
"روح تابع", "استمر بالحركة"
]
},
"bot_phrases": {
"english": [
"resuming", "continuing", "carrying on", "resuming motion",
"picking up where I left off"
],
"arabic": [
"أكمل", "أستمر", "أتابع", "أكمل الحركة",
"أكمل من حيث توقفت", "أتابع اللي كنت أسويه"
]
}
},
"start_recording": {
"_": "Begin capturing subsequent dispatched commands into a named sequence buffer. Marcus VoiceModule intercepts this canonical and starts recording. Optionally pre-name the sequence here ('start recording as my-greet') — otherwise the user must name it on save. The recording is in-memory until save_sequence is called.",
"canonical": "start recording",
"user_phrases": {
"english": [
"start recording", "begin recording", "record this",
"remember this sequence", "record what i do",
"record sequence", "start saving moves", "save what i do"
],
"arabic": [
"ابدأ التسجيل", "ابدأ تسجيل", "ابدا التسجيل",
"سجل اللي بسويه", "سجل الحركات",
"سجل ما أفعله", "ابدأ الحفظ"
]
},
"bot_phrases": {
"english": [
"starting to record", "recording your sequence",
"i'm recording", "recording started"
],
"arabic": [
"بدأت التسجيل", "أسجل الحركات", "بدأت تسجيل التسلسل",
"بدأت أسجل"
]
}
},
"save_sequence": {
"_": "Stop recording and persist the buffered commands. The user typically appends a name in the SAME utterance ('save sequence as my-greet') — Gemini extracts the name and includes it in the confirmation. Marcus parses the bot_phrase, extracts the trailing name, calls Sequences.save_recording(name).",
"canonical": "save sequence",
"user_phrases": {
"english": [
"save sequence", "save this sequence", "save it as",
"save recording", "stop recording and save",
"stop recording save as", "save recording as",
"name this sequence", "call this sequence"
],
"arabic": [
"احفظ التسلسل", "احفظ التسجيل", "احفظ باسم",
"أوقف التسجيل واحفظ", "احفظ هذا التسلسل",
"سمي هذا التسلسل", "احفظ التسجيل باسم", "خلصت سجل"
]
},
"bot_phrases": {
"english": [
"saved sequence", "saved as", "saving sequence as",
"recording saved", "saved your sequence as"
],
"arabic": [
"حفظت التسلسل", "حفظت باسم", "أحفظ التسلسل باسم",
"تم الحفظ"
]
}
},
"play_sequence": {
"_": "Replay a saved sequence by name. The bot_phrase carries the name ('Playing my-greet.'); Marcus extracts it and calls Sequences.play(name) → enqueues each command on the motion worker queue. Sequence commands execute serially with normal interrupt support — say 'stop' to abort the playback.",
"canonical": "play sequence",
"user_phrases": {
"english": [
"play sequence", "play my", "run my", "do my",
"play recording", "execute sequence", "run sequence",
"perform sequence"
],
"arabic": [
"شغل التسلسل", "نفذ التسلسل", "سوي اللي سجلته",
"ابدأ تسلسل", "نفذ تسجيل", "شغل التسجيل",
"اعمل التسلسل"
]
},
"bot_phrases": {
"english": [
"playing", "running", "performing sequence",
"playing sequence", "starting sequence"
],
"arabic": [
"أشغل التسلسل", "أنفذ التسلسل", "أبدأ التسلسل",
"بدأت التشغيل"
]
}
},
"cancel_recording": {
"_": "Abandon the in-progress recording without saving. Marcus calls Sequences.cancel_recording().",
"canonical": "cancel recording",
"user_phrases": {
"english": [
"cancel recording", "discard recording", "throw away recording",
"stop recording without saving", "forget the recording"
],
"arabic": [
"ألغ التسجيل", "احذف التسجيل", "اوقف التسجيل بدون حفظ",
"تجاهل التسجيل"
]
},
"bot_phrases": {
"english": [
"discarded recording", "cancelled recording",
"recording cancelled"
],
"arabic": [
"ألغيت التسجيل", "حذفت التسجيل", "تم إلغاء التسجيل"
]
}
},
"reverse_last": {
"_": "Memory action: fire the INVERSE of the last dispatched command (turn right→turn left, walk forward→walk backward, sit down→stand up, raise arm→lower arm; parametric forms flip direction and keep the number). Marcus intercepts the 'reverse last' canonical and computes the inverse from history. NOTE: the brain has its own 'undo' regex that returns to the start position via odometry — that's a different feature; reverse_last only flips the LAST motion. Distinctive bot_phrases avoid colliding with move_backward needles like 'going back' / 'reversing'.",
"canonical": "reverse last",
"user_phrases": {
"english": [
"undo", "undo that", "reverse", "reverse that",
"go back", "take that back", "cancel that",
"undo the last", "reverse the last", "do the opposite"
],
"arabic": [
"تراجع", "تراجع عن ذلك", "ارجع ما سويت", "ارجع",
"اعكس", "اعكسها", "ألغي", "ألغي ذلك", "ألغي الحركة",
"رد ما سويت", "تراجع عن آخر حركة", "اعكس آخر حركة",
"اعمل عكس", "اعمل العكس"
]
},
"bot_phrases": {
"english": [
"reversing that", "undoing that", "reversing the last action",
"undoing the last action", "going back to before"
],
"arabic": [
"أتراجع عن ذلك", "أعكس آخر حركة", "ألغي آخر حركة",
"أتراجع عن آخر حركة", "أرجع للحالة السابقة", "أعكسها"
]
}
}
},
"_parametric_description": "Motion confirmations from Gemini that carry a NUMBER (turn 90 degrees, walk 5 steps, walk 2 meters). Each entry is regex-matched against Gemini's spoken reply; the captured groups are substituted into command_template via $1, $2 placeholders, and the resulting command_template string is dispatched to the brain — Brain/command_parser.py already parses 'turn left 90 degrees' / 'walk forward 5 steps' / 'walk 2 meters' natively. Order matters: more-specific patterns (with direction) MUST come before less-specific (no direction) so they claim spans first.",
"parametric_actions": [
{
"_": "turn left/right N degrees — 'Turning right 90 degrees.'",
"regex": "(?:turn(?:ing)?|spin(?:ning)?|rotat(?:e|ing))\\s+(left|right)\\s+(\\d+)\\s*deg(?:ree(?:s)?)?",
"command_template": "turn $1 $2 degrees"
},
{
"_": "turn left/right N steps (English) — 'Turning left 1 steps.' Maps to brain's _RE_TURN_STEP single-motion canonical 'turn left N steps' (NOT compound). User says 'turn left one step' — meaning one step's worth of left rotation.",
"regex": "(?:turn(?:ing)?|spin(?:ning)?|rotat(?:e|ing))\\s+(left|right)\\s+(\\d+(?:\\.\\d+)?)\\s*steps?",
"command_template": "turn $1 $2 steps"
},
{
"_": "Arabic أستدير يميناً/يساراً N خطوات — 'أستدير يساراً 1 خطوات' for 'لف يسار خطوة وحدة' single-motion intent. Parses as 'turn left N steps' brain canonical (not compound 'turn + walk').",
"regex": "(?:أستدير|استدر|لف+|ألف+|بلف|دور+|أدور+|بدور)\\s+(?:(?:على|ع|إلى)\\s+)?(?:يمين|يميناً|يمينا|اليمين|لليمين)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:خطوة|خطوات|خطوتين)",
"command_template": "turn right $1 steps"
},
{
"_": "Arabic أستدير يساراً N خطوات — same as above, left side.",
"regex": "(?:أستدير|استدر|لف+|ألف+|بلف|دور+|أدور+|بدور)\\s+(?:(?:على|ع|إلى)\\s+)?(?:يسار|يساراً|يسارا|اليسار|لليسار|شمال|الشمال|للشمال)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:خطوة|خطوات|خطوتين)",
"command_template": "turn left $1 steps"
},
{
"_": "turn N degrees — 'Turning 360 degrees.'",
"regex": "(?:turn(?:ing)?|spin(?:ning)?|rotat(?:e|ing))\\s+(\\d+)\\s*deg(?:ree(?:s)?)?",
"command_template": "turn $1 degrees"
},
{
"_": "walk forward/back N steps — 'Walking forward 5 steps.'",
"regex": "(?:walk(?:ing)?|step(?:ping)?|mov(?:e|ing))\\s+(forward|back(?:ward)?)\\s+(\\d+(?:\\.\\d+)?)\\s+steps?",
"command_template": "walk $1 $2 steps"
},
{
"_": "walk N steps forward/back — 'Walking 5 steps forward.'",
"regex": "(?:walk(?:ing)?|step(?:ping)?|tak(?:e|ing))\\s+(\\d+(?:\\.\\d+)?)\\s+steps?\\s+(forward|back(?:ward)?)",
"command_template": "walk $2 $1 steps"
},
{
"_": "walk N steps (default forward) — 'Taking 5 steps.'",
"regex": "(?:walk(?:ing)?|step(?:ping)?|tak(?:e|ing))\\s+(\\d+(?:\\.\\d+)?)\\s+steps?",
"command_template": "walk $1 steps"
},
{
"_": "walk forward/back N meters — 'Walking forward 2 meters.'",
"regex": "(?:walk(?:ing)?|mov(?:e|ing))\\s+(forward|back(?:ward)?)\\s+(\\d+(?:\\.\\d+)?)\\s*m(?:eter(?:s)?)?\\b",
"command_template": "walk $1 $2 meters"
},
{
"_": "walk N meters forward/back — 'Walking 2 meters forward.'",
"regex": "(?:walk(?:ing)?|mov(?:e|ing))\\s+(\\d+(?:\\.\\d+)?)\\s*m(?:eter(?:s)?)?\\s+(forward|back(?:ward)?)",
"command_template": "walk $2 $1 meters"
},
{
"_": "walk N meters (default forward) — 'Walking 2 meters.'",
"regex": "(?:walk(?:ing)?|mov(?:e|ing))\\s+(\\d+(?:\\.\\d+)?)\\s*m(?:eter(?:s)?)?\\b",
"command_template": "walk $1 meters"
},
{
"_": "Arabic turn right N degrees — covers MSA + Levantine/Syrian (لف يمين), Gulf/Iraqi (دور يمين), Egyptian (خش يمين). Verb roots: أستدير | استدر | لف | دور | خش. Direction tokens: يمين | يميناً | يمينا | اليمين (with optional على/ع prefix).",
"regex": "(?:أستدير|استدر|لف+|ألف+|دور+|أدور+|بدور|خش|أخش|بخش)\\s+(?:(?:على|ع)\\s+)?(?:يمين|يميناً|يمينا|اليمين|لليمين)\\s+(\\d+)\\s*درجة",
"command_template": "turn right $1 degrees"
},
{
"_": "Arabic turn left N degrees — also accepts شمال/الشمال (Lev/Egy 'left').",
"regex": "(?:أستدير|استدر|لف+|ألف+|دور+|أدور+|بدور|خش|أخش|بخش)\\s+(?:(?:على|ع)\\s+)?(?:يسار|يساراً|يسارا|اليسار|لليسار|شمال|الشمال|للشمال)\\s+(\\d+)\\s*درجة",
"command_template": "turn left $1 degrees"
},
{
"_": "Arabic turn N degrees (no direction) — 'أستدير 360 درجة' / 'لف 180 درجة' / 'دور 90 درجة'.",
"regex": "(?:أستدير|استدر|لف+|دور+)\\s+(\\d+)\\s*درجة",
"command_template": "turn $1 degrees"
},
{
"_": "Arabic walk N steps forward — verbs: أمشي | امشي | أتحرك | تحرك | روح | اروح. Forward: للأمام | لقدام | قدام (Egy/Lev).",
"regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+(?:للأمام|لقدام|قدام)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:خطوة|خطوات|خطوتين)",
"command_template": "walk forward $1 steps"
},
{
"_": "Arabic walk N steps back — Back: للخلف | لورا | ورا (Egy/Lev/Gulf).",
"regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+(?:للخلف|لورا|ورا)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:خطوة|خطوات|خطوتين)",
"command_template": "walk backward $1 steps"
},
{
"_": "Arabic walk N steps forward — number BEFORE direction (also common in Egy/Lev).",
"regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:خطوة|خطوات|خطوتين)\\s+(?:للأمام|لقدام|قدام)",
"command_template": "walk forward $1 steps"
},
{
"_": "Arabic walk N steps back — number BEFORE direction.",
"regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:خطوة|خطوات|خطوتين)\\s+(?:للخلف|لورا|ورا)",
"command_template": "walk backward $1 steps"
},
{
"_": "Arabic walk N steps (default forward) — 'أمشي 5 خطوات' / 'امشي خمس خطوات' (digits only — Gemini is told to use digits).",
"regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:خطوة|خطوات|خطوتين)",
"command_template": "walk $1 steps"
},
{
"_": "Arabic walk N steps — REVERSED word order: verb + unit + number ('أمشي خطوة واحدة' → after normalise_numbers → 'أمشي خطوة 1'). Common natural-Arabic phrasing where the number-adjective follows the noun. Without this entry, this very natural phrasing fails to dispatch.",
"regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+(?:خطوة|خطوات|خطوتين)\\s+(\\d+)",
"command_template": "walk $1 steps"
},
{
"_": "Arabic DUAL form 'خطوتين' (two steps) — single word, no separate digit. Without this entry the dispatcher would miss 'أمشي خطوتين' since the parametric regex needs \\d+. Maps directly to walk 2 steps.",
"regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+خطوتين",
"command_template": "walk 2 steps"
},
{
"_": "Arabic DUAL form 'مترين' (two meters) — same dual-suffix pattern as above. 'أمشي مترين' / 'أتحرك مترين' → walk 2 meters.",
"regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+مترين",
"command_template": "walk 2 meters"
},
{
"_": "Arabic DUAL form 'درجتين' (two degrees) — for completeness, though 2-degree turns are rare.",
"regex": "(?:أستدير|استدر|لف+|ألف+|دور+|أدور+|بدور)\\s+(?:يميناً|يمينا|اليمين|لليمين|يساراً|يسارا|اليسار|لليسار|شمال|الشمال|للشمال)?\\s*درجتين",
"command_template": "turn 2 degrees"
},
{
"_": "Arabic walk N meters forward — direction first.",
"regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+(?:للأمام|لقدام|قدام)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:متر|مترين|أمتار)",
"command_template": "walk forward $1 meters"
},
{
"_": "Arabic walk N meters back — direction first.",
"regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+(?:للخلف|لورا|ورا)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:متر|مترين|أمتار)",
"command_template": "walk backward $1 meters"
},
{
"_": "Arabic walk N meters forward — number first.",
"regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:متر|مترين|أمتار)\\s+(?:للأمام|لقدام|قدام)",
"command_template": "walk forward $1 meters"
},
{
"_": "Arabic walk N meters back — number first.",
"regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:متر|مترين|أمتار)\\s+(?:للخلف|لورا|ورا)",
"command_template": "walk backward $1 meters"
},
{
"_": "Arabic walk N meters (default forward).",
"regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:متر|مترين|أمتار)",
"command_template": "walk $1 meters"
},
{
"_": "Generic dialect Arabic 'turn right' WITHOUT number. Catches forms NOT in turn_right.bot_phrases.arabic, e.g. 'ألف على اليمين', 'دور لليمين', 'أستدير إلى اليمين', 'لف ع اليمين'. Verbs: لف/ألف/بلف/دور/أدور/بدور/خش/أخش/بخش/حوّد/أحوّد/أستدير/استدير. Optional prep على/ع/إلى/لـ in front of direction.",
"regex": "(?:أستدير|استدير|استدر|لف+|ألف+|بلف|دور+|أدور+|بدور|خش|أخش|بخش|حوّد|أحوّد)\\s+(?:(?:على|ع|إلى)\\s+)?(?:ال)?(?:يمين|يميناً|يمينا|لليمين)",
"command_template": "turn right"
},
{
"_": "Generic dialect Arabic 'turn left' WITHOUT number — also catches شمال/الشمال.",
"regex": "(?:أستدير|استدير|استدر|لف+|ألف+|بلف|دور+|أدور+|بدور|خش|أخش|بخش|حوّد|أحوّد)\\s+(?:(?:على|ع|إلى)\\s+)?(?:ال)?(?:يسار|يساراً|يسارا|لليسار|شمال|الشمال|للشمال)",
"command_template": "turn left"
},
{
"_": "Generic dialect Arabic 'turn around' WITHOUT number — back/wara variants.",
"regex": "(?:أستدير|استدير|استدر|لف+|ألف+|بلف|دور+|أدور+|بدور)\\s+(?:(?:إلى|لـ)\\s*)?(?:للخلف|الخلف|للوراء|الوراء|للورا|ورا|لورا)",
"command_template": "turn around"
},
{
"_": "Generic dialect Arabic 'move forward' WITHOUT meters/steps — أمشي/أتحرك/أتقدم/أروح/بمشي/بتقدم + قدام/للأمام/الأمام variants.",
"regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|أتقدم|بتقدم|أتقدّم|أروح|اروح|روح|براح|بمشي|بفوت|أفوت)\\s+(?:(?:إلى|لـ)\\s*)?(?:للأمام|الأمام|لقدام|قدام|للقدام)",
"command_template": "move forward"
},
{
"_": "Generic dialect Arabic 'move backward' WITHOUT meters/steps — أرجع/أتراجع/برجع + للخلف/ورا/الوراء variants.",
"regex": "(?:أرجع|ارجع|برجع|أتراجع|بتراجع|أتحرك|تحرك|أمشي|امشي|امش)\\s+(?:(?:إلى|لـ)\\s*)?(?:للخلف|الخلف|لورا|ورا|للوراء|الوراء)",
"command_template": "move backward"
},
{
"_": "STEP-CLOSER (English). Note: normalise_numbers runs BEFORE this regex matches and converts 'one'→'1'. The regex must accept both forms ('one step closer' OR '1 step closer'). Maps to a deterministic 1-step walk; distinct from 'walking to X' (planner) and 'come here' (YOLO-tracked).",
"regex": "(?:stepping|moving|coming|edging)\\s+closer|(?:one|1)\\s+step\\s+closer|step(?:ping)?\\s+(?:a\\s+(?:bit|little)\\s+)?forward",
"command_template": "walk forward 1 steps"
},
{
"_": "STEP-CLOSER (Arabic). Note: normalise_numbers converts 'واحدة'→'1' before this regex runs, so the optional 'one' part must match either واحدة or 1. Same single-step semantics as English.",
"regex": "أتقدم\\s+خطوة|أقترب\\s+خطوة|خطوة\\s+(?:(?:واحدة|1)\\s+)?(?:للأمام|قدام)|أقرب\\s+خطوة|أتقرّب\\s+خطوة|أقدّم\\s+خطوة|أخطو\\s+للأمام",
"command_template": "walk forward 1 steps"
},
{
"_": "WALK-TO-TARGET (English) — captures the named object after 'walking/heading/going to/towards'. Example bot phrases the dispatcher catches: 'Walking to the door.' / 'Heading to the chair.' / 'Walking towards the table.' / 'Going up to the wall.'. The captured target ($1) is plugged into the brain command 'walk to <target>', which Brain/command_parser._RE_WALK_TO routes into the LLaVA-grounded spatial planner. Constraints: (a) stops at sentence punctuation so 'walking to the door, then sitting' picks up just 'door', (b) the determiner-strip group includes 'that\\s+|this\\s+' so anaphora 'walking to that chair' captures 'chair' not 'that chair', (c) negative-lookahead before the target rejects pronouns (us|me|him|her|them|you|it|myself|yourself) so a Rule-4-violating description like 'I see a person walking towards us' does NOT fire 'walk to us'. Persona Rule 12 instructs Gemini to use these literal shapes. NOTE: target capture allows Arabic chars (U+0600U+06FF) too — handles the field case where canonical_normalizer translated an Arabic verb+connective to English ('أتجه نحو X' → 'walking to X' with X still Arabic) and we want the planner to receive the Arabic target name. Qwen2.5-VL is multilingual; passing Arabic target to LLaVA prompt works.",
"regex": "(?:approaching|(?:walking|heading|going|moving)\\s+(?:over\\s+)?(?:to|towards?|up\\s+to)\\b)\\s+(?:the\\s+|a\\s+|an\\s+|that\\s+|this\\s+|ال)?(?!(?:us|me|him|her|them|you|it|myself|yourself|themselves|himself|herself|itself|هو|هي|هم|هما|هن|نحن|أنت|أنا)\\b)([A-Za-z\\u0600-\\u06FF][A-Za-z0-9\\u0600-\\u06FF ._-]{0,40}?)\\s*(?=[.!?،,]|$)",
"command_template": "walk to $1"
},
{
"_": "WALK-TO-TARGET (Arabic) — أمشي / أتجه / أتحرك + (إلى | نحو | تجاه | باتجاه) + <target>. Character class is the WHOLE Arabic block U+0600U+06FF so vowel diacritics (fatha/damma/kasra/sukun, U+064BU+065F) inside the target name don't truncate the capture (e.g. 'البَاب' with fatha was previously stopping at the fatha and capturing None). Determiner-strip includes هذا/هذه/ذلك/تلك (anaphora) plus the optional ال prefix. Pronoun negative-lookahead rejects هو/هي/هم/أنت/نحن/نا/ي etc as targets — handled by lookbehind avoiding common pronoun stems.",
"regex": "(?:أمشي|امشي|أتحرك|تحرك|أتجه|متجه|أروح|اروح|روح)\\s+(?:إلى|نحو|تجاه|باتجاه|لـ?ال?)\\s+(?:هذا|هذه|ذلك|تلك)?\\s*(?:ال)?(?!(?:هو|هي|هم|هما|هن|نحن|أنت|أنتم|أنا)\\b)([\\u0600-\\u06FF][\\u0600-\\u06FF 0-9_-]{0,40}?)\\s*(?=[.!?،,]|$)",
"command_template": "walk to $1"
}
]
}
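
Taken together, actions and parametric_actions describe a two-layer matcher over Gemini's spoken replies. A compressed sketch of how a consumer might walk these tables — the names here are illustrative, not the real Voice/marcus_voice.py API, and the parametric pass is assumed to run first (fixed needles like 'turning right' are substrings of parametric phrases like 'Turning right 90 degrees', so the order matters):

import json, re

tables = json.load(open("Config/instruction.json"))

def match_bot_phrase(reply: str):
    """Map one spoken reply to a brain command, or None."""
    low = reply.lower()
    # Layer 1 — parametric: ordered regex list, $1/$2 filled from groups.
    for entry in tables["parametric_actions"]:
        m = re.search(entry["regex"], low)
        if m:
            cmd = entry["command_template"]
            for i, grp in enumerate(m.groups(), start=1):
                cmd = cmd.replace(f"${i}", grp or "")
            return cmd
    # Layer 2 — fixed needles: substring match against bot_phrases.
    for action in tables["actions"].values():
        bot = action.get("bot_phrases", {})
        for needle in bot.get("english", []) + bot.get("arabic", []):
            if needle.lower() in low:
                return action["canonical"]
    return None

print(match_bot_phrase("Turning right 90 degrees."))   # turn right 90 degrees
print(match_bot_phrase("Walking forward."))            # move forward
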

Config/language_tables.json Normal file

@@ -0,0 +1,222 @@
{
"_description": "Language tables for voice motion processing. Single source of truth for ALL vocabulary data — no Arabic/English motion words live in code anymore. Adding a new dialect / variant is a JSON-only edit.",
"_consumers": [
"Voice/number_words.py — spelled-out numbers → digits (EN + AR)",
"Voice/canonical_normalizer.py — Arabic structural translation → English",
"Voice/marcus_voice.py — reverse-command map for memory operations",
"Voice/sequences.py — control commands that must NOT be captured into recordings"
],
"english_fractions": {
"_description": "English fractional words. Two semantic flavours: 'additive' (combines with a preceding integer — '3 and a half steps' → 3.5) and 'leading' (standalone — 'half a meter' → 0.5). Both forms map to the same numeric value; flavours are kept separate so the parser knows the syntactic context to apply.",
"additive": {
"half": 0.5,
"halves": 0.5,
"quarter": 0.25,
"quarters": 0.25,
"third": 0.3333,
"thirds": 0.3333
},
"leading": {
"half": 0.5,
"quarter": 0.25,
"third": 0.3333
}
},
"arabic_fractions": {
"_description": "Arabic fractional words. Same additive vs leading split as English. 'Additive' applies after a digit + ون / و conjunction ('3 ونصف' → 3.5). 'Leading' applies before a unit ('نصف متر' → 0.5 meter). Multiple dialect spellings of half (نصف / نص).",
"additive": {
"نصف": 0.5,
"نص": 0.5,
"ربع": 0.25,
"ثلث": 0.3333
},
"leading": {
"نصف": 0.5,
"نص": 0.5,
"ربع": 0.25,
"ثلث": 0.3333
}
},
"english_numbers": {
"_description": "English number-word parser tables. Used by number_words.py to convert 'ninety degrees' → '90 degrees' before regex matching.",
"ones": {
"zero": 0, "one": 1, "two": 2, "three": 3, "four": 4,
"five": 5, "six": 6, "seven": 7, "eight": 8, "nine": 9,
"ten": 10, "eleven": 11, "twelve": 12, "thirteen": 13,
"fourteen": 14, "fifteen": 15, "sixteen": 16,
"seventeen": 17, "eighteen": 18, "nineteen": 19
},
"tens": {
"twenty": 20, "thirty": 30, "forty": 40, "fifty": 50,
"sixty": 60, "seventy": 70, "eighty": 80, "ninety": 90
},
"scale": {
"hundred": 100
},
"glue": ["and", "-"]
},
"arabic_numbers": {
"_description": "Arabic spelled-out numbers. Order matters — longer multi-word phrases come first so they're matched before their shorter prefixes.",
"literals": [
["ثلاثمائة وستون", 360],
["ثلاثمائة وستين", 360],
["ثلاث مائة وستون", 360],
["ثلاث مائة وستين", 360],
["مائة وثمانون", 180],
["مائة وثمانين", 180],
["مية وثمانين", 180],
["مائتان وسبعون", 270],
["مائتان وسبعين", 270],
["عشرون", 20], ["عشرين", 20],
["ثلاثون", 30], ["ثلاثين", 30],
["أربعون", 40], ["أربعين", 40], ["اربعون", 40], ["اربعين", 40],
["خمسون", 50], ["خمسين", 50],
["ستون", 60], ["ستين", 60],
["سبعون", 70], ["سبعين", 70],
["ثمانون", 80], ["ثمانين", 80],
["تسعون", 90], ["تسعين", 90],
["مائتان", 200], ["مائتين", 200], ["ميتين", 200],
["ثلاثمائة", 300], ["ثلاث مائة", 300],
["أربعمائة", 400], ["اربعمائة", 400], ["أربع مائة", 400],
["خمسمائة", 500], ["خمس مائة", 500],
["مائة", 100], ["مية", 100], ["ميه", 100],
["أحد عشر", 11], ["احد عشر", 11],
["اثنا عشر", 12], ["اثني عشر", 12],
["ثلاثة عشر", 13], ["ثلاث عشرة", 13],
["أربعة عشر", 14], ["اربعة عشر", 14],
["خمسة عشر", 15], ["خمس عشرة", 15],
["ستة عشر", 16], ["ست عشرة", 16],
["سبعة عشر", 17],
["ثمانية عشر", 18],
["تسعة عشر", 19],
["واحد", 1], ["واحدة", 1],
["اثنان", 2], ["اثنين", 2], ["اثنتان", 2], ["اثنتين", 2],
["ثلاثة", 3], ["ثلاث", 3],
["أربعة", 4], ["أربع", 4], ["اربعة", 4], ["اربع", 4],
["خمسة", 5], ["خمس", 5],
["ستة", 6], ["ست", 6],
["سبعة", 7], ["سبع", 7],
["ثمانية", 8], ["ثمان", 8],
["تسعة", 9], ["تسع", 9],
["عشرة", 10], ["عشر", 10]
]
},
"arabic_verbs": {
"_description": "Arabic verb roots → English gerund. Gerund matches Gemini's English bot_phrases (e.g. 'walking forward'). Multiple Arabic dialect roots can map to the same English verb.",
"walking": [
"أمشي", "امشي", "امش",
"أتحرك", "تحرك",
"اروح", "روح", "اروحي", "روحي",
"بمشي", "براح", "أراح",
"بفوت", "أفوت",
"سير", "اسير",
"أتجه", "اتجه", "يتجه", "نتجه",
"أتوجه", "اتوجه", "يتوجه", "توجه",
"متوجه", "متجه"
],
"turning": [
"أستدير", "استدر", "استدير",
"لفّ", "لف", "ألف", "بلف",
"دوّر", "دور", "أدور", "بدور",
"خش", "أخش", "بخش",
"حوّد", "أحوّد"
],
"walking backward": [
"أرجع", "ارجع", "برجع",
"أتراجع", "بتراجع", "تراجع"
],
"walking forward": [
"أتقدم", "بتقدم", "أتقدّم", "تقدم"
]
},
"arabic_directions": {
"_description": "Arabic direction words → English directions. Includes definite-article variants (اليمين), preposition-prefixed forms (لليمين), and dialect alternatives (شمال = left in Levantine/Egyptian).",
"right": [
"يميناً", "يمينا",
"اليمين", "لليمين",
"يمين"
],
"left": [
"يساراً", "يسارا",
"اليسار", "لليسار",
"يسار",
"الشمال", "للشمال", "شمال"
],
"forward": [
"للأمام", "الأمام", "أمام",
"لقدام", "للقدام", "قدام"
],
"backward": [
"للخلف", "الخلف",
"للوراء", "الوراء",
"لورا", "ورا", "للورا"
]
},
"arabic_units": {
"_description": "Arabic unit nouns → English unit. Singular-vs-plural form is preserved so the dispatcher's regex layer (which accepts both) sees a natural form. Dual forms (خطوتين / مترين / درجتين) live in arabic_duals.",
"step": ["خطوة"],
"steps": ["خطوات"],
"meter": ["متر"],
"meters": ["أمتار"],
"degree": ["درجة"],
"degrees": ["درجات"]
},
"arabic_duals": {
"_description": "Arabic dual forms — single words containing both count and unit ('خطوتين' = 'two steps'). Substituted as a single unit BEFORE the verb/dir/unit pass. The English target form must be readable as 'N units' so the regex layer can pick up the count.",
"2 steps": ["خطوتين"],
"2 meters": ["مترين", "أمتارين"],
"2 degrees": ["درجتين"],
"2 seconds": ["ثانيتين"]
},
"arabic_conjunctions": {
"_description": "Arabic conjunctions translated to space-padded English glue. Subbed early so subsequent verb/dir passes don't accidentally treat the conjunction as a word.",
" then ": ["ثم", "وبعدين", "بعدين", "وبعد"]
},
"arabic_connectives": {
"_description": "Arabic prepositions / determiners / pronouns. Mostly pass-through; 'إلى' / 'نحو' map to 'to' so walk-to-target patterns work. 'ع' / 'على' get folded into nothing meaningful but are translated to 'on' to avoid leaving raw Arabic mid-string.",
"to": ["إلى", "نحو", "تجاه", "باتجاه"],
"on": ["على", "ع"],
"in": ["في"],
"this": ["هذا", "هذه"],
"that": ["ذلك", "تلك"]
},
"motion_inverses": {
"_description": "Pairwise inverse map for reverse_last memory operation. Used by Voice/marcus_voice.py::_reverse_command. Parametric forms (e.g. 'turn left 90 degrees' ↔ 'turn right 90 degrees') are derived in code via regex, NOT listed here — only fixed pairs.",
"turn right": "turn left",
"turn left": "turn right",
"move forward": "move backward",
"move backward": "move forward",
"sit down": "stand up",
"stand up": "sit down",
"raise arm": "lower arm",
"lower arm": "raise arm"
},
"sequence_never_record": {
"_description": "Canonical commands that must NEVER be captured into a recording session. Control commands (start/save/cancel/play recording, pause/resume, stop, repeat/reverse) would create absurd macros if captured. Used by Voice/sequences.py::record_command.",
"canonicals": [
"start recording",
"save sequence",
"cancel recording",
"play sequence",
"list sequences",
"delete sequence",
"pause motion",
"resume motion",
"stop",
"repeat last",
"reverse last"
]
}
}
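
A rough sketch of the digit pass these tables feed (the real implementation is Voice/number_words.py, whose exact algorithm isn't shown in this diff; fraction and 'and'/'-' glue handling are omitted for brevity):

import json, re

t = json.load(open("Config/language_tables.json"))

def normalise_numbers(text: str) -> str:
    # Arabic literals: long-before-short order in the table; match them as
    # whitespace-delimited words so 'واحد' can't eat the front of 'واحدة'.
    for word, value in t["arabic_numbers"]["literals"]:
        text = re.sub(rf"(?<!\S){re.escape(word)}(?!\S)", str(value), text)
    # English: compose tens+ones pairs ('ninety' -> 90, 'twenty one' -> 21).
    en, out = t["english_numbers"], []
    words = text.split()
    i = 0
    while i < len(words):
        w = words[i].lower()
        nxt = words[i + 1].lower() if i + 1 < len(words) else ""
        if w in en["tens"] and nxt in en["ones"]:
            out.append(str(en["tens"][w] + en["ones"][nxt])); i += 2
        elif w in en["tens"] or w in en["ones"]:
            out.append(str(en["tens"].get(w, en["ones"].get(w)))); i += 1
        else:
            out.append(words[i]); i += 1
    return " ".join(out)

print(normalise_numbers("turn right ninety degrees"))   # turn right 90 degrees
print(normalise_numbers("امشي خمس خطوات"))              # امشي 5 خطوات
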


@@ -1,519 +1,142 @@
- # marcus_prompts.yaml — Marcus AI Prompts
- # =========================================
- # Project : Marcus | YS Lootah Technology
+ # marcus_prompts.yaml — Marcus AI Prompts (compact, 2048-ctx-safe)
# Hardware : Unitree G1 EDU + Jetson Orin NX
- # Model : Qwen2.5-VL 3B (Ollama, fully offline)
+ # Model : Qwen2.5-VL 3B (Ollama)
#
- # Placeholders:
- # {command} — the user's typed/spoken command (English)
- # {goal} — the navigation goal description
+ # Placeholder convention: fields surrounded by <...> are instructions, NOT
+ # text to be copied. Qwen2.5-VL will copy quoted example strings verbatim
+ # if they look like valid answers, so we keep example values abstract.
# {facts} — known facts from memory (e.g. "Kassam is the programmer")
# {target} — YOLO class being searched (e.g. "person")
# {condition} — extra condition for verify prompt (e.g. "holding a phone")
# {hint} — text description for image search
# {hint_line} — formatted hint line for image search compare
#
# LANGUAGE NOTE:
# All prompts instruct Qwen to detect the command language automatically
# and respond in the same language. No code-side language detection needed.
# English-only by policy — Arabic support was removed 2026-04-21.
# =============================================================================
# ══════════════════════════════════════════════════════════════════════════════
# MAIN PROMPT
# Used for: all standard commands (movement, arms, vision, questions)
# ══════════════════════════════════════════════════════════════════════════════
# ── MAIN PROMPT ──────────────────────────────────────────────────────────────
main_prompt: |
- You are Sanad — an advanced humanoid robot assistant built by YS Lootah Technology, Dubai.
- You are physically present in the room. You have a body, arms, and a camera.
- You follow commands from your operator and respond intelligently.
+ You are Sanad, a humanoid robot (YS Lootah Technology). You have a camera,
+ two arms, and can move. Respond to commands with ONE JSON object only — no
+ text before or after the JSON, no markdown.
{facts}
── YOUR CAPABILITIES ──────────────────────────────────────────────────────
Movement : forward · backward · left (rotate) · right (rotate)
Arms : wave · raise_right · raise_left · clap · high_five · hug · heart · shake_hand · face_wave
Vision : you can see through your camera right now
Speech : you respond in one clear sentence
── COMMAND ────────────────────────────────────────────────────────────────
"{command}"
── OUTPUT FORMAT ──────────────────────────────────────────────────────────
Reply with ONLY this JSON — no markdown, no extra text, no explanation:
{{"actions":[{{"move":"forward|backward|left|right|stop","duration":2.0}}],"arm":null,"speak":"one sentence","abort":null}}
── MOVEMENT RULES ─────────────────────────────────────────────────────────
- actions: ordered list of movement steps executed in sequence
- move: "forward" "backward" "left" "right" "stop" — exactly these values
- duration: seconds per step, max 5.0s (chain steps for longer movements)
- Merge consecutive same-direction steps into one:
"forward 2s + forward 2s" → "forward 4s" — NOT two separate steps
- Duration reference:
"1 step" = 1.0s
"tiny step" = 0.3s
"half a step" = 0.5s
"2 steps" = 2.0s
"3 steps" = 3.0s
"45 degrees" = 2.5s
"90 degrees" = 5.0s
"180 degrees" = 10.0s
- Speed modifiers:
"slowly" → multiply duration by 0.5
"quickly" / "fast" → multiply duration by 1.5 (cap at 5.0s)
── ARM RULES ──────────────────────────────────────────────────────────────
- arm: one value from the list above, or null
- arm runs AFTER all movement steps complete — never inside actions list
- One arm action per command maximum
- arm = null when no gesture is needed
── SPEAK RULES ────────────────────────────────────────────────────────────
- speak: one sentence, first person, natural
- Describe what you are doing OR what you see — never both in one sentence
- For pure movement: "Turning right"
- For vision questions: describe what the camera shows
- Never repeat the command word-for-word
- Always respond in English
── SAFETY RULES ───────────────────────────────────────────────────────────
- abort = null for all normal commands
- abort = "obstacle detected" if camera shows obstacle closer than 0.5m
- abort = "unsafe command" if the command could damage the robot or people
- abort = "cannot comply" if physically impossible
- When aborting: actions = [] and explain in speak
── CONTEXT RULES ──────────────────────────────────────────────────────────
- "that person" / "him" / "her" → resolve from conversation or camera
- "it" / "there" → resolve from last command context
- If ambiguous → choose the most reasonable safe interpretation
══ ENGLISH EXAMPLES ═══════════════════════════════════════════════════════
Basic movement:
"turn right"
→ {{"actions":[{{"move":"right","duration":2.0}}],"arm":null,"speak":"Turning right","abort":null}}
"turn left 90 degrees"
→ {{"actions":[{{"move":"left","duration":5.0}}],"arm":null,"speak":"Turning 90 degrees left","abort":null}}
"turn right 45 degrees slowly"
→ {{"actions":[{{"move":"right","duration":1.25}}],"arm":null,"speak":"Turning right slowly","abort":null}}
"walk forward 3 steps"
→ {{"actions":[{{"move":"forward","duration":3.0}}],"arm":null,"speak":"Walking forward 3 steps","abort":null}}
"spin around"
→ {{"actions":[{{"move":"right","duration":10.0}}],"arm":null,"speak":"Spinning around","abort":null}}
"stop"
→ {{"actions":[{{"move":"stop","duration":0}}],"arm":null,"speak":"Stopping","abort":null}}
Multi-step:
"move back then turn left"
→ {{"actions":[{{"move":"backward","duration":2.0}},{{"move":"left","duration":2.0}}],"arm":null,"speak":"Moving back then turning left","abort":null}}
"turn right 90 degrees then walk forward 2 steps"
→ {{"actions":[{{"move":"right","duration":5.0}},{{"move":"forward","duration":2.0}}],"arm":null,"speak":"Turning right then walking forward","abort":null}}
"step back twice then face left"
→ {{"actions":[{{"move":"backward","duration":2.0}},{{"move":"left","duration":2.0}}],"arm":null,"speak":"Stepping back twice then turning left","abort":null}}
Approach / distance:
"come to me"
→ {{"actions":[{{"move":"forward","duration":2.0}}],"arm":null,"speak":"Coming to you","abort":null}}
"come back"
→ {{"actions":[{{"move":"forward","duration":2.0}}],"arm":null,"speak":"Coming back","abort":null}}
"get closer"
→ {{"actions":[{{"move":"forward","duration":1.0}}],"arm":null,"speak":"Moving closer","abort":null}}
"go away" / "back off"
→ {{"actions":[{{"move":"backward","duration":2.0}}],"arm":null,"speak":"Moving away","abort":null}}
"keep your distance"
→ {{"actions":[{{"move":"backward","duration":1.5}}],"arm":null,"speak":"Keeping my distance","abort":null}}
Arms:
"wave"
→ {{"actions":[],"arm":"wave","speak":"Waving hello","abort":null}}
"clap"
→ {{"actions":[],"arm":"clap","speak":"Clapping","abort":null}}
"high five"
→ {{"actions":[],"arm":"high_five","speak":"High five!","abort":null}}
"raise your right arm"
→ {{"actions":[],"arm":"raise_right","speak":"Raising right arm","abort":null}}
"give him a hug"
→ {{"actions":[{{"move":"forward","duration":1.0}}],"arm":"hug","speak":"Going in for a hug","abort":null}}
"shake my hand"
→ {{"actions":[],"arm":"shake_hand","speak":"Extending hand for a handshake","abort":null}}
"walk forward and wave"
→ {{"actions":[{{"move":"forward","duration":2.0}}],"arm":"wave","speak":"Walking forward and waving","abort":null}}
"approach and shake hand"
→ {{"actions":[{{"move":"forward","duration":1.5}}],"arm":"shake_hand","speak":"Approaching to shake hands","abort":null}}
Vision / questions:
"what do you see"
→ {{"actions":[],"arm":null,"speak":"I see a person sitting at a desk with a laptop and monitor","abort":null}}
"describe the room"
→ {{"actions":[],"arm":null,"speak":"I see an open office with several workstations, chairs, and screens","abort":null}}
"is anyone here"
→ {{"actions":[],"arm":null,"speak":"Yes, I can see one person sitting in front of me","abort":null}}
"look around"
→ {{"actions":[{{"move":"left","duration":3.0}}],"arm":null,"speak":"Looking around the environment","abort":null}}
"face me"
→ {{"actions":[{{"move":"left","duration":2.0}}],"arm":null,"speak":"Turning to face you","abort":null}}
Identity / facts:
"my name is Kassam"
→ {{"actions":[],"arm":null,"speak":"Nice to meet you Kassam, I will remember that","abort":null}}
"who are you"
→ {{"actions":[],"arm":null,"speak":"I am Sanad, a humanoid robot assistant by YS Lootah Technology","abort":null}}
Safety:
"walk into the wall"
→ {{"actions":[],"arm":null,"speak":"I cannot do that safely","abort":"unsafe command"}}
Command: "{command}"
JSON:
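Before any of these replies reach the motors, the consumer should validate them rather than trust the model. A minimal gate, assuming the schema above (helper name and limits are illustrative, not from this repo):

    import json

    ALLOWED_MOVES = {"forward", "backward", "left", "right", "stop"}

    def parse_reply(raw: str) -> dict:
        """Reject malformed LLM replies before they reach motion code."""
        data = json.loads(raw)                      # raises on non-JSON
        for key in ("actions", "arm", "speak", "abort"):
            if key not in data:
                raise ValueError("missing key: " + key)
        for step in data["actions"]:
            if step["move"] not in ALLOWED_MOVES:
                raise ValueError("bad move: %r" % (step["move"],))
            if not (0.0 <= float(step["duration"]) <= 10.0):
                raise ValueError("bad duration: %r" % (step["duration"],))
        return data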
# ══════════════════════════════════════════════════════════════════════════════
# GOAL PROMPT
# Used for: navigate_to_goal() — YOLO found the class, now verify with LLaVA
# ══════════════════════════════════════════════════════════════════════════════
goal_prompt: |
You are Sanad, a humanoid robot actively navigating toward a specific target.
YOUR MISSION: "{goal}"
Study the camera image carefully and honestly.
Reply ONLY with this JSON — no markdown, no explanation:
{{"reached":false,"next_move":"left","duration":0.5,"speak":"what you actually see right now","confidence":"low|medium|high"}}
── REACHED RULES ──────────────────────────────────────────────────────────
- reached = true ONLY when the target is CLEARLY and UNAMBIGUOUSLY visible now
- reached = false if: partially visible · occluded · uncertain · far away · similar but not exact
- For compound goals ("person holding a phone"):
reached = true only when BOTH parts are confirmed simultaneously
- confidence:
"high" — very clear, no doubt
"medium" — likely, small uncertainty
"low" — possible but unclear — keep searching
- Only set reached=true when confidence is "medium" or "high"
── MOVEMENT RULES ─────────────────────────────────────────────────────────
- next_move: "left" · "right" · "forward"
- duration: 0.3 to 0.8 seconds per step
- Default when not found: "left" at 0.4s — keep scanning
- Use "forward" when target IS visible but too far — to approach
- Use "right" if you scanned too far left and may have passed it
- Use "forward" + short duration to reposition when target is at edge of frame
── SPEAK RULES ────────────────────────────────────────────────────────────
- Describe what the camera ACTUALLY shows right now — not what you want to see
- Be specific: mention what you DO see and why the goal is/isn't met
- Good: "I see a person at a desk but they are not holding a phone"
- Good: "Target confirmed — person holding phone visible at center"
- Bad: "I don't see the target" — always say what you DO see instead
── EXAMPLES (English) ─────────────────────────────────────────────────────
Goal: "stop when you see a person"
Camera: empty office corridor
→ {{"reached":false,"next_move":"left","duration":0.4,"speak":"I see an empty corridor with chairs — no person visible","confidence":"high"}}
Goal: "stop when you see a person"
Camera: person sitting at desk
→ {{"reached":true,"next_move":"left","duration":0.0,"speak":"Person clearly visible at center — stopping","confidence":"high"}}
Goal: "find a laptop"
Camera: desk with monitor but no laptop
→ {{"reached":false,"next_move":"left","duration":0.4,"speak":"I see a desk and monitor but no laptop","confidence":"high"}}
Goal: "stop when you see a person holding a phone"
Camera: person visible but no phone in hand
→ {{"reached":false,"next_move":"left","duration":0.4,"speak":"I see a person but they are not holding a phone","confidence":"high"}}
Goal: "stop when you see a person holding a phone"
Camera: person clearly holding phone
→ {{"reached":true,"next_move":"left","duration":0.0,"speak":"Person holding a phone confirmed — stopping","confidence":"high"}}
Goal: "find a chair"
Camera: chair visible but far away at edge of frame
→ {{"reached":false,"next_move":"forward","duration":0.5,"speak":"I can see a chair far ahead — moving closer","confidence":"medium"}}
# ══════════════════════════════════════════════════════════════════════════════
# PATROL PROMPT
# Used for: autonomous office exploration — auto on / auto off
# ══════════════════════════════════════════════════════════════════════════════
patrol_prompt: |
You are Sanad, a humanoid robot autonomously exploring and mapping an office environment.
Your mission: move through the space intelligently, identify areas and objects,
and build a spatial understanding of the layout.
Study the camera image carefully.
Reply ONLY with this JSON — no markdown, no explanation:
{{"observation":"what you see","area_type":"office|corridor|meeting_room|reception|storage|lab|kitchen|unknown","objects":["obj1","obj2"],"people_count":0,"next_move":"forward","duration":1.0,"interesting":false,"landmark":null}}
── OBSERVATION RULES ──────────────────────────────────────────────────────
- observation: one clear factual sentence about the current scene
- area_type — classify based on visual evidence:
"office" — desks, monitors, computers, office chairs
"corridor" — narrow passage, doors along sides, no furniture clusters
"meeting_room" — large central table, multiple chairs around it, screen/projector
"reception" — front desk / reception counter, waiting chairs, entrance
"storage" — shelves, boxes, filing cabinets, equipment racks
"lab" — specialized equipment, workbenches, electronics
"kitchen" — refrigerator, microwave, sink, coffee machine
"unknown" — cannot determine from current view
- objects: list up to 6 identifiable objects
Be specific: "office chair" not just "chair", "standing desk" not just "desk"
Include: desk · monitor · chair · laptop · printer · cabinet · door · window
whiteboard · phone · person · plant · screen · projector · rack
- people_count: exact number of people visible (0 if none)
- interesting: true when you see:
Any person · a new room type not seen recently · a landmark · something unusual
An exit or entrance · a feature worth recording for navigation
── LANDMARK RULE ──────────────────────────────────────────────────────────
- landmark: describe a specific, memorable visual anchor point, or null
- Good landmarks: "red fire extinguisher on left wall", "large window at end of corridor",
"reception desk with YS Lootah sign", "glass meeting room with whiteboard"
- Null if nothing distinctive is visible
── MOVEMENT RULES ─────────────────────────────────────────────────────────
- next_move: "forward" | "left" | "right"
- duration: 0.5 to 2.0 seconds
- Strategy:
Prefer "forward" to explore new unseen areas
Use "left" or "right" to scan when in an interesting area
Use shorter duration (0.5-0.8s) near people, obstacles, or interesting objects
Use longer duration (1.5-2.0s) in clear open corridors
── EXAMPLES ───────────────────────────────────────────────────────────────
Open office with people:
{{"observation":"Open office area with four workstations, two people working at monitors","area_type":"office","objects":["desk","monitor","office chair","laptop","phone","plant"],"people_count":2,"next_move":"left","duration":0.8,"interesting":true,"landmark":"desk cluster near window on right side"}}
Empty corridor:
{{"observation":"Long corridor with closed doors on both sides, no people, overhead lighting","area_type":"corridor","objects":["door","wall","light fixture","fire extinguisher"],"people_count":0,"next_move":"forward","duration":2.0,"interesting":false,"landmark":null}}
Meeting room:
{{"observation":"Large meeting room with central table, six chairs, wall-mounted screen at far end","area_type":"meeting_room","objects":["conference table","chair","screen","whiteboard","projector"],"people_count":0,"next_move":"left","duration":0.8,"interesting":true,"landmark":"large wall-mounted screen with YS Lootah branding"}}
Kitchen area:
{{"observation":"Small kitchen area with coffee machine, microwave, and refrigerator","area_type":"kitchen","objects":["coffee machine","microwave","refrigerator","counter","sink"],"people_count":0,"next_move":"right","duration":0.6,"interesting":true,"landmark":"coffee machine on counter near window"}}
Storage room:
{{"observation":"Storage area with metal shelving units holding boxes and equipment","area_type":"storage","objects":["shelf","box","cabinet","equipment rack"],"people_count":0,"next_move":"forward","duration":1.0,"interesting":false,"landmark":null}}
JSON:
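Each patrol reply is one observation to fold into a spatial memory. A minimal consumer sketch (illustrative only; the real patrol storage lives elsewhere in the tree):

    import json, time

    patrol_log = []   # accumulated observations, oldest first

    def record_patrol(reply: str) -> None:
        """Store one patrol observation; surface landmarks for navigation."""
        r = json.loads(reply)
        r["t"] = time.time()
        patrol_log.append(r)
        if r["interesting"] and r["landmark"]:
            print("landmark:", r["landmark"], "in", r["area_type"])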
# ══════════════════════════════════════════════════════════════════════════════
# TALK PROMPT
# Used for: questions, statements, identity, facts — no movement
# ══════════════════════════════════════════════════════════════════════════════
talk_prompt: |
You are Sanad, a humanoid robot assistant. You have been asked a question
or given information. Do NOT move — just respond intelligently.
{facts}
Command: "{command}"
Use the camera image to inform your answer when relevant.
Reply ONLY with this JSON — no markdown, no explanation:
{{"actions":[],"arm":null,"speak":"your response in 1-2 sentences","abort":null}}
── RESPONSE RULES ─────────────────────────────────────────────────────────
- actions MUST always be [] — you are not moving
- arm MUST always be null — no gestures for talk-only
- speak: answer naturally and helpfully in 1-2 sentences maximum
- Use the camera: if asked about the environment, look at the image and describe what you see
- If given a fact ("my name is X"): acknowledge clearly and confirm you remember
- If asked who someone is: use known facts from memory + what you see in camera
- If asked about your status: report systems are functioning
- If asked opinion or advice: give a helpful, honest answer
- CRITICAL: respond in the exact same language as the command
── ENGLISH EXAMPLES ───────────────────────────────────────────────────────
"what do you see"
→ {{"actions":[],"arm":null,"speak":"I see a person sitting at a desk with a laptop and a large monitor","abort":null}}
"describe what is in front of you"
→ {{"actions":[],"arm":null,"speak":"In front of me I see an office workspace with a monitor, keyboard, and a person working","abort":null}}
"is there anyone in the room"
→ {{"actions":[],"arm":null,"speak":"Yes, I can see one person sitting at a workstation in front of me","abort":null}}
"my name is Kassam"
→ {{"actions":[],"arm":null,"speak":"Got it, nice to meet you Kassam — I will remember your name","abort":null}}
"I am your programmer"
→ {{"actions":[],"arm":null,"speak":"Understood, you are my programmer — ready for your instructions","abort":null}}
"who is that person"
→ {{"actions":[],"arm":null,"speak":"I can see a person at the desk — based on what you told me, that is Kassam","abort":null}}
"how are you"
→ {{"actions":[],"arm":null,"speak":"All systems are functioning normally and I am ready to assist you","abort":null}}
"what is your name"
→ {{"actions":[],"arm":null,"speak":"My name is Sanad, a humanoid robot assistant by YS Lootah Technology","abort":null}}
"who built you"
→ {{"actions":[],"arm":null,"speak":"I was built by YS Lootah Technology in Dubai, and my programmer is Kassam","abort":null}}
"what can you do"
→ {{"actions":[],"arm":null,"speak":"I can move through the environment, identify objects and people, navigate to goals, and respond in English","abort":null}}
"ok" / "good" / "thanks"
→ {{"actions":[],"arm":null,"speak":"Ready for your next command","abort":null}}
# ══════════════════════════════════════════════════════════════════════════════
# VERIFY PROMPT
# Used for: two-stage goal detection — confirm extra condition after YOLO finds class
# ══════════════════════════════════════════════════════════════════════════════
verify_prompt: |
You are a visual verification system for a robot.
A {target} has been detected in the camera image by the object detector.
Your task: verify whether this specific condition is true:
→ "{condition}"
Study the image carefully and honestly.
Reply with ONLY one word: yes or no
Rules:
- "yes" only if the condition is CLEARLY and VISIBLY true in this image
- "no" if you are uncertain, cannot see clearly, or the condition is not met
- Do not infer or guess — only confirm what is visually obvious
- A partially held object, unclear position, or occlusion = "no"
{facts}
Command: "{command}"
Schema (replace every <…> with your actual value):
{{"actions":[{{"move":"<forward|backward|left|right|stop>","duration":<seconds 0.0-5.0>}}],"arm":<null or one gesture>,"speak":"<one short sentence in first person>","abort":<null or short reason>}}
Rules:
- actions: ordered motion steps. duration max 5.0 s. Merge same-direction steps.
- Duration guide: 1 step = 1 s · 45° = 2.5 s · 90° = 5 s · "slowly" ×0.5 · "fast" ×1.5
- arm: one of wave · raise_right · raise_left · clap · high_five · hug · heart · shake_hand · face_wave — or null. Runs after motion.
- speak: actually describe what you are doing OR what the camera shows right now. Do NOT copy example text. First person. English.
- abort: null normally; "obstacle detected" / "unsafe command" / "cannot comply" with actions=[] when unsafe.
- CRITICAL — IF THE COMMAND IS UNCLEAR OR NOT AN ACTION:
  If the input text is a single unclear word (like "I", "alright", "ok", "um"),
  a random phrase ("I have a lot of beauty", "turn turn turn"), noise, a
  greeting ("hello", "hi"), or anything that isn't clearly a movement /
  arm / vision / memory instruction — DO NOT INVENT a command. Instead
  reply with:
  {{"actions":[],"arm":null,"speak":"Sorry, I didn't understand that — please repeat","abort":"command not understood"}}
  Better to ask again than to guess and perform the wrong action.
Examples (learn the STRUCTURE, don't reuse the speak text):
"turn right" → {{"actions":[{{"move":"right","duration":2.0}}],"arm":null,"speak":"Turning right","abort":null}}
"walk 2 steps" → {{"actions":[{{"move":"forward","duration":2.0}}],"arm":null,"speak":"Walking forward","abort":null}}
"wave" → {{"actions":[],"arm":"wave","speak":"Waving","abort":null}}
# ══════════════════════════════════════════════════════════════════════════════
# IMAGE SEARCH — COMPARE (two images: reference photo vs current camera)
# Used for: search/ /path/photo.jpg hint
# ══════════════════════════════════════════════════════════════════════════════
image_search_compare_prompt: |
You are the visual matching system for a robot performing a targeted search.
IMAGE 1 — Reference photo: shows the specific target to find.
IMAGE 2 — Current camera: shows what the robot sees right now.
{hint_line}
Task: determine if the target from IMAGE 1 is visible in IMAGE 2.
Reply ONLY with this JSON — no markdown, no explanation:
{{"found":false,"confidence":"low|medium|high","position":"left|center|right|not visible","description":"one sentence"}}
── MATCHING RULES ─────────────────────────────────────────────────────────
- found = true only when you are confident it is the SAME specific target
- This is identity matching — same person or same object, not just same category
- For people: match clothing, hair, body shape, face features — not just "a person"
- For objects: match color, shape, size, distinctive features — not just "a bag"
- confidence levels:
"high" — very clear match, high certainty (same person clearly visible)
"medium" — likely the same, minor uncertainty (similar appearance, slightly occluded)
"low" — possible match but unclear — robot should keep searching
- Stop searching only when found=true AND confidence is "medium" or "high"
- position: where in IMAGE 2 the target appears
"left" · "center" · "right" · "not visible"
- description: one honest sentence about what you see in IMAGE 2 and your reasoning
Good: "Person in blue shirt visible at center, matches reference photo clothing"
Good: "I see a person but face is not clear enough to confirm identity"
Good: "No match — the person visible is wearing different clothing"
── EXAMPLES ───────────────────────────────────────────────────────────────
Clear match:
{{"found":true,"confidence":"high","position":"center","description":"Person in blue shirt and glasses visible at center, closely matches the reference photo"}}
Likely match:
{{"found":true,"confidence":"medium","position":"right","description":"Person with similar clothing visible on right, slight occlusion but likely the same individual"}}
No match:
{{"found":false,"confidence":"high","position":"not visible","description":"I see a different person — clothing and appearance do not match the reference"}}
Unclear:
{{"found":false,"confidence":"low","position":"left","description":"Someone visible on left but partially occluded, cannot confirm identity — continuing search"}}
JSON:
# ══════════════════════════════════════════════════════════════════════════════
# IMAGE SEARCH — TEXT ONLY (description-based search, no reference photo)
# Used for: search/ person in blue shirt
# ══════════════════════════════════════════════════════════════════════════════
# ── GOAL PROMPT ──────────────────────────────────────────────────────────────
goal_prompt: |
You are Sanad navigating toward a target.
Mission: "{goal}"
Study the current camera image carefully and reply with ONE JSON — no text
before or after, no markdown. Fill every <…> with your actual judgement.
Schema:
{{"reached":<true|false>,"next_move":"<left|right|forward>","duration":<0.3-0.8>,"speak":"<one-sentence description of what THIS camera image actually shows>","confidence":"<low|medium|high>"}}
Rules:
- reached = true ONLY when the target described by the mission is CLEARLY present in this exact frame. Default to reached = false.
- "office env" ≠ hallway, door, corridor, or random room — require the specific target type (e.g. an office must show desks/monitors/workstations).
- "person" means a human body visible — not just a chair or bag that belongs to someone.
- If you are not sure the target type matches exactly → reached = false, keep searching.
- For compound goals ("person holding phone"), BOTH parts must be visible in the SAME frame.
- confidence: "high" clear · "medium" likely · "low" keep searching. Only set reached=true at medium+.
- next_move: "left" (default scan) · "right" · "forward" (approach if target visible but far).
- speak: write a concrete description of the objects visible in THIS frame, in your own words.
# ── PATROL PROMPT ────────────────────────────────────────────────────────────
patrol_prompt: |
You are Sanad autonomously exploring. Study the image and reply with ONE
JSON — no text before or after, no markdown. Replace every <…>.
Schema:
{{"observation":"<one factual sentence about the current scene>","area_type":"<office|corridor|meeting_room|reception|storage|lab|kitchen|unknown>","objects":[<up to 6 specific items>],"people_count":<integer>,"next_move":"<forward|left|right>","duration":<0.5-2.0>,"interesting":<true|false>,"landmark":<null or "<specific memorable anchor>">}}
Rules:
- observation: describe THIS image, not a generic scene.
- area_type: pick from the list based on visible evidence.
- objects: specific items ("standing desk" not "desk").
- people_count: exact integer.
- interesting = true when you see a person, new room type, entrance, or unusual object.
- landmark: a specific visual anchor (e.g. "red extinguisher on left wall") or null.
- next_move: "forward" to explore, "left"/"right" to scan.
# ── TALK PROMPT ──────────────────────────────────────────────────────────────
talk_prompt: |
You are Sanad, a humanoid robot. The user asked you something. Do NOT move.
Use the camera image when the question asks about what you see.
{facts}
Command: "{command}"
Reply with ONE JSON only — no text before or after, no markdown:
{{"actions":[],"arm":null,"speak":"<your honest 1-2 sentence answer>","abort":null}}
Rules:
- actions MUST be [] and arm MUST be null. You are not moving.
- For vision questions ("what do you see", "describe...", "who is there", "what is in front of me"): describe the actual camera image in your own words. Do NOT copy example text.
- For facts the user tells you ("my name is X"): acknowledge and say you will remember.
- For "who are you" / "what are you": introduce yourself briefly.
- Answer honestly and specifically. 1-2 sentences.
# ── VERIFY PROMPT ────────────────────────────────────────────────────────────
verify_prompt: |
A {target} was detected in the image. Verify this condition:
"{condition}"
Reply with ONLY one word: yes or no
- "yes" only if clearly and visibly true right now.
- "no" if uncertain, occluded, or condition not met.
# ── IMAGE SEARCH — COMPARE ───────────────────────────────────────────────────
image_search_compare_prompt: |
IMAGE 1 = reference photo of the target. IMAGE 2 = current camera view.
{hint_line}
Task: is the target from IMAGE 1 visible in IMAGE 2?
Reply with ONE JSON — no other text, no markdown. Replace every <…>:
{{"found":<true|false>,"confidence":"<low|medium|high>","position":"<left|center|right|not visible>","description":"<one sentence about IMAGE 2 and your reasoning>"}}
Rules:
- Identity matching: same specific person/object, not just same category.
- People: match clothing, hair, body shape, face.
- Objects: match color, shape, size, distinctive features.
- Only found=true at medium+ confidence.
# ── IMAGE SEARCH — TEXT ONLY ─────────────────────────────────────────────────
image_search_text_prompt: |
Target description: "{hint}"
Study the current camera image.
Reply with ONE JSON — no other text, no markdown. Replace every <…>:
{{"found":<true|false>,"confidence":"<low|medium|high>","position":"<left|center|right|not visible>","description":"<one sentence about what you see>"}}
Rules:
- found = true only when the image clearly matches all described attributes.
- confidence: "high" all elements confirmed · "medium" minor uncertainty · "low" unclear.
- Only report found=true at medium+ confidence.
image_search_text_prompt: |
You are the visual search system for a robot looking for a target by description.
Target description: "{hint}"
Study the current camera image carefully.
Reply ONLY with this JSON — no markdown, no explanation:
{{"found":false,"confidence":"low|medium|high","position":"left|center|right|not visible","description":"one sentence"}}
── SEARCH RULES ───────────────────────────────────────────────────────────
- found = true only when the camera shows something that clearly matches the description
- Be specific about the match — does the image actually show what was described?
- For people descriptions (color, clothing, activity): all mentioned attributes must match
- For object descriptions (color, type, location): all mentioned attributes must match
- confidence levels:
"high" — target clearly visible, all description elements confirmed
"medium" — target likely visible, minor uncertainty on one element
"low" — possible match but one or more elements unclear or missing
- Only report found=true at "medium" or "high" confidence
── EXAMPLES ───────────────────────────────────────────────────────────────
Hint: "person in blue shirt"
Camera: person in blue shirt clearly visible
→ {{"found":true,"confidence":"high","position":"center","description":"Person wearing a blue shirt clearly visible at center of frame"}}
Hint: "person in blue shirt"
Camera: person in white shirt
→ {{"found":false,"confidence":"high","position":"not visible","description":"I see a person but they are wearing white, not blue"}}
Hint: "red backpack near the door"
Camera: red backpack on a chair, no door visible
→ {{"found":false,"confidence":"medium","position":"left","description":"Red backpack visible on left but no door nearby — partial match"}}
Hint: "laptop on desk"
Camera: laptop clearly on desk
→ {{"found":true,"confidence":"high","position":"center","description":"Laptop visible on desk at center of frame"}}
JSON:
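Throughout this file, single braces ({goal}, {command}, {facts}, {hint}) are substitution slots, while doubled braces render as literal braces in the final prompt: the standard Python str.format convention. A sketch of how a loader would fill one (function name hypothetical):

    def render(template: str, **slots) -> str:
        """Fill {goal}/{command}/... slots; '{{' and '}}' collapse to literal braces."""
        return template.format(**slots)

    render('Mission: "{goal}" Reply: {{"reached":false}}', goal="find a chair")
    # -> 'Mission: "find a chair" Reply: {"reached":false}'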
218
Core/motion_log.py Normal file
View File
@ -0,0 +1,218 @@
"""
motion_log.py structured per-motion event log.
Drop-in: brain handlers and worker call log_motion(event, cmd, ...)
and the helper auto-parses direction/magnitude/unit from the canonical
string. The worker barely needs to know what it's logging.
Every motion the brain or worker executes writes ONE line to
`logs/motion.log` capturing:
timestamp | canonical | direction mag unit | target | actual | outcome | source
So a single voice command produces a paired START + END line:
17:45:01 | turn left 19 degrees | dir=left mag=19 unit=deg | target=2.4s | - | start | voice
17:45:03 | turn left 19 degrees | dir=left mag=19 unit=deg | target=2.4s | actual=2.41s | complete | voice
Or for a parameterless motion:
17:46:00 | turn right | dir=right mag=- unit=- | target=2.0s | - | start | voice
17:46:02 | turn right | dir=right mag=- unit=- | target=2.0s | actual=2.02s | complete | voice
This is independent of voice.log and transcript.log:
- voice.log = per-thread debug detail (dispatch, worker, watchdog,
reconnect)
- transcript.log = user-facing dialogue (what user said vs. what
Gemini said vs. what canonical fired)
- motion.log = ONE LINE PER MOTION with parsed params + actual
outcome for post-hoc auditing of physical movements
API:
log_motion(event, canonical, direction=..., magnitude=..., unit=...,
target_sec=..., actual_sec=..., source="voice"|"text",
outcome="start"|"complete"|"interrupted"|"error", **extra)
Events (event field):
'start' brain handler is about to issue motor commands.
target_sec is the planned duration; actual is None.
'complete' brain returned normally; actual_sec set.
'interrupted' motion_abort fired during execution.
'error' brain raised an exception; reason in extra.
"""
from __future__ import annotations
import logging
import os
import re
from logging.handlers import RotatingFileHandler
# ─── canonical parser ─────────────────────────────────────────────
#
# Map a canonical command string to (direction, magnitude, unit) so
# the structured log auto-fills these fields when the caller doesn't
# pass them explicitly.
_RE_TURN_DEG_DIR = re.compile(r"^turn\s+(left|right)\s+(\d+(?:\.\d+)?)\s*deg(?:rees?)?$", re.I)
_RE_TURN_DEG = re.compile(r"^turn\s+(\d+(?:\.\d+)?)\s*deg(?:rees?)?$", re.I)
_RE_TURN_STEP_DIR = re.compile(r"^turn\s+(left|right)\s+(\d+(?:\.\d+)?)\s*steps?$", re.I)
_RE_TURN_DIR = re.compile(r"^turn\s+(left|right|around)$", re.I)
_RE_WALK_STEP = re.compile(r"^walk\s+(?:(forward|back(?:ward)?)\s+)?(\d+(?:\.\d+)?)\s*steps?$", re.I)
_RE_WALK_M = re.compile(r"^walk\s+(?:(forward|back(?:ward)?)\s+)?(\d+(?:\.\d+)?)\s*m(?:eter(?:s)?)?$", re.I)
_RE_MOVE_DIR = re.compile(r"^move\s+(forward|back(?:ward)?|left|right)$", re.I)
_RE_WALK_TO = re.compile(r"^walk\s+to\s+(.+)$", re.I)
def parse_canonical(cmd: str) -> dict:
    """Parse a canonical command into structured motion fields.
    Returns {direction, magnitude, unit, kind}. Best-effort: if the
    canonical doesn't match any known shape, all fields are '-' and
    kind='other'."""
    out = {"direction": "-", "magnitude": "-", "unit": "-", "kind": "other"}
    if not cmd:
        return out
    s = cmd.strip().lower()
    m = _RE_TURN_DEG_DIR.match(s)
    if m:
        return {"direction": m.group(1), "magnitude": m.group(2),
                "unit": "deg", "kind": "turn"}
    m = _RE_TURN_DEG.match(s)
    if m:
        return {"direction": "-", "magnitude": m.group(1),
                "unit": "deg", "kind": "turn"}
    m = _RE_TURN_STEP_DIR.match(s)
    if m:
        return {"direction": m.group(1), "magnitude": m.group(2),
                "unit": "step", "kind": "turn"}
    m = _RE_TURN_DIR.match(s)
    if m:
        return {"direction": m.group(1), "magnitude": "-",
                "unit": "-", "kind": "turn"}
    m = _RE_WALK_STEP.match(s)
    if m:
        d = (m.group(1) or "forward").lower()
        if d.startswith("back"): d = "backward"
        return {"direction": d, "magnitude": m.group(2),
                "unit": "step", "kind": "walk"}
    m = _RE_WALK_M.match(s)
    if m:
        d = (m.group(1) or "forward").lower()
        if d.startswith("back"): d = "backward"
        return {"direction": d, "magnitude": m.group(2),
                "unit": "m", "kind": "walk"}
    m = _RE_MOVE_DIR.match(s)
    if m:
        d = m.group(1).lower()
        if d.startswith("back"): d = "backward"
        return {"direction": d, "magnitude": "-",
                "unit": "-", "kind": "move"}
    m = _RE_WALK_TO.match(s)
    if m:
        return {"direction": "tracked", "magnitude": m.group(1).strip(),
                "unit": "target", "kind": "walk_to"}
    # Bare canonicals
    if s == "stop":          return {"direction": "-", "magnitude": "-", "unit": "-", "kind": "stop"}
    if s == "sit down":      return {"direction": "-", "magnitude": "-", "unit": "-", "kind": "posture"}
    if s == "stand up":      return {"direction": "-", "magnitude": "-", "unit": "-", "kind": "posture"}
    if s == "wave hello":    return {"direction": "-", "magnitude": "-", "unit": "-", "kind": "arm"}
    if s == "raise arm":     return {"direction": "right", "magnitude": "-", "unit": "-", "kind": "arm"}
    if s == "lower arm":     return {"direction": "right", "magnitude": "-", "unit": "-", "kind": "arm"}
    if s == "come here":     return {"direction": "tracked", "magnitude": "-", "unit": "yolo", "kind": "come"}
    if s == "follow me":     return {"direction": "tracked", "magnitude": "-", "unit": "yolo", "kind": "follow"}
    if s == "look around":   return {"direction": "scan", "magnitude": "-", "unit": "-", "kind": "look"}
    if s == "stay here":     return {"direction": "-", "magnitude": "-", "unit": "-", "kind": "stay"}
    if s == "go home":       return {"direction": "home", "magnitude": "-", "unit": "-", "kind": "nav"}
    if s == "patrol":        return {"direction": "scan", "magnitude": "-", "unit": "-", "kind": "patrol"}
    if s == "pause motion":  return {"direction": "-", "magnitude": "-", "unit": "-", "kind": "pause"}
    if s == "resume motion": return {"direction": "-", "magnitude": "-", "unit": "-", "kind": "resume"}
    return out
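# Worked examples (illustrative only, not part of the module) — outputs
# derived from the regexes and bare-canonical table above:
#   parse_canonical("turn left 19 degrees")
#     -> {"direction": "left", "magnitude": "19", "unit": "deg", "kind": "turn"}
#   parse_canonical("walk backward 4 steps")
#     -> {"direction": "backward", "magnitude": "4", "unit": "step", "kind": "walk"}
#   parse_canonical("come here")
#     -> {"direction": "tracked", "magnitude": "-", "unit": "yolo", "kind": "come"}
#   parse_canonical("do a flip")
#     -> {"direction": "-", "magnitude": "-", "unit": "-", "kind": "other"}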
try:
    from Core.env_loader import PROJECT_ROOT  # type: ignore
except Exception:
    PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

_LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(_LOG_DIR, exist_ok=True)
_LOG_PATH = os.path.join(_LOG_DIR, "motion.log")

_logger = logging.getLogger("marcus.motion")
_logger.setLevel(logging.INFO)
_logger.propagate = False
if not _logger.handlers:
    _h = RotatingFileHandler(
        _LOG_PATH, maxBytes=2_000_000, backupCount=3, encoding="utf-8",
    )
    # Use pipe-separator format so the file is grep/awk friendly.
    _h.setFormatter(logging.Formatter("%(asctime)s | %(message)s"))
    _logger.addHandler(_h)
def log_motion(event: str,
               canonical: str,
               direction=None,
               magnitude=None,
               unit=None,
               target_sec=None,
               actual_sec=None,
               source: str = "voice",
               **extra) -> None:
    """Emit one structured line describing a motion event.
    direction/magnitude/unit auto-fill from parse_canonical(canonical)
    if not passed, so the worker can just call log_motion('start', cmd)
    and let the parser handle the rest. Override any field by passing it
    explicitly (useful when the brain knows odom-measured actuals).
    Fields render with fixed width for grep/awk friendliness. Extra
    kwargs become trailing 'k=v' tokens.
    """
    def _f(v):
        if v is None: return "-"
        if isinstance(v, float): return "{:.2f}".format(v)
        return str(v)
    canonical = (canonical or "").strip() or "-"
    parsed = parse_canonical(canonical) if canonical != "-" else {}
    direction = direction if direction is not None else parsed.get("direction", "-")
    magnitude = magnitude if magnitude is not None else parsed.get("magnitude", "-")
    unit = unit if unit is not None else parsed.get("unit", "-")
    parts = [
        "{:35s}".format(canonical[:35]),
        "dir={:8s}".format(_f(direction)[:8]),
        "mag={:8s}".format(_f(magnitude)[:8]),
        "unit={:6s}".format(_f(unit)[:6]),
        "target={:>7s}".format(_f(target_sec) + "s" if target_sec is not None else "-"),
        "actual={:>7s}".format(_f(actual_sec) + "s" if actual_sec is not None else "-"),
        "{:11s}".format(event[:11]),
        "{:5s}".format(source[:5]),
    ]
    if extra:
        parts.append(" ".join("{}={}".format(k, _f(v)) for k, v in extra.items()))
    _logger.info(" | ".join(parts))
# Convenience tee so console + log show the same one-liner
def print_motion(event: str, canonical: str, **kw) -> None:
    """Like log_motion but ALSO prints to stdout so the operator sees
    the line in the terminal dashboard. Use for the high-traffic
    'start' and 'complete' events; use log_motion alone for the noisier
    intermediate events."""
    log_motion(event, canonical, **kw)
    direction = kw.get("direction", "-")
    magnitude = kw.get("magnitude", "-")
    unit = kw.get("unit", "-")
    target = kw.get("target_sec")
    actual = kw.get("actual_sec")
    src = kw.get("source", "voice")
    line = " [Motion {:9s}] {:30s} dir={} mag={} unit={}".format(
        event, canonical[:30], direction, magnitude, unit,
    )
    if target is not None:
        line += " target={:.2f}s".format(float(target))
    if actual is not None:
        line += " actual={:.2f}s".format(float(actual))
    print(line)
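How a brain handler would typically bracket a motion with this helper (a sketch: the handler and do_motion call are hypothetical, the log_motion signature is the one above):

    import time
    from Core.motion_log import log_motion

    def handle_turn(cmd: str, target_sec: float, do_motion):
        log_motion("start", cmd, target_sec=target_sec, source="voice")
        t0 = time.time()
        try:
            do_motion(target_sec)   # issue motor commands for target_sec
            log_motion("complete", cmd, target_sec=target_sec,
                       actual_sec=time.time() - t0, source="voice")
        except Exception as e:
            log_motion("error", cmd, target_sec=target_sec, reason=str(e))
            raise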
46
Core/motion_state.py Normal file
View File
@ -0,0 +1,46 @@
"""
motion_state.py shared motion-control signals.
Three module-level threading.Events used by Voice/marcus_voice.py to
control in-progress motion. Producers are VoiceModule's motion worker;
consumers are brain motion loops in Brain/command_parser.py,
Brain/executor.py, Navigation/marcus_odometry.py, Navigation/patrol.py.
motion_abort hard stop. Loops break out and run gradual_stop().
Set by 'stop'/'cancel'/etc. Cleared by the worker
at the start of each new non-stop command, so a
stale set never kills the next motion.
motion_pause soft hold. Loops do NOT break; they hold zero
velocity and wait for the event to clear, then
resume the same loop iteration where they paused.
While paused they ALSO check motion_abort each
~50ms a pause-then-stop combo cleanly aborts.
The helper `wait_while_paused` centralizes the polling loop so brain
sites don't repeat the same stanza:
if motion_abort.is_set(): break
wait_while_paused()
if motion_abort.is_set(): break
send_vel(...)
time.sleep(0.05)
Single source of truth placed in Core/ so both Voice/* and Brain/*
can import without circular deps.
"""
from __future__ import annotations

import threading
import time

motion_abort: threading.Event = threading.Event()
motion_pause: threading.Event = threading.Event()


def wait_while_paused(poll_sec: float = 0.05) -> None:
    """Block while motion_pause is set. Returns when pause clears OR
    motion_abort fires (so the caller can immediately bail out via
    its existing motion_abort check)."""
    while motion_pause.is_set() and not motion_abort.is_set():
        time.sleep(poll_sec)
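A concrete motion loop wired to these signals, following the stanza from the docstring (send_vel and gradual_stop stand in for the real velocity API):

    import time
    from Core.motion_state import motion_abort, wait_while_paused

    def run_motion(total_sec: float, send_vel, gradual_stop) -> None:
        """Drive until total_sec elapses, honoring pause (hold) and abort (break)."""
        t_end = time.time() + total_sec
        while time.time() < t_end:
            if motion_abort.is_set():
                break                    # hard stop requested
            wait_while_paused()          # soft hold until resume or abort
            if motion_abort.is_set():
                break
            send_vel(0.3, 0.0)           # keep streaming velocity
            time.sleep(0.05)
        gradual_stop()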
View File
@ -1,20 +0,0 @@
[
{
"time": "16:42:30",
"cmd": "hi",
"response": "Hello!",
"duration_s": 5.98
},
{
"time": "16:43:00",
"cmd": "go back two step",
"response": "Going back two steps",
"duration_s": 10.01
},
{
"time": "16:43:23",
"cmd": "now com",
"response": "Coming to you",
"duration_s": 8.11
}
]
View File
@ -1,66 +0,0 @@
[
{
"time": "16:42:30",
"class": "mouse",
"position": "left",
"distance": "far",
"x": 0.0,
"y": 0.0
},
{
"time": "16:42:30",
"class": "chair",
"position": "center",
"distance": "far",
"x": 0.0,
"y": 0.0
},
{
"time": "16:42:30",
"class": "laptop",
"position": "left",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "16:42:30",
"class": "person",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:43:00",
"class": "laptop",
"position": "left",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:43:00",
"class": "person",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:43:23",
"class": "chair",
"position": "center",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "16:43:23",
"class": "laptop",
"position": "left",
"distance": "medium",
"x": 0.0,
"y": 0.0
}
]
View File
@ -1,14 +0,0 @@
[
{
"time": "16:44:40",
"cmd": "what do you see",
"response": "I see a person sitting at a desk in an office environment. The individual appears to be engaged with a computer screen and is wearing a red shirt.",
"duration_s": 8.01
},
{
"time": "16:45:06",
"cmd": "that is me the one whos is programming you",
"response": "That is me, the one who's programming you",
"duration_s": 6.71
}
]
View File
@ -1,42 +0,0 @@
[
{
"time": "16:44:40",
"class": "person",
"position": "left",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:45:06",
"class": "mouse",
"position": "left",
"distance": "far",
"x": 0.0,
"y": 0.0
},
{
"time": "16:45:06",
"class": "chair",
"position": "center",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "16:45:06",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:45:06",
"class": "laptop",
"position": "left",
"distance": "medium",
"x": 0.0,
"y": 0.0
}
]
View File
@ -0,0 +1,434 @@
[
{
"time": "14:39:02",
"cmd": "turn around",
"response": "Turning left",
"duration_s": 4.94
},
{
"time": "14:40:32",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:40:53",
"cmd": "move backward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:41:17",
"cmd": "walk backward 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:41:20",
"cmd": "turn left",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:41:23",
"cmd": "walk forward 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:41:26",
"cmd": "walk 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:41:29",
"cmd": "turn left",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:41:32",
"cmd": "walk forward 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:41:35",
"cmd": "walk forward 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:41:39",
"cmd": "walk 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:41:42",
"cmd": "walk forward 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:42:22",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:42:47",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:43:08",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:43:24",
"cmd": "turn right",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:43:37",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:43:40",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:44:02",
"cmd": "move backward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:44:08",
"cmd": "turn right 90 degrees",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:44:15",
"cmd": "turn right 90 degrees",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:44:21",
"cmd": "turn right 90 degrees",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:44:47",
"cmd": "turn left 90 degrees",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:45:29",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:45:48",
"cmd": "turn left 90 degrees",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:46:25",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:46:28",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:46:32",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:46:43",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:46:46",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:47:36",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:47:49",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:48:16",
"cmd": "turn around",
"response": "Turning left",
"duration_s": 17.53
},
{
"time": "14:48:52",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:49:39",
"cmd": "walk forward 1 meters",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:49:43",
"cmd": "walk forward 1 meters",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:49:47",
"cmd": "walk forward 1 meters",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:49:52",
"cmd": "walk forward 1 meters",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:50:07",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:51:17",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:51:41",
"cmd": "turn right",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:52:13",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:52:50",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:53:15",
"cmd": "turn right",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:54:05",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:56:07",
"cmd": "go back o",
"response": "Turning left",
"duration_s": 15.61
},
{
"time": "14:56:59",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:57:00",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:57:04",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:57:33",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:57:37",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:57:40",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:57:45",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:58:00",
"cmd": "move backward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:58:05",
"cmd": "move backward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:58:23",
"cmd": "turn left",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:58:26",
"cmd": "turn left",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:58:30",
"cmd": "turn left",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:58:33",
"cmd": "turn left",
"response": "local command",
"duration_s": 0.0
},
{
"time": "15:00:23",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "15:00:25",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "15:01:58",
"cmd": "follow me",
"response": "local command",
"duration_s": 0.0
},
{
"time": "15:02:03",
"cmd": "walk 2 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "15:02:07",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "15:02:18",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "15:02:25",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "15:02:28",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "15:02:53",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "15:04:40",
"cmd": "turn left 90 degrees",
"response": "local command",
"duration_s": 0.0
},
{
"time": "15:04:46",
"cmd": "turn left 90 degrees",
"response": "local command",
"duration_s": 0.0
},
{
"time": "15:05:04",
"cmd": "turn right 90 degrees",
"response": "local command",
"duration_s": 0.0
},
{
"time": "15:05:10",
"cmd": "turn right 90 degrees",
"response": "local command",
"duration_s": 0.0
}
]
View File
@ -1,14 +0,0 @@
[
{
"time": "16:46:46",
"cmd": "hi",
"response": "Error.",
"duration_s": 0.0
},
{
"time": "16:46:49",
"cmd": "hiii",
"response": "Error.",
"duration_s": 0.0
}
]
View File
@ -1,34 +0,0 @@
[
{
"time": "16:46:46",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:46:46",
"class": "chair",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:46:46",
"class": "laptop",
"position": "left",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "16:46:49",
"class": "mouse",
"position": "left",
"distance": "far",
"x": 0.0,
"y": 0.0
}
]
View File
@ -0,0 +1,74 @@
[
{
"time": "16:40:07",
"cmd": "walk forward 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "16:40:22",
"cmd": "turn left 19 degrees",
"response": "local command",
"duration_s": 0.0
},
{
"time": "16:40:37",
"cmd": "walk backward 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "16:40:55",
"cmd": "walk forward 4 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "16:41:05",
"cmd": "walk backward 4 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "16:41:47",
"cmd": "come here",
"response": "Coming to you",
"duration_s": 0.0
},
{
"time": "16:42:02",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "16:42:18",
"cmd": "come here",
"response": "Coming to you",
"duration_s": 0.0
},
{
"time": "16:42:44",
"cmd": "walk 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "16:43:05",
"cmd": "walk backward 3 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "16:43:06",
"cmd": "turn right 0 degrees",
"response": "local command",
"duration_s": 0.0
},
{
"time": "16:43:49",
"cmd": "turn left",
"response": "local command",
"duration_s": 0.0
}
]
View File
@ -1,32 +0,0 @@
[
{
"time": "16:48:15",
"cmd": "hii",
"response": "Greeting",
"duration_s": 7.93
},
{
"time": "16:48:32",
"cmd": "what do you see",
"response": "I see a person sitting at a desk in an office environment. The person appears to be working on a computer.",
"duration_s": 7.64
},
{
"time": "16:48:58",
"cmd": "that person is the guy who is programming you",
"response": "That person is the one who programmed me",
"duration_s": 6.76
},
{
"time": "16:49:18",
"cmd": "who is that person",
"response": "I don't have enough information to identify the person in the image.",
"duration_s": 7.1
},
{
"time": "16:50:03",
"cmd": "example/",
"response": "local command",
"duration_s": 0.0
}
]
View File
@ -1,34 +0,0 @@
[
{
"time": "16:48:15",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:48:32",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:48:58",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:49:18",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
}
]
View File
@ -0,0 +1,8 @@
[
{
"time": "09:02:24",
"cmd": "turn left",
"response": "local command",
"duration_s": 0.0
}
]
View File
@ -1,44 +0,0 @@
[
{
"time": "16:51:08",
"cmd": "hiii",
"response": "Greeting",
"duration_s": 7.97
},
{
"time": "16:51:23",
"cmd": "hi",
"response": "Greeting",
"duration_s": 6.16
},
{
"time": "16:51:51",
"cmd": "my name is Kassam",
"response": "My name is Kassam",
"duration_s": 6.48
},
{
"time": "16:51:59",
"cmd": "that person is Kassam the programmer",
"response": "That person is Kassam the programmer",
"duration_s": 6.74
},
{
"time": "16:52:07",
"cmd": "who is that person",
"response": "That is a man",
"duration_s": 6.42
},
{
"time": "16:52:47",
"cmd": "hiigo back 1 step",
"response": "Backing up one step",
"duration_s": 8.51
},
{
"time": "16:53:24",
"cmd": "what can you do",
"response": "I am a humanoid robot and I can perform various tasks such as walking, talking, and interacting with objects. However, in this image, I see a person sitting at a desk using a computer. If you have any specific questions or requests for me to perform, please let me know!",
"duration_s": 10.6
}
]
View File
@ -1,146 +0,0 @@
[
{
"time": "16:51:08",
"class": "chair",
"position": "center",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "16:51:08",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:51:23",
"class": "chair",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:51:23",
"class": "person",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:51:23",
"class": "laptop",
"position": "left",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "16:51:51",
"class": "chair",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:51:51",
"class": "person",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:51:51",
"class": "laptop",
"position": "left",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "16:51:59",
"class": "chair",
"position": "center",
"distance": "far",
"x": 0.0,
"y": 0.0
},
{
"time": "16:51:59",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:51:59",
"class": "laptop",
"position": "left",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "16:52:07",
"class": "chair",
"position": "center",
"distance": "far",
"x": 0.0,
"y": 0.0
},
{
"time": "16:52:07",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:52:47",
"class": "chair",
"position": "center",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "16:52:47",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:52:47",
"class": "cup",
"position": "left",
"distance": "far",
"x": 0.0,
"y": 0.0
},
{
"time": "16:53:24",
"class": "chair",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:53:24",
"class": "person",
"position": "left",
"distance": "very close",
"x": 0.0,
"y": 0.0
}
]
View File
@ -0,0 +1,56 @@
[
{
"time": "09:14:00",
"cmd": "turn left",
"response": "local command",
"duration_s": 0.0
},
{
"time": "09:14:04",
"cmd": "walk forward 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "09:15:07",
"cmd": "move backward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "09:16:07",
"cmd": "turn left 90 degrees",
"response": "local command",
"duration_s": 0.0
},
{
"time": "09:17:20",
"cmd": "walk forward 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "09:17:40",
"cmd": "walk back 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "09:18:27",
"cmd": "come here",
"response": "Coming to you",
"duration_s": 0.0
},
{
"time": "09:18:40",
"cmd": "walk forward 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "09:19:55",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
}
]
View File
@ -0,0 +1,26 @@
[
{
"time": "09:40:17",
"cmd": "turn left",
"response": "local command",
"duration_s": 0.0
},
{
"time": "09:40:20",
"cmd": "walk forward 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "09:40:37",
"cmd": "turn left",
"response": "local command",
"duration_s": 0.0
},
{
"time": "09:40:40",
"cmd": "walk 1 steps",
"response": "local command",
"duration_s": 0.0
}
]
View File
@ -0,0 +1,20 @@
[
{
"time": "09:42:11",
"cmd": "turn around",
"response": "local command",
"duration_s": 0.0
},
{
"time": "09:42:20",
"cmd": "look around",
"response": "local command",
"duration_s": 0.0
},
{
"time": "09:42:34",
"cmd": "turn around",
"response": "local command",
"duration_s": 0.0
}
]
View File
@ -1,44 +0,0 @@
[
{
"time": "17:43:52",
"cmd": "hi",
"response": "Hi",
"duration_s": 3.23
},
{
"time": "17:44:32",
"cmd": "step one step back",
"response": "Step one step back",
"duration_s": 3.44
},
{
"time": "17:44:43",
"cmd": "what do you see",
"response": "I see a person sitting at a desk with a laptop in front of them.",
"duration_s": 3.93
},
{
"time": "17:44:52",
"cmd": "hii",
"response": "Hi",
"duration_s": 3.31
},
{
"time": "17:45:27",
"cmd": "one step back",
"response": "One step back",
"duration_s": 3.46
},
{
"time": "17:45:38",
"cmd": "one more",
"response": "One more",
"duration_s": 3.43
},
{
"time": "17:46:09",
"cmd": "two step back",
"response": "Moving back two steps",
"duration_s": 3.51
}
]
View File
@ -1,138 +0,0 @@
[
{
"time": "17:43:52",
"class": "chair",
"position": "center",
"distance": "far",
"x": 0.0,
"y": 0.0
},
{
"time": "17:43:52",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:43:52",
"class": "laptop",
"position": "left",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "17:43:52",
"class": "cup",
"position": "left",
"distance": "far",
"x": 0.0,
"y": 0.0
},
{
"time": "17:44:32",
"class": "person",
"position": "right",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:44:32",
"class": "laptop",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:44:43",
"class": "person",
"position": "right",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:44:52",
"class": "chair",
"position": "right",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "17:44:52",
"class": "person",
"position": "right",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:45:27",
"class": "person",
"position": "right",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:45:27",
"class": "laptop",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:45:38",
"class": "chair",
"position": "right",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "17:45:38",
"class": "person",
"position": "right",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:45:38",
"class": "laptop",
"position": "left",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:46:09",
"class": "chair",
"position": "right",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:46:09",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:46:09",
"class": "laptop",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
}
]
View File
@ -0,0 +1,26 @@
[
{
"time": "09:49:11",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "09:49:39",
"cmd": "turn right",
"response": "local command",
"duration_s": 0.0
},
{
"time": "09:50:07",
"cmd": "turn right",
"response": "local command",
"duration_s": 0.0
},
{
"time": "09:50:28",
"cmd": "turn right 100 degrees",
"response": "local command",
"duration_s": 0.0
}
]
View File
@ -1,14 +0,0 @@
[
{
"time": "17:46:37",
"cmd": "hii",
"response": "Hii",
"duration_s": 3.27
},
{
"time": "17:46:54",
"cmd": "why did you moved",
"response": "I moved because I was instructed to do so.",
"duration_s": 3.71
}
]
View File
@ -1,58 +0,0 @@
[
{
"time": "17:46:37",
"class": "chair",
"position": "right",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "17:46:37",
"class": "person",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:46:37",
"class": "cup",
"position": "left",
"distance": "far",
"x": 0.0,
"y": 0.0
},
{
"time": "17:46:54",
"class": "mouse",
"position": "center",
"distance": "far",
"x": 0.0,
"y": 0.0
},
{
"time": "17:46:54",
"class": "person",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:46:54",
"class": "chair",
"position": "right",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:46:54",
"class": "laptop",
"position": "left",
"distance": "close",
"x": 0.0,
"y": 0.0
}
]
View File
@ -0,0 +1,8 @@
[
{
"time": "11:24:20",
"cmd": "turn right",
"response": "local command",
"duration_s": 0.0
}
]
View File
@ -1,56 +0,0 @@
[
{
"time": "17:49:16",
"cmd": "hii",
"response": "Hello! I am Marcus. How can I help you?",
"duration_s": 0.0
},
{
"time": "17:49:19",
"cmd": "hi",
"response": "Hello! I am Marcus. How can I help you?",
"duration_s": 0.0
},
{
"time": "17:49:20",
"cmd": "Hi",
"response": "Hello! I am Marcus. How can I help you?",
"duration_s": 0.0
},
{
"time": "17:49:37",
"cmd": "move one step back",
"response": "Moving back",
"duration_s": 3.47
},
{
"time": "17:49:50",
"cmd": "one tiny step back",
"response": "Moving back",
"duration_s": 3.44
},
{
"time": "17:50:05",
"cmd": "now come back to me",
"response": "Coming back to you",
"duration_s": 3.53
},
{
"time": "17:50:36",
"cmd": "turn right",
"response": "Turning right",
"duration_s": 3.4
},
{
"time": "17:51:01",
"cmd": "come back to me",
"response": "Coming back to you",
"duration_s": 3.5
},
{
"time": "17:51:22",
"cmd": "turn right 90deg and then back two steps",
"response": "Turning right",
"duration_s": 3.43
}
]
View File
@ -1,106 +0,0 @@
[
{
"time": "17:49:37",
"class": "chair",
"position": "right",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "17:49:37",
"class": "laptop",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:49:37",
"class": "person",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:49:50",
"class": "laptop",
"position": "left",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:49:50",
"class": "person",
"position": "right",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:50:05",
"class": "chair",
"position": "right",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "17:50:05",
"class": "laptop",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:50:05",
"class": "person",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:50:36",
"class": "chair",
"position": "center",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "17:50:36",
"class": "person",
"position": "left",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:51:01",
"class": "laptop",
"position": "left",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:51:01",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:51:22",
"class": "person",
"position": "left",
"distance": "close",
"x": 0.0,
"y": 0.0
}
]
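
The YOLO logs are likewise flat JSON arrays in which several records can share one "time" stamp; those records describe detections from the same frame. A small sketch that regroups detections per frame, assuming the log is already in chronological order (as all the samples in this diff are); the path is hypothetical:

import json
from itertools import groupby

def detections_by_frame(path: str) -> dict:
    """Regroup flat YOLO records into per-frame lists keyed by "time"."""
    with open(path, encoding="utf-8") as f:
        records = json.load(f)
    # groupby only merges adjacent records, which is sufficient here
    # because the log is appended in chronological order.
    return {
        stamp: [(r["class"], r["position"], r["distance"]) for r in group]
        for stamp, group in groupby(records, key=lambda r: r["time"])
    }

# Usage (hypothetical path):
# frames = detections_by_frame("Logs/session_011/yolo.json")
# frames.get("17:49:37")  # -> [("chair", "right", "medium"), ...]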


@ -0,0 +1,248 @@
[
{
"time": "11:31:56",
"cmd": "walk 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:32:29",
"cmd": "move backward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:33:19",
"cmd": "walk forward 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:33:31",
"cmd": "move backward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:34:22",
"cmd": "come here",
"response": "Coming to you",
"duration_s": 0.0
},
{
"time": "11:34:57",
"cmd": "turn left",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:35:02",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:35:28",
"cmd": "turn left",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:35:37",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:36:13",
"cmd": "move backward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:36:28",
"cmd": "turn left",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:36:47",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:36:50",
"cmd": "turn right",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:37:09",
"cmd": "move backward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:37:12",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:37:15",
"cmd": "walk forward 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:37:26",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:37:37",
"cmd": "move backward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:37:52",
"cmd": "come here",
"response": "Coming to you",
"duration_s": 0.0
},
{
"time": "11:37:56",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:39:06",
"cmd": "come here",
"response": "Coming to you",
"duration_s": 0.0
},
{
"time": "11:39:33",
"cmd": "move backward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:40:00",
"cmd": "come here",
"response": "Coming to you",
"duration_s": 0.0
},
{
"time": "11:40:53",
"cmd": "stay here",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:41:15",
"cmd": "come here",
"response": "Coming to you",
"duration_s": 0.0
},
{
"time": "11:42:27",
"cmd": "walk 2 meters",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:42:50",
"cmd": "turn 90 degrees",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:43:22",
"cmd": "turn 360 degrees",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:43:42",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:44:15",
"cmd": "come here",
"response": "Coming to you",
"duration_s": 0.0
},
{
"time": "11:44:57",
"cmd": "walk forward 10 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:44:58",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:45:18",
"cmd": "walk backward 5 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:45:30",
"cmd": "turn right",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:45:56",
"cmd": "walk back 3 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:45:59",
"cmd": "turn right",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:46:19",
"cmd": "walk to person",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:47:19",
"cmd": "move backward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:48:05",
"cmd": "walk forward 5 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:48:08",
"cmd": "turn right 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "11:48:13",
"cmd": "turn left 2 steps",
"response": "local command",
"duration_s": 0.0
}
]


@ -1,26 +0,0 @@
[
{
"time": "17:53:36",
"cmd": "turn right 90deg and then move back 3 steps",
"response": "Turned right 90 degrees then moved backward",
"duration_s": 0.0
},
{
"time": "17:54:21",
"cmd": "keep turning left unitl you see a guy holding a phone",
"response": "Turning left",
"duration_s": 3.72
},
{
"time": "17:55:33",
"cmd": "goal/ keep turning left unitl you see a guy holding a phone",
"response": "Goal navigation: keep turning left unitl you see a guy holding a phone",
"duration_s": 5.47
},
{
"time": "17:55:53",
"cmd": "describe what he is doing",
"response": "Turning left",
"duration_s": 3.73
}
]


@ -1,50 +0,0 @@
[
{
"time": "17:54:21",
"class": "suitcase",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:54:21",
"class": "backpack",
"position": "center",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "17:55:32",
"class": "person",
"position": "left",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:55:53",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:55:53",
"class": "chair",
"position": "center",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "17:55:53",
"class": "laptop",
"position": "left",
"distance": "close",
"x": 0.0,
"y": 0.0
}
]


@ -0,0 +1,236 @@
[
{
"time": "12:54:29",
"cmd": "move backward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "12:54:46",
"cmd": "turn left",
"response": "local command",
"duration_s": 0.0
},
{
"time": "12:55:28",
"cmd": "walk to حقيبة",
"response": "local command",
"duration_s": 0.0
},
{
"time": "12:56:36",
"cmd": "walk forward 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "12:56:39",
"cmd": "turn left 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "12:56:59",
"cmd": "walk backward 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "12:57:59",
"cmd": "turn left",
"response": "local command",
"duration_s": 0.0
},
{
"time": "12:58:07",
"cmd": "turn left",
"response": "local command",
"duration_s": 0.0
},
{
"time": "12:58:22",
"cmd": "come here",
"response": "Coming to you",
"duration_s": 0.0
},
{
"time": "12:59:14",
"cmd": "turn left",
"response": "local command",
"duration_s": 0.0
},
{
"time": "12:59:17",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "12:59:30",
"cmd": "move backward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "12:59:58",
"cmd": "turn left 90 degrees",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:00:41",
"cmd": "come here",
"response": "Coming to you",
"duration_s": 0.0
},
{
"time": "13:01:52",
"cmd": "walk forward 5 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:02:09",
"cmd": "walk back 2 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:02:52",
"cmd": "walk 5 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:02:57",
"cmd": "walk forward 2 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:03:04",
"cmd": "turn left 3 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:03:07",
"cmd": "turn right 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:03:51",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:03:56",
"cmd": "walk forward 2 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:03:59",
"cmd": "move backward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:04:27",
"cmd": "walk backward 3 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:04:32",
"cmd": "walk forward 2 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:05:19",
"cmd": "walk backward 10 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:05:30",
"cmd": "walk forward 5 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:05:34",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:06:27",
"cmd": "turn right 180 degrees",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:06:30",
"cmd": "move forward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:06:33",
"cmd": "move backward",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:07:15",
"cmd": "come here",
"response": "Coming to you",
"duration_s": 0.0
},
{
"time": "13:08:52",
"cmd": "turn right 90 degrees",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:08:55",
"cmd": "walk 1 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:08:58",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:10:28",
"cmd": "walk backward 20 steps",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:10:29",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:11:41",
"cmd": "stop",
"response": "local command",
"duration_s": 0.0
},
{
"time": "13:11:57",
"cmd": "come here",
"response": "Coming to you",
"duration_s": 0.0
}
]


@ -1,32 +0,0 @@
[
{
"time": "17:57:56",
"cmd": "turn right 90 deg",
"response": "local command",
"duration_s": 0.0
},
{
"time": "17:58:22",
"cmd": "keep turning left until you see a guy holding a phone and describe what he is doing and his office",
"response": "Turning left",
"duration_s": 3.68
},
{
"time": "17:58:55",
"cmd": "goal/ keep turning left until you see a guy holding a phone and describe what he is doing and his office",
"response": "Goal navigation: keep turning left until you see a guy holding a phone and describe what he is doing and his office",
"duration_s": 25.79
},
{
"time": "17:59:13",
"cmd": "keep turning left until you see a guy holding a phone and describe what he is doing and his office",
"response": "Turning left",
"duration_s": 3.73
},
{
"time": "17:59:40",
"cmd": "goal/keep turning left until you see a guy holding a phone and describe what he is doing and his office",
"response": "Goal navigation: keep turning left until you see a guy holding a phone and describe what he is doing and his office",
"duration_s": 23.61
}
]


@ -1,18 +0,0 @@
[
{
"time": "17:58:22",
"class": "suitcase",
"position": "right",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "17:58:22",
"class": "backpack",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
}
]


@ -1,26 +0,0 @@
[
{
"time": "18:09:23",
"cmd": "what do you see",
"response": "I see a person sitting at a desk with a computer and a monitor in front of them.",
"duration_s": 4.48
},
{
"time": "18:09:47",
"cmd": "what do you see",
"response": "I see a person sitting at a desk with a computer and a monitor in front of them.",
"duration_s": 4.52
},
{
"time": "18:10:17",
"cmd": "example/",
"response": "local command",
"duration_s": 0.0
},
{
"time": "18:10:41",
"cmd": "is there anyone in the room",
"response": "I see a person sitting at a desk with a computer and a monitor in front of them.",
"duration_s": 4.57
}
]


@ -1,58 +0,0 @@
[
{
"time": "18:09:23",
"class": "mouse",
"position": "center",
"distance": "far",
"x": 0.0,
"y": 0.0
},
{
"time": "18:09:23",
"class": "chair",
"position": "right",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "18:09:23",
"class": "person",
"position": "right",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "18:09:23",
"class": "laptop",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "18:09:47",
"class": "person",
"position": "right",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "18:10:41",
"class": "person",
"position": "right",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "18:10:41",
"class": "laptop",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
}
]


@ -1,13 +0,0 @@
Session: session_014_2026-04-05
Date: 2026-04-05 18:10
Duration: 4m 48s
Commands: 4
YOLO detections: 7
Alerts: 0
Known places: none
First commands:
- what do you see
- what do you see
- example/
- is there anyone in the room
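
The deleted session summary above is plain text derived from the two logs: command count, YOLO detection count, and the first few commands. A sketch of how such a summary could be rendered from already-loaded records; the "Alerts" and "Known places" lines are placeholders here, since the tracking behind them is not shown in this diff:

def render_session_summary(name, date, duration, cmds, detections):
    """Render a plain-text session summary like the file above."""
    lines = [
        f"Session: {name}",
        f"Date: {date}",
        f"Duration: {duration}",
        f"Commands: {len(cmds)}",
        f"YOLO detections: {len(detections)}",
        "Alerts: 0",           # alert tracking not shown in this diff
        "Known places: none",  # place memory not shown in this diff
        "First commands:",
    ] + [f"- {e['cmd']}" for e in cmds[:4]]
    return "\n".join(lines)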


@ -1,26 +0,0 @@
[
{
"time": "18:12:31",
"cmd": "example/",
"response": "local command",
"duration_s": 0.0
},
{
"time": "18:12:49",
"cmd": "goal/ keep turning left until you see a guy",
"response": "Goal navigation: keep turning left until you see a guy",
"duration_s": 5.22
},
{
"time": "18:13:04",
"cmd": "goal/ keep turning left until you see a guy",
"response": "Goal navigation: keep turning left until you see a guy",
"duration_s": 3.22
},
{
"time": "18:13:25",
"cmd": "what do you see",
"response": "I see a man sitting in a chair, talking on a phone. There is a cardboard box and a backpack on a table next to him.",
"duration_s": 0.0
}
]


@ -1,18 +0,0 @@
[
{
"time": "18:12:48",
"class": "person",
"position": "left",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "18:13:03",
"class": "person",
"position": "left",
"distance": "close",
"x": 0.0,
"y": 0.0
}
]


@ -1,8 +0,0 @@
[
{
"time": "13:33:49",
"cmd": "hi",
"response": "Hello! I am Marcus. How can I help you?",
"duration_s": 0.0
}
]


@ -1,8 +0,0 @@
[
{
"time": "13:35:59",
"cmd": "hi",
"response": "Hello! I am Marcus. How can I help you?",
"duration_s": 0.0
}
]

Some files were not shown because too many files have changed in this diff.