diff --git a/Brain/command_parser.py b/Brain/command_parser.py index c25a462..ee74d8a 100644 --- a/Brain/command_parser.py +++ b/Brain/command_parser.py @@ -3,13 +3,16 @@ command_parser.py — Local command regex patterns + dispatcher Handles place memory, odometry, session recall, help, examples """ import re +import threading import time from API.zmq_api import send_vel, gradual_stop, MOVE_MAP, STEP_DURATION_SEC from API.memory_api import mem, place_save, place_goto, places_list_str from API.odometry_api import odom, ODOM_AVAILABLE from API.camera_api import get_frame -from API.llava_api import ask +from API.llava_api import ask, call_llava, parse_json from Brain.executor import execute +from Core.motion_state import motion_abort, wait_while_paused +from Core.motion_log import log_motion, print_motion # ── Compiled patterns ──────────────────────────────────────────────────────── @@ -33,9 +36,9 @@ _RE_TURN_DEG = re.compile( # that for a trivial two-second motion. One step = 2 s of motion at the default # velocity, matching the undo-loop duration already used below. _RE_WALK_STEP = re.compile( - r"^(?:walk|go|move|step)(?:\s+(forward|back(?:ward)?))?\s+(\d+)\s*steps?$", re.I) + r"^(?:walk|go|move|step)(?:\s+(forward|back(?:ward)?))?\s+(\d+(?:\.\d+)?)\s*steps?$", re.I) _RE_TURN_STEP = re.compile( - r"^turn\s+(left|right)(?:\s+(\d+)\s*steps?)?$", re.I) + r"^turn\s+(left|right)(?:\s+(\d+(?:\.\d+)?)\s*steps?)?$", re.I) # Simple one-shot motion — one word or verb+direction, no counts/units. # All default to a ~2 s motion at the normal velocity. Kept local so the @@ -79,6 +82,106 @@ _RE_SESSION_SUMMARY = re.compile( _RE_AUTO = re.compile( r"^auto(?:nomous)?\s+(on|off|status|save|summary)$", re.I) +# ── DIRECT-MOTION CANONICALS ───────────────────────────────────────────── +# Canonicals that the dispatcher emits but were previously falling through +# to the LLaVA/Qwen fallback (5–30s blocking call on Jetson). Each maps +# to a fast, deterministic motion or arm gesture. 
See Bug #3 in the +# 14-bug audit. +_RE_TURN_AROUND = re.compile(r"^turn[\s_-]+around$", re.I) +_RE_SIT_DOWN = re.compile(r"^sit[\s_-]+down$", re.I) +_RE_STAND_UP = re.compile(r"^stand[\s_-]+up$", re.I) +_RE_WAVE_HELLO = re.compile(r"^wave(?:[\s_-]+(?:hello|hi))?$", re.I) +_RE_RAISE_ARM = re.compile(r"^raise[\s_-]+(?:right[\s_-]+)?arm$", re.I) +_RE_LOWER_ARM = re.compile(r"^lower[\s_-]+(?:right[\s_-]+)?arm$", re.I) +_RE_STAY_HERE = re.compile(r"^stay[\s_-]+(?:here|in[\s_-]+place)$", re.I) +_RE_POINT_GESTURE = re.compile(r"^point(?:[\s_-]+(?:at|to)[\s_-]+.*)?$", re.I) +# Note: 'move left' / 'move right' canonicals are handled by the existing +# _RE_SIMPLE_DIR (verb=move, direction=left/right via MOVE_MAP). MOVE_MAP +# in config_Navigation.json defines 'left'/'right' as ROTATIONS, not +# strafes — historical decision; if real strafes are wanted later add +# vy axis to MOVE_MAP and a separate strafe handler. + +# ── HIGHER-LEVEL MOTIONS ───────────────────────────────────────────────────── +# 'look around' = scan-left + return + scan-right + return; uses execute_action +# so motion_abort halts it cleanly. +_RE_LOOK_AROUND = re.compile( + r"^(?:look[\s_-]+around|scan[\s_-]+(?:the\s+)?(?:room|area)|survey[\s_-]+(?:the\s+)?surroundings?)$", + re.I, +) +# 'follow me' = walk forward when YOLO sees a person and they aren't too close, +# else hold. Loop bounded by FOLLOW_ME_MAX_SEC and motion_abort. +_RE_FOLLOW_ME = re.compile( + r"^(?:follow[\s_-]+me|come[\s_-]+with[\s_-]+me|stick[\s_-]+with[\s_-]+me)$", + re.I, +) + +# Spatial planner: 'walk to/towards the '. Distinct from _RE_GOTO +# above because that one is reserved for place-memory names ('go to +# kitchen' looks up the saved 'kitchen' position). This planner uses +# LLaVA scene grounding to actually steer toward an arbitrary visible +# object — used for unsaved targets like 'the door' or 'the red chair'. +# Verbs: walk / step + to/towards/over to. We do NOT include 'go to' +# here to avoid colliding with place memory. 
+_RE_WALK_TO = re.compile( + r"^(?:(?:walk|step)\s+(?:over\s+)?(?:to|towards?|up\s+to)|approach)\s+" + r"(?:the\s+|a\s+|an\s+|that\s+|this\s+)?(.+?)$", + re.I, +) + +# Tunable durations for higher-level motions. Kept inline (not config) so a +# hot-edit doesn't risk an import-time crash; tweak if the demo asks for +# longer scan / follow runs. +LOOK_AROUND_TURN_SEC = 1.0 # ~45° at default vyaw +LOOK_AROUND_HOLD_SEC = 0.4 # pause at each extreme so VLM can see +FOLLOW_ME_MAX_SEC = 30.0 # hard upper bound on a 'follow me' session +FOLLOW_ME_STEP_SEC = 0.5 # one forward burst per yolo-poll cycle +FOLLOW_ME_POLL_SEC = 0.2 # how often to re-check yolo when no person + +# Spatial planner tunables. The planner asks LLaVA where the target is +# (left/center/right + near/medium/far) every WALK_TO_REPLAN_SEC, then +# turns to centre + walks forward in WALK_TO_STEP_SEC bursts. Total +# session bounded by WALK_TO_MAX_SEC. +WALK_TO_MAX_SEC = 45.0 +WALK_TO_REPLAN_SEC = 2.5 # cadence of LLaVA scene queries +WALK_TO_STEP_SEC = 1.0 # forward burst length between re-plans +WALK_TO_TURN_SHORT_SEC = 0.4 # small correction (~18° at default vyaw) +WALK_TO_TURN_WIDE_SEC = 1.0 # larger correction when target is far off +WALK_TO_LLAVA_TIMEOUT_SEC = 15.0 # per-call deadline on the VLM scene query. + # Without this, ollama-python's default + # 5-min HTTP timeout would let a stalled + # VLM block the entire motion worker + # — and motion_abort can't break out of + # an in-flight HTTP call. + + +def _call_llava_with_timeout(prompt, img_b64, num_predict=120, + timeout_sec=WALK_TO_LLAVA_TIMEOUT_SEC): + """Run call_llava in a worker thread with a hard deadline. + Returns the raw string response, or None on timeout/error. + + We cannot truly cancel the HTTP request (ollama-python doesn't + expose a cancel handle), so the thread is left to finish on its + own — but the planner moves on. 
The orphaned thread will exit + when the HTTP eventually returns; we rely on it being a daemon + so it doesn't block process exit. + """ + result = {"raw": None, "err": None} + + def _runner(): + try: + result["raw"] = call_llava(prompt, img_b64, num_predict=num_predict) + except Exception as e: + result["err"] = str(e) + + th = threading.Thread(target=_runner, daemon=True) + th.start() + th.join(timeout=timeout_sec) + if th.is_alive(): + return None # timeout + if result["err"]: + return None # error + return result["raw"] + # Autonomous mode instance — set by init_autonomous() _auto = None @@ -108,9 +211,22 @@ def try_local_command(cmd: str) -> bool: odom.return_to_start() else: print(" [Places] Odometry not running — cannot return to start") - else: - place_goto(name) - return True + return True + # Try saved-place memory first. If place_goto returns False + # (no such place), fall through to the spatial planner — the + # user probably meant a visible object like 'the door' that + # was never saved as a place. Do this by rewriting the cmd + # to the planner's input shape and re-dispatching through + # try_local_command via _RE_WALK_TO below; we DON'T return + # True here so the rest of try_local_command runs. + if place_goto(name): + return True + # Re-target this same call through the spatial planner. We + # mutate the local 'cmd' string, then fall through to the + # _RE_WALK_TO branch later in this function (see further down). + print(f" [Places] '{name}' not saved — trying spatial planner") + cmd = "walk to " + name + # Note: don't return — let control fall through to _RE_WALK_TO. 
m = _RE_FORGET.match(cmd) if m: @@ -142,6 +258,7 @@ def try_local_command(cmd: str) -> bool: vx, _, _ = MOVE_MAP["forward"] t0 = time.time() while time.time() - t0 < meters / abs(vx): + if motion_abort.is_set(): break send_vel(vx=vx) time.sleep(0.05) gradual_stop() @@ -156,6 +273,7 @@ def try_local_command(cmd: str) -> bool: vx, _, _ = MOVE_MAP["backward"] t0 = time.time() while time.time() - t0 < meters / abs(vx): + if motion_abort.is_set(): break send_vel(vx=vx) time.sleep(0.05) gradual_stop() @@ -178,6 +296,7 @@ def try_local_command(cmd: str) -> bool: duration = abs(degrees) / vyaw_deg_per_sec t0 = time.time() while time.time() - t0 < duration: + if motion_abort.is_set(): break send_vel(vyaw=vyaw) time.sleep(0.05) gradual_stop() @@ -188,11 +307,17 @@ def try_local_command(cmd: str) -> bool: direction = (m.group(1) or "forward").lower() if direction.startswith("back"): direction = "backward" - steps = int(m.group(2)) + # Step count may be fractional ('walk 3.5 steps' / 'walk half + # a step' after fraction normalization). Convert to float for + # the duration calc; STEP_DURATION_SEC × 3.5 = 7s of motion. + steps = float(m.group(2)) vx, _, _ = MOVE_MAP[direction] duration = STEP_DURATION_SEC * steps t0 = time.time() while time.time() - t0 < duration: + if motion_abort.is_set(): break + wait_while_paused() + if motion_abort.is_set(): break send_vel(vx=vx) time.sleep(0.05) gradual_stop() @@ -201,11 +326,16 @@ def try_local_command(cmd: str) -> bool: m = _RE_TURN_STEP.match(cmd) if m: direction = m.group(1).lower() - steps = int(m.group(2)) if m.group(2) else 1 + # Fractional step count for sub-step turns (e.g. 'turn left 0.5 + # steps' from 'turn left half a step'). 
+ steps = float(m.group(2)) if m.group(2) else 1.0 _, _, vyaw = MOVE_MAP[direction] duration = STEP_DURATION_SEC * steps t0 = time.time() while time.time() - t0 < duration: + if motion_abort.is_set(): break + wait_while_paused() + if motion_abort.is_set(): break send_vel(vyaw=vyaw) time.sleep(0.05) gradual_stop() @@ -225,6 +355,9 @@ def try_local_command(cmd: str) -> bool: vx, vy, vyaw = MOVE_MAP[direction] t0 = time.time() while time.time() - t0 < STEP_DURATION_SEC: + if motion_abort.is_set(): break + wait_while_paused() + if motion_abort.is_set(): break send_vel(vx=vx, vy=vy, vyaw=vyaw) time.sleep(0.05) gradual_stop() @@ -236,6 +369,292 @@ def try_local_command(cmd: str) -> bool: gradual_stop() return True + # ── DIRECT-MOTION CANONICALS (no LLaVA fallback) ──────────────────── + # These canonicals were previously falling through to ask(cmd, img) → + # Ollama → 5–30s blocking call. We handle them deterministically here. + # Bug #3 in the 14-bug audit; see README of that fix for rationale. + + # 'turn around' = 180° turn. Synthesize the brain call as the + # parametric form so we reuse the existing _RE_TURN_DEG path + # (which has motion_abort / motion_pause / odom support). + if _RE_TURN_AROUND.match(cmd): + return try_local_command("turn 180 degrees") + + # 'wave hello' / 'wave' → arm gesture via do_arm(). The arm_api + # is currently a stub (arm_available=false) so this prints a clear + # message rather than calling Ollama. Once GR00T is wired the gesture + # will fire automatically — no change here needed. 
+ if _RE_WAVE_HELLO.match(cmd): + from API.arm_api import do_arm + do_arm("wave") + return True + + if _RE_RAISE_ARM.match(cmd): + from API.arm_api import do_arm + do_arm("raise_right") # default to right; alias is in config_Arm.json + return True + + if _RE_LOWER_ARM.match(cmd): + from API.arm_api import do_arm + do_arm("lower") + return True + + if _RE_POINT_GESTURE.match(cmd): + from API.arm_api import do_arm + # 'point at X' has no canonical gesture in the arm vocabulary — + # closest match is 'right_up' which extends the right arm forward. + # Use 'high_wave' (alias=wave) only if 'right_up' isn't registered. + from API.arm_api import ARM_ALIASES, ARM_ACTIONS + for choice in ("right_up", "raise_right", "high_wave", "wave"): + if choice in ARM_ALIASES or choice in ARM_ACTIONS: + do_arm(choice) + break + else: + print(" [Arm] no point/raise gesture configured — skipping") + return True + + # 'sit down' / 'stand up' need posture commands the SDK doesn't + # currently expose through marcus's ZMQ velocity bridge. Print a + # clear log line and gradual_stop (safety) instead of taking 20s + # to have Qwen invent something. When G1 SportClient.StandUp() / + # Sit() are wired into a posture_api, replace these with real calls. + if _RE_SIT_DOWN.match(cmd): + gradual_stop() + print(" [Posture] sit_down not yet wired — gradual_stop only") + return True + if _RE_STAND_UP.match(cmd): + gradual_stop() + print(" [Posture] stand_up not yet wired — gradual_stop only") + return True + + # 'stay here' / 'stay in place' — explicit hold. Just stops; the + # robot maintains its standing posture by SDK default. + if _RE_STAY_HERE.match(cmd): + gradual_stop() + print(" [Posture] holding position") + return True + + # ── LOOK AROUND ─────────────────────────────────────────────────────── + # Scan left ~45° → centre → right ~90° → centre. Each leg goes through + # execute_action which polls motion_abort, so 'stop' mid-scan halts + # cleanly. 
No-op fallback if MOVE_MAP doesn't define left/right (shouldn't + # happen on a real config but keep us defensive). + if _RE_LOOK_AROUND.match(cmd): + from Brain.executor import execute_action # late import — avoids + # circular at module load + try: + execute_action("left", LOOK_AROUND_TURN_SEC) + if motion_abort.is_set(): return True + time.sleep(LOOK_AROUND_HOLD_SEC) + if motion_abort.is_set(): return True + execute_action("right", LOOK_AROUND_TURN_SEC * 2) + if motion_abort.is_set(): return True + time.sleep(LOOK_AROUND_HOLD_SEC) + if motion_abort.is_set(): return True + execute_action("left", LOOK_AROUND_TURN_SEC) + except Exception as e: + print(f" [LookAround] failed: {e}") + return True + + # ── FOLLOW ME ───────────────────────────────────────────────────────── + # Watch for a person in the camera frame; when one is detected and not + # already too close, advance forward in a short burst. If the person is + # too close, hold. If no person seen, hold and re-poll. Bounded by + # FOLLOW_ME_MAX_SEC and motion_abort. yolo_api uses fallback stubs when + # YOLO isn't loaded — in that case we give up immediately and tell the + # operator (no person tracking = no point walking blindly forward). + if _RE_FOLLOW_ME.match(cmd): + from Brain.executor import move_step + try: + from API.yolo_api import yolo_sees, yolo_person_too_close + except Exception as e: + print(f" [FollowMe] yolo_api unavailable: {e}") + return True + # Probe for actual YOLO availability — both the stub `yolo_sees` + # and the stub `yolo_person_too_close` always return False, so a + # follow_me run with no real YOLO would just hold forever. + try: + from API.yolo_api import _stub_sees as _stub_sees_ref # type: ignore + stubbed = (yolo_sees is _stub_sees_ref) + except Exception: + stubbed = False + if stubbed: + print(" [FollowMe] YOLO not loaded — cannot track persons. " + "Skipping. 
(init_yolo() at startup to enable.)") + return True + + print(f" [FollowMe] tracking up to {FOLLOW_ME_MAX_SEC:.0f}s — " + "say 'stop' to end") + deadline = time.time() + FOLLOW_ME_MAX_SEC + last_log = 0.0 + while time.time() < deadline: + if motion_abort.is_set(): + print(" [FollowMe] motion_abort set — ending") + gradual_stop() + break + try: + see_person = bool(yolo_sees("person")) + except Exception: + see_person = False + try: + too_close = bool(yolo_person_too_close()) + except Exception: + too_close = False + + now = time.time() + if now - last_log > 2.0: + state = ("person too close" if (see_person and too_close) + else "tracking" if see_person + else "no person") + print(f" [FollowMe] {state}") + last_log = now + + if see_person and not too_close: + move_step("forward", FOLLOW_ME_STEP_SEC) + else: + gradual_stop() + time.sleep(FOLLOW_ME_POLL_SEC) + gradual_stop() + print(" [FollowMe] done") + return True + + # ── WALK TO TARGET (spatial planner) ───────────────────────────────── + # LLaVA-grounded approach to an arbitrary visible object. This is a + # best-effort planner — accuracy depends entirely on the VLM. Loop: + # 1. Capture a frame, ask LLaVA: "Is visible? bearing + # (left/center/right)? distance (near/medium/far)?" + # 2. If not visible: scan left/right small turn, retry up to N times. + # 3. If visible: turn-toward (small or wide based on bearing) then + # walk forward for WALK_TO_STEP_SEC. + # 4. Done when distance == 'near' OR motion_abort OR timeout. + # All motion goes through move_step / execute_action which already + # poll motion_abort and motion_pause, so 'stop'/'pause' work normally. + m = _RE_WALK_TO.match(cmd) + if m: + target = m.group(1).strip().rstrip(".!?,").strip() + if not target: + return True + # If the target is a known place name, defer to place_goto — + # the user probably meant the saved place, not the visible + # object with that name. (Belt-and-braces with _RE_GOTO above.) 
+ if mem and target.lower() in {p.lower() for p in (mem.list_places() or [])}: + place_goto(target) + return True + + from Brain.executor import move_step, execute_action + + print(f" [WalkTo] target = {target!r}; max {WALK_TO_MAX_SEC:.0f}s") + deadline = time.time() + WALK_TO_MAX_SEC + scan_attempts_left = 4 # how many turns before giving up if unseen + last_replan = 0.0 + # Kill-switch on the LLaVA retry loop. Without this, an unresponsive + # Ollama (server crashed / model OOM / network wedged) makes the + # planner grind for the full WALK_TO_MAX_SEC, hammering with retries + # every 0.5 s and producing nothing but noise. Bail after a small + # consecutive-timeout count so the user gets a fast 'unable to plan' + # rather than a 45 s silent wait. Field-tested cap. + consecutive_llava_timeouts = 0 + LLAVA_TIMEOUT_GIVE_UP = 3 + + while time.time() < deadline: + if motion_abort.is_set(): + print(" [WalkTo] motion_abort set — ending") + gradual_stop() + break + + # Re-plan via LLaVA at each cycle. Caching one query per + # WALK_TO_REPLAN_SEC keeps token cost in check. + now = time.time() + if now - last_replan < WALK_TO_REPLAN_SEC and last_replan > 0: + time.sleep(0.1) + continue + last_replan = now + + img_b64 = get_frame() + if not img_b64: + print(" [WalkTo] no camera frame — pausing") + time.sleep(0.5) + continue + + prompt = ( + f"You are guiding a robot toward the {target!r}. Look at " + f"the image. 
Answer ONLY in JSON, no other text:\n" + f"{{\"visible\": true|false, \"bearing\": \"left|center|right\", " + f"\"distance\": \"near|medium|far\", \"reason\": \"\"}}\n" + f"- 'visible' = whether the {target!r} is in the frame\n" + f"- 'bearing' = which side of the frame it's on\n" + f"- 'distance' = rough estimate based on apparent size\n" + f"- 'near' means within ~1 metre (almost stopping distance)" + ) + # Wrap LLaVA in a thread-timeout: ollama-python's default + # HTTP timeout is ~5 minutes, which would let a stalled VLM + # block the entire motion worker far longer than the watchdog + # would tolerate. WALK_TO_LLAVA_TIMEOUT_SEC = 15s gives the + # VLM enough room to answer at idle while keeping abort + # responsive (next iteration re-checks motion_abort). + if motion_abort.is_set(): break + raw = _call_llava_with_timeout(prompt, img_b64, num_predict=120) + if raw is None: + consecutive_llava_timeouts += 1 + print(f" [WalkTo] LLaVA timeout/error " + f"({consecutive_llava_timeouts}/{LLAVA_TIMEOUT_GIVE_UP}) " + f"— retrying") + if consecutive_llava_timeouts >= LLAVA_TIMEOUT_GIVE_UP: + print(f" [WalkTo] LLaVA unresponsive after " + f"{LLAVA_TIMEOUT_GIVE_UP} timeouts — giving up. " + f"Check Ollama (curl http://localhost:11434/api/version).") + gradual_stop() + return True + time.sleep(0.5) + continue + consecutive_llava_timeouts = 0 + obs = parse_json(raw) or {} + + visible = bool(obs.get("visible")) + bearing = (obs.get("bearing") or "").lower() + distance = (obs.get("distance") or "").lower() + reason = obs.get("reason", "") + print(f" [WalkTo] visible={visible} bearing={bearing} " + f"distance={distance} ({reason[:60]})") + + if motion_abort.is_set(): break + + if not visible: + if scan_attempts_left <= 0: + print(f" [WalkTo] {target!r} not found after scans — " + "giving up") + break + scan_attempts_left -= 1 + # Small turn left to widen the search arc; next iteration + # the LLaVA call will see the new view. 
+ execute_action("left", WALK_TO_TURN_SHORT_SEC) + continue + scan_attempts_left = 4 # reset once we see it again + + # Reached: stop and report + if distance == "near": + gradual_stop() + print(f" [WalkTo] arrived at {target!r}") + return True + + # Steer toward target + if bearing == "left": + turn_sec = (WALK_TO_TURN_WIDE_SEC if distance == "far" + else WALK_TO_TURN_SHORT_SEC) + execute_action("left", turn_sec) + elif bearing == "right": + turn_sec = (WALK_TO_TURN_WIDE_SEC if distance == "far" + else WALK_TO_TURN_SHORT_SEC) + execute_action("right", turn_sec) + # bearing == 'center' or unknown → just walk forward + if motion_abort.is_set(): break + + # Forward burst + move_step("forward", WALK_TO_STEP_SEC) + + gradual_stop() + return True + # ── NAMED PATROL ROUTE ─────────────────────────────────────────────── m = _RE_PATROL_RT.match(cmd) if m: diff --git a/Brain/executor.py b/Brain/executor.py index f7706b9..87164db 100644 --- a/Brain/executor.py +++ b/Brain/executor.py @@ -6,6 +6,7 @@ import time import threading from API.zmq_api import send_vel, gradual_stop, MOVE_MAP, STEP_PAUSE from API.arm_api import ALL_ARM_NAMES, do_arm +from Core.motion_state import motion_abort, wait_while_paused def _obstacle_check(): @@ -29,6 +30,18 @@ def execute_action(move: str, duration: float): vx, vy, vyaw = MOVE_MAP[move] t0 = time.time() while time.time() - t0 < duration: + if motion_abort.is_set(): + gradual_stop() + print(" [executor] motion_abort set — stopping") + return + # Honour a soft pause: hold zero velocity and wait until + # the user resumes (or aborts). wait_while_paused returns + # immediately if not paused. Re-check abort right after. 
+ if motion_pause_check(): + send_vel(0.0, 0.0, 0.0) + wait_while_paused() + if motion_abort.is_set(): + gradual_stop(); return if _obstacle_check(): gradual_stop() print(" [Safety] LiDAR obstacle — stopping") @@ -47,6 +60,15 @@ def move_step(move: str, duration: float): vx, vy, vyaw = MOVE_MAP[move] t0 = time.time() while time.time() - t0 < duration: + if motion_abort.is_set(): + send_vel(0.0, 0.0, 0.0) + print(" [executor] motion_abort set — pausing step") + return + if motion_pause_check(): + send_vel(0.0, 0.0, 0.0) + wait_while_paused() + if motion_abort.is_set(): + send_vel(0.0, 0.0, 0.0); return if _obstacle_check(): send_vel(0.0, 0.0, 0.0) print(" [Safety] LiDAR obstacle — pausing step") @@ -57,6 +79,13 @@ def move_step(move: str, duration: float): time.sleep(0.1) +def motion_pause_check() -> bool: + """Returns True if a soft-pause is active. Tiny indirection so we + don't have to import motion_pause directly into every motion site.""" + from Core.motion_state import motion_pause + return motion_pause.is_set() + + def merge_actions(actions: list) -> list: """Merge consecutive same-direction steps into one smooth movement.""" if not actions: diff --git a/Brain/marcus_brain.py b/Brain/marcus_brain.py index d0d09aa..a321c37 100644 --- a/Brain/marcus_brain.py +++ b/Brain/marcus_brain.py @@ -346,14 +346,131 @@ def process_command(cmd: str) -> dict: log_cmd(cmd, response) return {"type": "greeting", "speak": response, "action": "GREETING", "elapsed": 0} - # ── "Come to me" shortcut ──────────────────────────────────────────── + # ── "Come to me" — smart YOLO-tracked approach ────────────────────── + # Old behaviour: walk forward 2s blindly. New: if YOLO is loaded, + # locate the person in the camera frame, turn toward them if off- + # centre, walk forward in short bursts, stop when 'arrived' (person + # fills enough of the frame to be 'right in front'). 
Bounded by + # COME_TO_ME_MAX_SEC; falls back to a 2s blind walk if YOLO isn't + # available so the canonical still does *something*. + # + # Arrival threshold: yolo_person_too_close(threshold=...) returns + # True if the person's bbox size_ratio (bbox_area / frame_area) + # exceeds the threshold. Defaults are calibrated for safety + # (0.25 = ~1m away, used by patrol). For come_here we want the + # robot to come MUCH closer — arm's length, ~0.5m — which is + # roughly size_ratio > 0.32 (lookup in marcus_yolo.distance_estimate: + # 'very close' starts at 0.30). 0.32 is a safe 'arrived' marker; + # any higher and the robot might bump into the user. if re.match(r"^(?:come(?:\s+back)?(?:\s+to\s+me)?|come\s+here|get\s+closer|approach|move\s+closer)\s*[!.]*$", cmd, re.IGNORECASE): - execute_action("forward", 2.0) + from Brain.executor import move_step + try: + from API.yolo_api import yolo_sees, yolo_person_too_close, yolo_closest + from API.yolo_api import _stub_sees as _stub_sees_ref # type: ignore + yolo_active = (yolo_sees is not _stub_sees_ref) + except Exception: + yolo_active = False + + COME_TO_ME_MAX_SEC = 25.0 + COME_TO_ME_STEP_SEC = 0.6 + COME_TO_ME_TURN_SEC = 0.3 # small correction (~14° at default vyaw) + COME_TO_ME_POLL_SEC = 0.3 + # Arrival threshold: 0.32 size_ratio ≈ ~0.5m / arm's length. + # Patrol's 0.25 default is too lenient for come-to-me — it + # would treat a 1m-away user as 'already arrived'. + COME_TO_ME_ARRIVED_RATIO = 0.32 + + if not yolo_active: + # Fallback — same behaviour as before for hardware setups + # without YOLO loaded. 
+ print(" [ComeToMe] YOLO not loaded — walking forward 2s") + execute_action("forward", 2.0) + resp = "Coming to you" + print(f"Sanad: {resp}") + add_to_history(cmd, resp) + log_cmd(cmd, resp) + return {"type": "move", "speak": resp, "action": "FORWARD 2.0s", "elapsed": 2.0} + + print(f" [ComeToMe] tracking person up to {COME_TO_ME_MAX_SEC:.0f}s — say 'stop' to end") + deadline = time.time() + COME_TO_ME_MAX_SEC + last_log = 0.0 + scan_attempts_left = 6 # how many turns before giving up if person never seen + while time.time() < deadline: + try: + from Core.motion_state import motion_abort + if motion_abort.is_set(): + print(" [ComeToMe] motion_abort — stopping") + break + except Exception: + pass + try: + seen = bool(yolo_sees("person")) + except Exception: + seen = False + try: + # IMPORTANT: pass an explicit 'arrived' threshold (0.32) + # — without it, the default 0.25 (safety patrol's value) + # would call the user 'too close' when they're a metre + # away, and the robot would stop without walking. We + # want it to come arm's length close. + arrived = bool(yolo_person_too_close( + threshold=COME_TO_ME_ARRIVED_RATIO, + )) + except Exception: + arrived = False + + now = time.time() + if now - last_log > 1.5: + state = ("arrived (close to person)" if (seen and arrived) + else "tracking person" if seen + else "scanning for person") + print(f" [ComeToMe] {state}") + last_log = now + + if seen and arrived: + # Made it — robot is close enough to stop in front + print(" [ComeToMe] arrived in front of person") + break + + if seen and not arrived: + # Person is in view but not yet arm's length close — + # centre on them and walk forward in a burst. 
+ bearing = "center" + try: + det = yolo_closest("person") + if det and getattr(det, "position", None): + # position is something like 'left' / 'center' / 'right' + bearing = str(det.position).lower() + except Exception: + pass + if bearing == "left": + execute_action("left", COME_TO_ME_TURN_SEC) + elif bearing == "right": + execute_action("right", COME_TO_ME_TURN_SEC) + # forward burst + move_step("forward", COME_TO_ME_STEP_SEC) + scan_attempts_left = 6 # reset + continue + + # No person — try a small scan turn, then check again + if scan_attempts_left <= 0: + print(" [ComeToMe] no person found after scans — giving up") + break + scan_attempts_left -= 1 + execute_action("left", COME_TO_ME_TURN_SEC) + time.sleep(COME_TO_ME_POLL_SEC) + + try: + from API.zmq_api import gradual_stop as _gs + _gs() + except Exception: + pass + elapsed = time.time() - t0 resp = "Coming to you" print(f"Sanad: {resp}") add_to_history(cmd, resp) log_cmd(cmd, resp) - return {"type": "move", "speak": resp, "action": "FORWARD 2.0s", "elapsed": 2.0} + return {"type": "move", "speak": resp, "action": "COME_TO_ME", "elapsed": elapsed} # ── Multi-step compound ────────────────────────────────────────────── _multi = re.match( @@ -620,7 +737,23 @@ def run_terminal(): print(' Pick the ID that sounded English and set it in') print(' Config/config_Voice.json :: tts.builtin_speaker_id') continue + # Structured motion log for terminal-typed commands. The + # voice path already logs through Voice/marcus_voice's + # worker; here we cover the OTHER entry-point so every + # motion (text or voice) shows up in logs/motion.log with + # parsed direction/magnitude/unit + actual elapsed. 
+ try: + from Core.motion_log import log_motion as _mlog + _mlog("start", cmd, source="text") + _t0 = time.time() + except Exception: + _t0 = time.time() result = process_command(cmd) + try: + _mlog("complete", cmd, source="text", + actual_sec=time.time() - _t0) + except Exception: + pass sp = result.get("speak", "") if isinstance(result, dict) else "" if sp and _audio_api: _audio_api.speak(sp) diff --git a/Config/config_Voice.json b/Config/config_Voice.json index 922e83d..5433278 100644 --- a/Config/config_Voice.json +++ b/Config/config_Voice.json @@ -24,10 +24,10 @@ "gemini_record_enabled": false, "gemini_record_keep_count": 50, - "_gemini_camera_comment": "Stream camera frames to Gemini Live so vision answers ('what do you see') are correct rather than hallucinated. Marcus parent grabs JPEG frames via API.camera_api.get_frame() at gemini_frame_interval_sec cadence and pipes them to the runner over stdin. Frame_max_age_sec drops stale frames. Set gemini_send_frames=false to disable (saves API tokens but breaks vision questions).", + "_gemini_camera_comment": "Stream camera frames to Gemini Live so vision answers ('what do you see') are correct rather than hallucinated. NOTE: this is NOT a true video stream. Marcus sends discrete still JPEG frames at gemini_frame_interval_sec cadence (default 0.5s = 2 fps) — Gemini Live treats them as low-fps video for context but each frame is independent. Frame_max_age_sec drops stale frames. Set gemini_send_frames=false to disable (saves API tokens but breaks vision questions). Bug #8 in the 14-bug audit was 'is this real video?' — answer: 2 fps stills.", "gemini_send_frames": true, - "gemini_frame_interval_sec": 0.5, - "gemini_frame_max_age_sec": 1.5, + "gemini_frame_interval_sec": 1.0, + "gemini_frame_max_age_sec": 2.5, "_gemini_barge_comment": "Barge-in = user speaking over Gemini. Three loud chunks above barge_threshold interrupts Gemini mid-sentence. 
echo_suppress_below masks mic frames quieter than the threshold during playback so the mic doesn't re-feed Gemini its own voice. On the G1 the on-board speaker is loud enough that ECHO frames hit ~1500-3000 RMS, well above the 500 barge threshold — that's why earlier sessions saw self-interrupt loops. Tuned values: threshold 3500 (only a real shout cuts Gemini off), echo_suppress_below 3500 (mute everything below that during AI playback — anything quieter than the speaker's own echo is treated as silence). ai_speak_grace_sec 0.5 gives Gemini a half-second runway before barge can fire. If you find users genuinely can't interrupt Gemini, drop barge_threshold to ~2500 and accept some self-interrupts.", "gemini_barge_threshold": 3500, @@ -40,7 +40,7 @@ "_gemini_system_prompt_comment": "Persona for Gemini Live. **Gemini is the SOLE wake-word gatekeeper** — Marcus does NOT check for 'Sanad'/'سند' in Python. The dispatcher just listens to whatever Gemini speaks; if Gemini says a motion-confirmation phrase ('Turning right', 'أستدير يميناً'), Marcus dispatches motion. Therefore the persona below MUST keep Gemini disciplined: never use a motion-confirmation phrase unless the user actually said the wake word. The strict rules are spelled out inside the prompt itself. Override by pointing gemini_system_prompt_file at a text file.", "gemini_system_prompt_file": "", - "gemini_system_prompt": "You are Sanad (سند), a friendly humanoid robot made by YS Lootah Technology in Dubai. Your body is a Unitree G1 humanoid. You see the user through your camera and talk to them in real time. You speak BOTH English and Arabic fluently — always match the user's language in your reply. Reply briefly: usually one or two sentences. When the user asks what you see / 'ماذا ترى' / 'شو شايف' or otherwise asks about the scene, look at the camera frames you are receiving and answer ONLY with what is actually there; never invent details.\n\nCRITICAL ACTION GATE — YOU ARE THE ONLY GATEKEEPER. 
An external system listens to YOUR spoken reply for motion-confirmation phrases ('Turning right', 'Sitting down', 'أستدير يميناً', 'أجلس', etc.) and uses those phrases to physically move the robot. Your verbal acknowledgement IS the trigger. So:\n\nRULE 1 — Use motion-confirmation phrases ONLY when the user clearly addressed you BY NAME ('Sanad' in English OR 'سند' / 'يا سند' in Arabic) AND requested an action. The wake word and the action must both be present in the SAME user utterance.\n\nRULE 2 — If the user requests a motion WITHOUT saying your name (e.g. 'turn right', 'استدر يميناً', 'sit down'), DO NOT say any motion-confirmation phrase. Reply naturally explaining you need to be addressed by name first — e.g. 'Please call me Sanad first if you'd like me to move.' / 'لازم تنادي عليّ باسمي سند قبل لما أتحرك.'\n\nRULE 3 — If the user says only your name ('Sanad' / 'سند' alone) with no action, reply 'Yes?' / 'نعم؟'. Do NOT say any motion phrase.\n\nRULE 4 — Plain chat, vision queries, hypotheticals ('I might turn around if I get lost'), descriptive references ('the person turned right and walked away'): NEVER use a motion verb in first person present continuous. Rephrase so you do not say 'Turning ...', 'Moving ...', 'Sitting ...', 'Standing ...', 'أستدير ...', 'أتحرك ...', 'أجلس ...', 'أقف ...'.\n\nRULE 5 — When you DO confirm a real motion request, reply in the SAME language the user used and emit ONE short confirmation phrase per requested motion, separated by punctuation, in the order the user listed them. Single motion: 'Turning right.' / 'أستدير يميناً.' Compound (multiple motions in one utterance): 'Turning right, then moving forward.' / 'أستدير يميناً، ثم أتحرك للأمام.' / 'سأقف، ثم ألوّح.' Each motion verb must come from the supported list below. 
Do NOT collapse multiple requested motions into one verb, do NOT add motions the user did not ask for, and do NOT pad with filler — only the chained motion phrases plus optional commas/'then'/'ثم'/'وبعدين'.\n\nRULE 6 — PARAMETRIC MOTIONS. When the user includes a NUMBER ('turn 360', 'turn left 90 degrees', 'walk 5 steps', 'walk 2 meters back', 'استدر 360 درجة', 'امشِ خمس خطوات', 'تحرك مترين'), KEEP the number in your confirmation phrase using these EXACT shapes — the external dispatcher reads the number from your reply and passes it to the robot:\n • 'Turning degrees.' / 'Turning left degrees.' / 'Turning right degrees.'\n • 'Walking steps.' / 'Walking forward steps.' / 'Walking back steps.'\n • 'Walking meters.' / 'Walking forward meters.' / 'Walking backward meters.'\n • 'أستدير درجة.' / 'أستدير يميناً درجة.' / 'أستدير يساراً درجة.'\n • 'أمشي خطوات.' / 'أمشي للأمام متر.' / 'أمشي للخلف متر.'\nUse digit characters ('5', '90', '360'), not spelled words ('five', 'ninety', 'خمسة'). Convert spelled-out numbers and Arabic words ('خمس'/'خمسة'/'عشرة') to digits before speaking. If the user says 'turn around' WITHOUT a number, treat it as the fixed 'turn around' motion — do NOT invent 180. If the user gives multiple parametric motions ('turn right 90 then walk 3 steps'), chain them per Rule 5 with the numbers preserved: 'Turning right 90 degrees, then walking 3 steps.'\n\nMotion verbs supported (English / Arabic): turn left/right (استدر يميناً/يساراً), turn around (استدر للخلف), move forward/back (تحرك للأمام/للخلف), sit down (اجلس), stand up (قف), wave hello (لوّح), raise/lower arm (ارفع/اخفض يدك), come here (تعال), follow me (اتبعني), stay here (ابق هنا), go home (اذهب للبيت), stop (توقف), patrol (طوف), look around (انظر حولك).", + "gemini_system_prompt": "You are Sanad (سند), a friendly humanoid robot made by YS Lootah Technology in Dubai. Your body is a Unitree G1 humanoid. You see the user through your camera and talk to them in real time. 
You speak BOTH English and Arabic fluently — always match the user's language in your reply. Reply briefly: usually one or two sentences. When the user asks what you see / 'ماذا ترى' / 'شو شايف' or otherwise asks about the scene, look at the camera frames you are receiving and answer ONLY with what is actually there; never invent details.\n\nCRITICAL ACTION GATE — YOU ARE THE ONLY GATEKEEPER. An external system listens to YOUR spoken reply for motion-confirmation phrases ('Turning right', 'Sitting down', 'أستدير يميناً', 'أجلس', etc.) and uses those phrases to physically move the robot. Your verbal acknowledgement IS the trigger. So:\n\nRULE 1 — Use motion-confirmation phrases ONLY when the user clearly addressed you BY NAME ('Sanad' in English OR 'سند' / 'يا سند' in Arabic) AND requested an action. The wake word and the action must both be present in the SAME user utterance.\n\nRULE 1b — TRANSCRIPTION FRAGMENTS. The G1 mic occasionally fragments the wake word into separate transcribed chunks ('سن / د' or 'Sa / nad' across two ticks). If you see a clear cluster of fragments at the start of an utterance that PHONETICALLY resembles 'Sanad' / 'سند' (e.g. 'سن' followed by 'ت'/'د'/'تد', or 's' / 'sa' / 'san' followed by 'ad'/'nad'), AND the rest of the utterance is a clear motion request, treat the wake word as PRESENT and proceed with motion confirmation. Be lenient: under-confirming a real motion request because of mic fragmentation is worse UX than over-confirming. Do NOT guess wake-word presence when the rest of the utterance is also fragmentary or noise — in that case ask 'Did you want me to do something?' / 'هل تريد مني فعل شيء؟'.\n\nRULE 1c — NEVER HALLUCINATE MOTION FROM SILENCE OR AMBIGUITY. If the user transcript is empty, contains only punctuation ('.'), is just '', or is too fragmentary to identify a clear action, you MUST NOT confirm any motion. NEVER invent a motion the user didn't request. NEVER pick a motion from prior context to fill silence. 
NEVER 'complete' an earlier-half-formed command from a fresh empty turn. Default response in those cases:\n • English: 'Sorry, I didn't catch that. Could you say it again?'\n • Arabic: 'عذراً، لم أفهم. ممكن تعيدها؟'\nA confirmed motion phrase ('Turning right.', 'أمشي للأمام.') is a HARD COMMITMENT — the dispatcher will move the robot. Never emit one unless the user clearly said both the wake word AND the action in THIS utterance. When in doubt, ASK; do not move.\n\nRULE 1c-2 — CONCRETE FAILURE EXAMPLE OBSERVED IN THE FIELD (do NOT repeat). Sequence: previous turn = 'Sanad, walk to the luggage' (planner started + failed). User next turn = ''. User next turn = '.' (punctuation only). You then emitted: 'Sanad, Walking 1 steps.' — WRONG. The user's last two utterances contained no motion request and no wake word; you invented '1 steps' to fill silence after the previous planner call failed. CORRECT behavior in that situation: stay quiet, OR say 'Sorry, I didn't catch that.' — never emit a parametric motion to recover from a previous turn's failure. The previous turn ended; this is a fresh turn with no input.\n\nRULE 1d — NO FABRICATED COMPOUND CHAINS. If the user gave you a SHORT or AMBIGUOUS phrase and you choose to act, do NOT pad your confirmation with extra motions the user didn't request. Wrong: user says 'turn left' → you say 'Turning left, then walking forward 1 steps.' (you added the walk). Right: user says 'turn left' → you say 'Turning left.' If the user said something that COULD be interpreted as compound ('turn left and step forward') AND both motions are clearly named, then yes confirm both. If the user said only ONE motion verb plus a quantity ('turn left one step') treat the quantity as part of THAT motion (single turn) — see the 'Turning left N steps' shape in Rule 6.\n\nRULE 2 — If the user requests a motion WITHOUT saying your name (e.g. 'turn right', 'استدر يميناً', 'sit down'), DO NOT say any motion-confirmation phrase. 
Reply naturally explaining you need to be addressed by name first — e.g. 'Please call me Sanad first if you'd like me to move.' / 'لازم تنادي عليّ باسمي سند قبل لما أتحرك.'\n\nRULE 3 — If the user says only your name ('Sanad' / 'سند' alone) with no action, reply 'Yes?' / 'نعم؟'. Do NOT say any motion phrase.\n\nRULE 4 — Plain chat, vision queries, hypotheticals ('I might turn around if I get lost'), descriptive references ('the person turned right and walked away'): NEVER use a motion verb in first person present continuous. Rephrase so you do not say 'Turning ...', 'Moving ...', 'Sitting ...', 'Standing ...', 'أستدير ...', 'أتحرك ...', 'أجلس ...', 'أقف ...'.\n\nRULE 4b — DESTINATIONS belong to Rule 12. Plain directional requests stay destination-free: 'walk forward' → 'Walking forward.' / 'أمشي للأمام.' — never add a destination word the user didn't name. But when the user DOES name a visible target to walk to ('walk to the door', 'go to the chair', 'امشي للباب'), do NOT strip it down to 'Walking forward.' — confirm with the walk-to-target shapes in RULE 12 ('Walking to the door.' / 'أمشي إلى الباب.') so the dispatcher can extract the target and invoke the spatial planner. 'come to me' / 'تعال' is the come-to-user canonical (Rule 5d), not a walk-to target.\n\nRULE 5 — When you DO confirm a real motion request, reply in the SAME language the user used and emit ONE short confirmation phrase per requested motion, separated by punctuation, in the order the user listed them. Single motion: 'Turning right.' / 'أستدير يميناً.' Compound (multiple motions in one utterance): 'Turning right, then moving forward.' / 'أستدير يميناً، ثم أتحرك للأمام.' / 'سأقف، ثم ألوّح.' Each motion verb must come from the supported list below. Do NOT collapse multiple requested motions into one verb, do NOT add motions the user did not ask for, and do NOT pad with filler — only the chained motion phrases plus optional commas/'then'/'ثم'/'وبعدين'.\n\nRULE 5h — ACT OR ASK, NEVER BOTH. 
If you emit a motion-confirmation phrase ('Turning left.', 'Walking forward 1 steps.', 'أتوقف.', 'أمشي للأمام.') the dispatcher INSTANTLY moves the robot. Do NOT in the same reply also ask 'did you mean X or Y?' — that's confusing UX (the robot already moved while you ask). Choose ONE behavior per turn:\n • CONFIDENT enough to act → emit the confirmation phrase ONLY, no follow-up question.\n • UNCERTAIN → ask a clarifying question ONLY, do NOT also speak a confirmation phrase.\n\nRULE 5h-2 — CLARIFICATION QUESTIONS MUST NOT REPEAT MOTION VERBS. When you ask the user to clarify, do NOT use first-person motion verbs like 'أستدير', 'أمشي', 'turning', 'walking' inside the question — even though the question mark makes it grammatically not-a-confirmation, an imperfect transcript echo or a fragmented Gemini delivery can still make the dispatcher false-fire from a question. SAFEST patterns:\n • English: 'Do you want me to turn or walk?' (verbs in user-form, not 'turning'/'walking')\n • Arabic: 'هل تريد أن أتحرك يساراً أم خطوة للأمام؟' (use 'أتحرك' generically)\n • SAFEST: enumerate options as nouns: 'Left turn, or one step forward — which one?' / 'استدارة لليسار، أم خطوة للأمام — أيهما؟'\nDefence in depth: the dispatcher also strips question clauses entirely, so even a question containing motion verbs won't fire — but you should still avoid bait phrases.\n\nIf the user's intent is genuinely ambiguous, the safe default is to ASK, not to act-and-ask. Once the user clarifies, act decisively in the next turn.\n\nRULE 5h-3 — NEVER PREFIX YOUR REPLY WITH YOUR OWN NAME. The user calls you Sanad; you do not address yourself. Do NOT start your spoken reply with 'Sanad,' or 'سند،' — that is the user's word for you, not yours for yourself. Wrong: 'Sanad, Walking forward.' / 'Sanad, Turning right.' / 'سند، أستدير يميناً.' Right: 'Walking forward.' / 'Turning right.' / 'أستدير يميناً.' This applies to motion confirmations, vision answers, greetings, clarifications — every reply. 
The dispatcher tolerates the prefix (it strips quoted self-mentions), but the user hears it spoken aloud and it sounds like the robot is talking to itself. Audited example from the field: you said 'Sanad, Walking 1 steps.' — drop the 'Sanad,' — emit only 'Walking 1 steps.'\n\nRULE 5g — STEP-CLOSER vs COME-TO-USER. There are TWO separate canonicals for 'get closer' and they have DIFFERENT semantics:\n • 'come close' / 'step closer' / 'get closer' / 'one step closer' / 'قرب' / 'اقترب' / 'تقرّب' / 'خطوة قدام' / 'تقدّم خطوة' → SINGLE forward step (deterministic, no camera tracking). Confirm with: 'Stepping closer.' / 'Coming closer.' / 'أتقدم خطوة.' / 'أتقرّب خطوة.' / 'خطوة للأمام.'\n • 'come here' / 'come to me' / 'تعال لعندي' / 'تعال هنا' → YOLO-tracked smart approach (walks toward you, stops arm's length away). Confirm with: 'Coming to you.' / 'أتي إليك.' (per Rule 5d below).\nIf the user just asks to be closer with no clear come-to-me intent, pick step-closer (safer, predictable). If the user says 'come to me' or addresses the robot to come over to them, use come-to-user.\n\nRULE 5d — COME-TO-USER CONFIRMATIONS. When the user asks the robot to come ('come here', 'come to me', 'تعال', 'تعال لعندي', 'come closer'), use one of these EXACT confirmations — do NOT invent grammatical forms (Arabic conjugation 'أتعال' is INCORRECT and the dispatcher won't recognise it):\n • English: 'Coming to you.' / 'Approaching.' / 'On my way to you.' / 'Heading to you.'\n • Arabic: 'أتي إليك.' / 'آتي إليك.' / 'أنا قادم.' / 'قادم إليك.' / 'أتجه إليك.'\nMarcus uses the camera to find you and walks toward you, stopping when close. Wake-word rule still applies: only confirm if the user said 'Sanad' / 'سند' AND the come-request.\n\nRULE 5e — NO ZERO/NEGATIVE QUANTITIES. Never include 0, 0.0, or any negative number as a quantity in a motion confirmation. 'Turn right 0 degrees.' / 'Walking forward 0 steps.' are nonsense — the robot won't move. 
If the user's transcript contained a number that turned out to be 0 (e.g. mishearing 'صفر' or 'zero'), DROP the parametric clause from your confirmation and emit just the bare canonical motion (or ask for clarification). Same rule for negative numbers — these can NEVER appear in a confirmation. This is enforced at the dispatcher too — 0-quantity parametric motions are silently discarded.\n\nRULE 5b — QUANTITY PRESERVATION. NEVER invent or substitute a quantity the user did not say. If the user said 'walk one step', confirm 'Walking forward 1 steps.' If the user said 'walk', confirm 'Moving forward.' (no number). If the user said 'turn', confirm 'Turning right.' or ask which direction — NEVER pad the motion with a default like 5 steps or 90 degrees. SPECIFICALLY: if the transcript is empty, fragmentary, just '' or '.' (per Rule 1c), do NOT fill in any quantity. NEVER use 5 as a default — the user must explicitly state every number you put in your confirmation.\n\nRULE 6 — PARAMETRIC MOTIONS. When the user includes a NUMBER ('turn 360', 'turn left 90 degrees', 'walk 5 steps', 'walk 2 meters back', 'استدر 360 درجة', 'امشِ خمس خطوات', 'تحرك مترين'), KEEP the number in your confirmation phrase using these EXACT shapes — the external dispatcher reads the number from your reply and passes it to the robot:\n • 'Turning degrees.' / 'Turning left degrees.' / 'Turning right degrees.'\n • 'Walking steps.' / 'Walking forward steps.' / 'Walking back steps.'\n • 'Walking meters.' / 'Walking forward meters.' / 'Walking backward meters.'\n • 'Turning left steps.' / 'Turning right steps.' ← WHEN THE USER SAYS 'turn left one step' / 'turn right two steps' / 'لف يسار خطوة' — this is a SINGLE turn motion of N steps' magnitude, NOT a compound 'turn + walk'. Confirm as ONE phrase. Wrong: 'Turning left, then walking 1 step.' Right: 'Turning left 1 steps.'\n • 'أستدير درجة.' / 'أستدير يميناً درجة.' / 'أستدير يساراً درجة.'\n • 'أستدير يساراً خطوات.' / 'أستدير يميناً خطوات.' 
← مثل 'لف يسار خطوة واحدة' / 'استدر يميناً خطوتين' — تأكيد واحد، ليس مركب.\n • 'أمشي خطوات.' / 'أمشي للأمام متر.' / 'أمشي للخلف متر.'\nDialect Arabic confirmations are fully accepted — match the user's accent (Levantine/Syrian, Egyptian, Gulf/Khaleeji, Iraqi, Maghrebi). Verb roots may be لف / دور / خش instead of أستدير; direction words may be شمال (left) / لقدام / قدام (forward) / لورا / ورا (back). Examples that DO dispatch correctly:\n • Levantine/Syrian: 'لف يمين 90 درجة.' / 'لف شمال 180 درجة.' / 'امشي قدام 5 خطوات.' / 'روح ورا مترين.'\n • Egyptian: 'لف على اليمين 90 درجة.' / 'خش شمال 90 درجة.' / 'امشي قدام 3 خطوات.'\n • Gulf/Khaleeji: 'دور يمين 90 درجة.' / 'دور 360 درجة.' / 'امش للأمام 5 خطوات.'\n • Iraqi: 'دور لليمين 90 درجة.' / 'امشي 3 خطوات للأمام.'\nUse digit characters ('5', '90', '360'), not spelled words ('five', 'ninety', 'خمسة', 'تسعين'). Convert spelled-out numbers in any language ('خمس'/'خمسة'/'عشرة'/'تسعين') to digits before speaking.\n\nRULE 6b — DECIMALS AND FRACTIONS. The robot supports non-integer quantities. Use the same digit-only style as Rule 6, with a decimal point:\n • Spoken DECIMALS — preserve the digits the user said. 'walk 1.79 meters' → 'Walking 1.79 meters.' 'turn 22.5 degrees' → 'Turning 22.5 degrees.' Never round to the nearest integer.\n • Spoken FRACTIONS — convert to a decimal in your confirmation. Word→digit map: half / نصف / نص → 0.5; quarter / ربع → 0.25; third / ثلث → 0.33. When the fraction follows a whole number, ADD them: 'three and a half steps' → 3.5 steps; 'two and a quarter meters' → 2.25 meters; 'ثلاث خطوات ونصف' → 3.5 خطوات; 'مترين وربع' → 2.25 متر. When the fraction stands alone before a unit, it's just the fraction: 'half a meter' → 0.5 meters; 'نصف متر' → 0.5 متر.\n • English examples: user 'walk three and a half steps' → 'Walking 3.5 steps.' / user 'half a meter forward' → 'Walking forward 0.5 meters.' 
/ user 'turn ninety and a half degrees' → 'Turning 90.5 degrees.'\n • Arabic examples: user 'أمشي ثلاث خطوات ونصف' → 'أمشي 3.5 خطوات.' / user 'نصف متر للأمام' → 'أمشي للأمام 0.5 متر.' / user 'استدر 90 ونصف درجة' → 'أستدير 90.5 درجة.'\nThe dispatcher accepts spelled fractions too as a safety net (the user will hear you say '3.5' while it parses 'three and a half' equivalently), but digit-decimal confirmations match the actual robot motion exactly and are clearer.\n\nRULE 7 — MEMORY (repeat / undo). Marcus remembers your last dispatched motion and can replay it or invert it on command. When the user asks to repeat ('do that again', 'again', 'one more time', 'كرر', 'تاني', 'مرة ثانية') OR to undo/reverse ('undo', 'reverse', 'go back', 'تراجع', 'ارجع', 'اعكس'), confirm with one of these EXACT phrases — the external dispatcher recognises them and replays/inverts the previous motion:\n • Repeat: 'Repeating that.' / 'Doing that again.' / 'أعيد الحركة.' / 'مرة ثانية.'\n • Reverse: 'Reversing that.' / 'Undoing that.' / 'أتراجع عن آخر حركة.' / 'أعكس آخر حركة.'\nDo NOT try to recall the original motion verb yourself ('Turning right again.') — the dispatcher needs the literal repeat/reverse confirmation phrase. Wake-word rule still applies: only confirm if the user said 'Sanad' / 'سند' AND a repeat/undo trigger.\n\nRULE 8 — STOP HAS PRIORITY. When the user says stop / halt / wait / pause / hold / freeze / توقف / استنى / أوقف / انتظر EVEN MID-MOTION, confirm IMMEDIATELY with 'Stopping.' / 'أتوقف.' (one short phrase). Do NOT try to finish the in-progress motion before responding — the dispatcher cancels it. Do NOT chain stop with another motion ('Stopping, then turning right.') — stop terminates the chain. The wake-word rule is RELAXED for stop: even without 'Sanad'/'سند', if the user clearly wants the robot to halt (urgent tone, repeated stop, 'wait!'), confirm with 'Stopping.' — safety first.\n\nRULE 9 — RECENT-MOTION MEMORY. When the user later asks 'what are you doing?' 
/ 'did you finish?' / 'كم خطوة مشيت؟' / 'هل أكملت؟' / 'are you still moving?', answer naturally from your own memory of what you most recently confirmed: 'I just finished walking 5 steps forward.' / 'انتهيت لتوي من المشي 5 خطوات للأمام.' If you have not confirmed any recent motion, say so plainly: 'I'm not moving right now.' / 'أنا لست أتحرك حالياً.' Keep these answers short and conversational — one sentence.\n\nRULE 9b — NEVER USE SQUARE-BRACKET MARKER FORMAT. NEVER emit text like '[STATE-DONE]', '[STATE-START]', '[STATE-INTERRUPTED]', '[STATE-PAUSED]', '[STATE-RESUMED]', '(Ns)', or any similar bracketed-tag wrapper. These formats look like internal logging markers; if you produce them, the dispatcher strips them out and the robot WON'T MOVE — your motion confirmation is lost. Field-observed failure (do NOT repeat): user said '' and you replied '[STATE-DONE] Reversing that. (1s)' — wrong on every level (hallucinated motion from silence per Rule 1c-2, and bracketed format which gets stripped). Right: ordinary plain prose only — 'Reversing that.' / 'Walking 1 steps.' / 'I'm not moving right now.' No brackets, no parenthetical durations, no markup of any kind.\n\nRULE 10 — PAUSE vs STOP. Pause is a SOFT HOLD: the robot freezes in place but the in-progress motion is NOT cancelled — saying 'resume' / 'كمّل' continues the SAME motion from where it paused. Stop is a HARD ABORT: the chain ends, nothing pending runs. Triggers and confirmations:\n • Pause: 'pause' / 'hold on' / 'wait a moment' / 'استنى لحظة' / 'ثبت' → confirm 'Pausing.' / 'أتوقف مؤقتاً.'\n • Resume: 'resume' / 'continue' / 'كمّل' / 'تابع' → confirm 'Resuming.' / 'أكمل.'\n • Stop: 'stop' / 'cancel' / 'توقف' / 'الغ' → confirm 'Stopping.' (Rule 8).\nIf you receive a pause request while no motion is running, still confirm naturally — the dispatcher harmlessly sets the pause flag for the next motion. Do NOT confuse 'wait a moment' (pause) with 'stop' — they have different semantics. 
If the user's intent is genuinely ambiguous between the two, prefer pause (recoverable) over stop.\n\nRULE 11 — CUSTOM SEQUENCES (record / save / play). The user can record a chain of motions, name it, and replay it later by name. The dispatcher needs to extract the NAME from your spoken confirmation, so include it literally:\n • Start recording: user says 'start recording' / 'سجل الحركات' (optionally 'as my-greet'). Confirm: 'Recording your sequence.' or with name 'Recording your sequence as my-greet.' / 'بدأت تسجيل التسلسل as my-greet.'\n • While recording: user dictates motion commands (turn right, walk 3 steps, wave hello). You confirm each motion NORMALLY (Rule 5) — the dispatcher captures them.\n • Save recording: user says 'save sequence as my-greet' / 'احفظ التسلسل باسم my-greet'. Confirm: 'Saved as my-greet.' / 'حفظت باسم my-greet.' INCLUDE THE NAME — the dispatcher reads 'as ' to pick the filename.\n • Cancel recording: user says 'cancel recording'. Confirm: 'Discarded recording.' / 'ألغيت التسجيل.'\n • Play recording: user says 'play my-greet' / 'run my-greet' / 'نفذ my-greet'. Confirm: 'Playing my-greet.' / 'أنفذ my-greet.' INCLUDE THE NAME — the dispatcher uses the verb+name pattern to look up the file.\nName format: lowercase letters, digits, hyphens, underscores; 1–31 chars. If the user proposes a name with spaces ('my greet'), say it back hyphenated: 'Saved as my-greet.' If no name supplied on save, ask for one before confirming. NEVER make up a name on play — if the user said 'play sequence' alone with no name, ask which one.\n\nRULE 12 — WALK TO TARGET (spatial planner). When the user asks the robot to walk toward a specific object you can see — 'walk to the door', 'go to the chair', 'امشي للباب', 'اتجه نحو الكرسي', 'امشي باتجاه المكتب' — confirm with one of these EXACT shapes (the dispatcher extracts the target name from the verb+preposition+name pattern):\n • English: 'Walking to the .' / 'Heading to the .' / 'Walking towards the .' 
/ 'Going up to the .'\n • Arabic: 'أمشي إلى ال.' / 'أتجه نحو ال.' / 'أتحرك باتجاه ال.'\nKeep the target a single noun phrase (one or two words max — 'the door', 'the red chair'). Do NOT add destination words to ordinary forward motion ('walk forward' is just forward; 'walk to the door' invokes the spatial planner). If the user asked to walk to something you do NOT see in the camera frame, reply 'I don't see the from here. Could you turn me to face it?' / 'لا أرى ال من هنا. ممكن توجهني إليه؟' — do NOT confirm a walk-to motion you can't actually plan.\n\nRULE 12b — ANAPHORA RESOLUTION. When the user says 'walk to that ' / 'go to it' / 'امشي لذلك' / 'اتجه إليه' / 'تعال هنا للكرسي اللي قلت عليه', resolve the pronoun to the most recently named visible object from your earlier descriptions. Then confirm using the LITERAL noun, not the pronoun, so the dispatcher can extract a concrete target. Examples:\n • You said: 'I see a wooden chair and a desk.' → User says 'walk to that chair' → confirm 'Walking to the chair.' (NOT 'walking to that chair' — drop the deictic).\n • قلت: 'أرى مكتباً وكرسي خشبي.' → المستخدم: 'امشي للكرسي الخشبي' → أكد 'أمشي إلى الكرسي.'\nIf the user's anaphor doesn't match anything you recently described, ask: 'Which do you mean?' / 'أي تقصد؟' — do NOT guess. If the user says 'turn around' WITHOUT a number, treat it as the fixed 'turn around' motion — do NOT invent 180. 
If the user gives multiple parametric motions ('turn right 90 then walk 3 steps'), chain them per Rule 5 with the numbers preserved: 'Turning right 90 degrees, then walking 3 steps.'\n\nMotion verbs supported (English / Arabic): turn left/right (استدر يميناً/يساراً), turn around (استدر للخلف), move forward/back (تحرك للأمام/للخلف), sit down (اجلس), stand up (قف), wave hello (لوّح), raise/lower arm (ارفع/اخفض يدك), come here (تعال), follow me (اتبعني), stay here (ابق هنا), go home (اذهب للبيت), stop (توقف), patrol (طوف), look around (انظر حولك).", "_gemini_vad_comment": "Gemini server-side VAD tuning. start_sensitivity/end_sensitivity accept 'START_SENSITIVITY_HIGH|LOW' and 'END_SENSITIVITY_HIGH|LOW'. HIGH start = eagerly treats any speech-like sound as turn start, LOW = more conservative. LOW end = longer patience before ending a turn, HIGH = cuts turn sooner. prefix_padding_ms preserves audio from just before speech is detected. silence_duration_ms is how long of quiet ends a turn.", "gemini_vad_start_sensitivity": "START_SENSITIVITY_HIGH", @@ -55,13 +55,20 @@ "gemini_no_messages_timeout_sec": 30, "_mic_gain_comment": "Software gain applied to every mic chunk before it reaches Gemini Live. G1 far-field mic is quiet at default — voices below 1m can land below Gemini's server-side VAD start threshold and never trigger a turn (you'll see 'alive (no speech ...)' repeating in voice.log). 1.0 = unchanged. 2.0–3.0 = comfortable for a 1–2 m talking distance. Above ~4.0 risks clipping loud close-range syllables — the 4.5 below trades some clipping headroom for reliable far-field pickup; drop back toward 3.0 if transcripts show distorted words. Tune by reading the e= value in 'alive (no speech ...)' lines: aim for e>500 when you speak normally; >1500 if you want barge-in to fire mid-Gemini-reply.", - "mic_gain": 2.5, + "mic_gain": 4.5, "_dispatch_comment": "Motion command dispatch side-channel. 
Marcus listens to Gemini's input_transcription; if the text contains a wake-word variant AND the remainder fuzzy-matches a canonical phrase in command_vocab at >= command_vocab_cutoff, Marcus fires on_command() in parallel to Gemini's verbal reply. Dedup on the canonical form within command_cooldown_sec prevents streaming partials from double-firing.", "command_vocab_cutoff": 0.72, "command_cooldown_sec": 1.5, "min_transcription_length": 3, + "_watchdog_comment": "Per-command watchdog. If a single brain motion call exceeds this many seconds the worker fires motion_abort to break the brain's polling loops (and gradual_stop runs as part of that exit path). Generous default — covers the longest legitimate motion (walk_5_meters at 0.25 m/s with 3x safety multiplier ≈ 60s) plus a small margin. Set to 0 to disable. Patrol bypasses the watchdog because patrol's outer duration governs it, and a 5-minute patrol would always trip a 60s watchdog.", + "motion_watchdog_sec": 70.0, + "motion_watchdog_skip_canonicals": ["patrol"], + + "_state_inject_comment": "Whether the runner injects [STATE-START]/[STATE-DONE]/[STATE-INTERRUPTED]/[STATE-ERROR]/[STATE-PAUSED]/[STATE-RESUMED] markers into Gemini Live as silent text context. Disabled by default because (a) some Gemini Live SDK builds drop the WebSocket with 1007 'invalid frame payload data' when text is interleaved with continuous audio streaming — observed 5 reconnects in a 30-min test — and (b) Gemini occasionally echoes the [STATE-...] back through its own output transcription, which the dispatcher used to misread as a new motion command (now stripped by _STATE_ECHO_RE in marcus_voice but still cosmetically noisy). Marcus side keeps current_motion / command_history regardless, so set true if you want Gemini to give detailed answers to 'what did you just do?'. 
Set false (default) for stable demos.", + "gemini_send_state_events": false, + "_vocab_comment": "wake_words and command_vocab now live in Config/instruction.json — single source of truth for all bilingual phrase tables (wake variants + per-action user_phrases + per-action bot_phrases, English AND Arabic). garbage_patterns stays here because it's noise filtering, not voice instruction.", "garbage_patterns": [ "thanks for watching", "thank you for watching", diff --git a/Config/instruction.json b/Config/instruction.json index 426551f..9ca05f1 100644 --- a/Config/instruction.json +++ b/Config/instruction.json @@ -70,19 +70,26 @@ "turn_around": { "canonical": "turn around", "user_phrases": { - "english": ["turn around", "turn back", "spin around", "about face", "face the other way"], + "english": ["turn around", "turn back", "spin around", "about face", "face the other way", "spin in place", "spin yourself", "turn yourself around"], "arabic": [ "استدر للخلف", "استدر إلى الوراء", "اتجه للخلف", "ادر للخلف", "ارجع وجهك", "لف ورا", "لف للورا", "لف للخلف", "دور ورا", "دور للورا", "دور للخلف", "اتلف", "اتلف ورا", "دير وراك", "دير للورا", "استدر ورا", - "اقلب وجهك", "اقلب", "اعكس اتجاهك", "غيّر اتجاهك" + "اقلب وجهك", "اقلب", "اعكس اتجاهك", "غيّر اتجاهك", + "لف حول نفسك", "لف حوالين نفسك", "لف على نفسك", + "دور حول نفسك", "دور حوالين نفسك", "دور على نفسك", + "استدر حول نفسك", "استدر حوالين نفسك", + "حول نفسك", "حوالين نفسك", + "لف بنفسك", "دور بنفسك" ] }, "bot_phrases": { "english": ["turning around"], "arabic": [ "أستدير للخلف", "أستدير إلى الوراء", "استدير للخلف", - "بلف ورا", "بدور للورا", "أدور للخلف" + "بلف ورا", "بدور للورا", "أدور للخلف", + "أستدير حول نفسي", "ألف حول نفسي", "أدور حول نفسي", + "بلف حول نفسي", "بدور حول نفسي" ] } }, @@ -102,7 +109,11 @@ ] }, "bot_phrases": { - "english": ["moving forward", "walking forward", "stepping forward", "going forward", "going ahead"], + "_": "Direction-anchored only. 
We do NOT add destination-phrased needles like 'walking towards X' / 'أمشي نحو X' even though Gemini occasionally says them — those false-fire when Gemini DESCRIBES a third party ('I see a person walking towards us'). Persona RULE 4b forbids destination phrasing in motion confirmations; trust the prompt. If a real motion phrase ever slips through and we miss it, add the EXACT slip to this list — not generic destination patterns.", + "english": [ + "moving forward", "walking forward", "stepping forward", + "going forward", "going ahead", "heading forward", "walking ahead" + ], "arabic": [ "أتحرك للأمام", "أتحرك إلى الأمام", "أتقدم", "أمشي للأمام", "أذهب للأمام", "بمشي قدام", "براح قدام", "أروح قدام", "بتقدم", "بفوت لقدام" @@ -188,9 +199,10 @@ ] }, "bot_phrases": { + "_": "Removed 'أنتظر' from this list — it's also a pause_motion bot_phrase ('I wait' is more naturally a pause than a stop). Keeping it here would cause Gemini saying 'أنتظر' to fire BOTH stop and pause_motion in the same chunk.", "english": ["stopping", "halting", "holding"], "arabic": [ - "أتوقف", "توقفت", "أنتظر", + "أتوقف", "توقفت", "بوقّف", "وقّفت", "بستنى", "أصبر", "بطّلت" ] } @@ -339,10 +351,17 @@ ] }, "bot_phrases": { - "english": ["coming over", "coming to you", "approaching"], + "_": "Expanded Arabic forms because Gemini sometimes emits non-canonical conjugations (e.g. 'أتعال' is grammatically wrong but phonetically what it produces for 'I come'). 
Persona Rule 5d gives Gemini correct example phrases; these extra needles catch slip cases like 'أتعال'/'بجي لك'.", + "english": [ + "coming over", "coming to you", "approaching", + "coming to find you", "coming your way", + "i'm coming", "on my way to you", "heading to you" + ], "arabic": [ - "آتي إليك", "أقترب", "أتقرّب", - "بجي", "جاي", "جاي لك", "بقرّب", "أقرّب منك" + "آتي إليك", "أتي إليك", "أتي", "آتي", "أتعال", + "أنا قادم", "قادم إليك", "قادم لك", "أقترب منك", + "أقترب", "أتقرّب", "بجي", "بجي لك", "جاي", "جاي لك", + "بقرّب", "أقرّب منك", "رايح لك", "أتجه إليك" ] } }, @@ -479,6 +498,223 @@ "english": [], "arabic": [] } + }, + + "repeat_last": { + "_": "Memory action: re-fire the most recent dispatched command. Marcus VoiceModule keeps a deque of recent commands and intercepts this canonical — it is NEVER passed to the brain, the resolved (real) command is dispatched instead. bot_phrases use distinctive multi-word forms ('repeating that', 'doing that again') to avoid colliding with the bare 'again' / 'تاني' that users might say in a different context.", + "canonical": "repeat last", + "user_phrases": { + "english": [ + "do that again", "do it again", "again", "repeat", "redo", + "one more time", "once more", "do the same again", + "say that again", "do it once more", "repeat that", + "another time", "another one" + ], + "arabic": [ + "كرر", "كرّر", "كررها", "أعد", "أعدها", "أعد العملية", + "تاني", "مرة ثانية", "مرة تانية", "مرة كمان", "مرة أخرى", + "كرر اللي سويته", "كرر الحركة", "أعد الحركة", "مرة ثانية لو سمحت", + "أعد الكرة", "كمان مرة" + ] + }, + "bot_phrases": { + "english": [ + "repeating that", "doing that again", "doing it again", + "one more time", "doing the same again", "repeating the last action" + ], + "arabic": [ + "أعيد الحركة", "أكرر الحركة", "مرة ثانية", "أعيدها", + "أكررها", "أعيد آخر حركة", "أكرر آخر حركة" + ] + } + }, + + "pause_motion": { + "_": "Soft hold: brain motion loops freeze in place (zero velocity) and wait. 
Resume continues the SAME motion from where it paused — different from stop, which terminates the chain. Marcus VoiceModule intercepts this canonical and sets motion_pause directly; brain never sees it. Persona Rule 10 instructs Gemini to confirm with 'Pausing.' / 'أتوقف مؤقتاً.'.", + "canonical": "pause motion", + "user_phrases": { + "english": [ + "pause", "pause motion", "hold on", "hold position", "hold up", + "wait a moment", "wait a sec", "freeze for a moment", "one second", + "give me a sec", "pause for a moment", "stay there" + ], + "arabic": [ + "توقف لحظة", "انتظر لحظة", "ثبت", "ثبت مكانك", + "استنى شوي", "استنى ثانية", "ثانية واحدة", "لحظة", "هدّي شوي", + "ايقاف مؤقت", "وقفة قصيرة", "انتظرني", "خليك مكانك" + ] + }, + "bot_phrases": { + "english": [ + "pausing", "holding position", "holding for a moment", + "freezing in place", "pausing for a moment" + ], + "arabic": [ + "أتوقف مؤقتاً", "أتوقف لحظة", "أنتظر", "أثبت مكاني", + "إيقاف مؤقت", "انتظار قصير", "أبقى مكاني" + ] + } + }, + + "resume_motion": { + "_": "Clears motion_pause so the previously-running brain loop continues. If nothing was actually paused, this is a quiet no-op. Persona Rule 10 instructs Gemini to confirm with 'Resuming.' / 'أكمل.'.", + "canonical": "resume motion", + "user_phrases": { + "english": [ + "resume", "continue", "go on", "carry on", "keep going", + "proceed", "resume motion", "continue what you were doing", + "you can continue", "go ahead now" + ], + "arabic": [ + "أكمل", "استكمل", "كمّل", "كمل", "تابع", "استمر", + "كمّل الحركة", "كمّل اللي كنت فيه", "تابع اللي كنت تسوي", + "روح تابع", "استمر بالحركة" + ] + }, + "bot_phrases": { + "english": [ + "resuming", "continuing", "carrying on", "resuming motion", + "picking up where I left off" + ], + "arabic": [ + "أكمل", "أستمر", "أتابع", "أكمل الحركة", + "أكمل من حيث توقفت", "أتابع اللي كنت أسويه" + ] + } + }, + + "start_recording": { + "_": "Begin capturing subsequent dispatched commands into a named sequence buffer. 
Marcus VoiceModule intercepts this canonical and starts recording. Optionally pre-name the sequence here ('start recording as my-greet') — otherwise the user must name it on save. The recording is in-memory until save_sequence is called.", + "canonical": "start recording", + "user_phrases": { + "english": [ + "start recording", "begin recording", "record this", + "remember this sequence", "record what i do", + "record sequence", "start saving moves", "save what i do" + ], + "arabic": [ + "ابدأ التسجيل", "ابدأ تسجيل", "ابدا التسجيل", + "سجل اللي بسويه", "سجل الحركات", + "سجل ما أفعله", "ابدأ الحفظ" + ] + }, + "bot_phrases": { + "english": [ + "starting to record", "recording your sequence", + "i'm recording", "recording started" + ], + "arabic": [ + "بدأت التسجيل", "أسجل الحركات", "بدأت تسجيل التسلسل", + "بدأت أسجل" + ] + } + }, + + "save_sequence": { + "_": "Stop recording and persist the buffered commands. The user typically appends a name in the SAME utterance ('save sequence as my-greet') — Gemini extracts the name and includes it in the confirmation. Marcus parses the bot_phrase, extracts the trailing name, calls Sequences.save_recording(name).", + "canonical": "save sequence", + "user_phrases": { + "english": [ + "save sequence", "save this sequence", "save it as", + "save recording", "stop recording and save", + "stop recording save as", "save recording as", + "name this sequence", "call this sequence" + ], + "arabic": [ + "احفظ التسلسل", "احفظ التسجيل", "احفظ باسم", + "أوقف التسجيل واحفظ", "احفظ هذا التسلسل", + "سمي هذا التسلسل", "احفظ التسجيل باسم", "خلصت سجل" + ] + }, + "bot_phrases": { + "english": [ + "saved sequence", "saved as", "saving sequence as", + "recording saved", "saved your sequence as" + ], + "arabic": [ + "حفظت التسلسل", "حفظت باسم", "أحفظ التسلسل باسم", + "تم الحفظ" + ] + } + }, + + "play_sequence": { + "_": "Replay a saved sequence by name. 
The bot_phrase carries the name ('Playing my-greet.'); Marcus extracts it and calls Sequences.play(name) → enqueues each command on the motion worker queue. Sequence commands execute serially with normal interrupt support — say 'stop' to abort the playback.", + "canonical": "play sequence", + "user_phrases": { + "english": [ + "play sequence", "play my", "run my", "do my", + "play recording", "execute sequence", "run sequence", + "perform sequence" + ], + "arabic": [ + "شغل التسلسل", "نفذ التسلسل", "سوي اللي سجلته", + "ابدأ تسلسل", "نفذ تسجيل", "شغل التسجيل", + "اعمل التسلسل" + ] + }, + "bot_phrases": { + "english": [ + "playing", "running", "performing sequence", + "playing sequence", "starting sequence" + ], + "arabic": [ + "أشغل التسلسل", "أنفذ التسلسل", "أبدأ التسلسل", + "بدأت التشغيل" + ] + } + }, + + "cancel_recording": { + "_": "Abandon the in-progress recording without saving. Marcus calls Sequences.cancel_recording().", + "canonical": "cancel recording", + "user_phrases": { + "english": [ + "cancel recording", "discard recording", "throw away recording", + "stop recording without saving", "forget the recording" + ], + "arabic": [ + "ألغ التسجيل", "احذف التسجيل", "اوقف التسجيل بدون حفظ", + "تجاهل التسجيل" + ] + }, + "bot_phrases": { + "english": [ + "discarded recording", "cancelled recording", + "recording cancelled" + ], + "arabic": [ + "ألغيت التسجيل", "حذفت التسجيل", "تم إلغاء التسجيل" + ] + } + }, + + "reverse_last": { + "_": "Memory action: fire the INVERSE of the last dispatched command (turn right→turn left, walk forward→walk backward, sit down→stand up, raise arm→lower arm; parametric forms flip direction and keep the number). Marcus intercepts the 'reverse last' canonical and computes the inverse from history. NOTE: the brain has its own 'undo' regex that returns to the start position via odometry — that's a different feature; reverse_last only flips the LAST motion. 
Distinctive bot_phrases avoid colliding with move_backward needles like 'going back' / 'reversing'.", + "canonical": "reverse last", + "user_phrases": { + "english": [ + "undo", "undo that", "reverse", "reverse that", + "go back", "take that back", "cancel that", + "undo the last", "reverse the last", "do the opposite" + ], + "arabic": [ + "تراجع", "تراجع عن ذلك", "ارجع ما سويت", "ارجع", + "اعكس", "اعكسها", "ألغي", "ألغي ذلك", "ألغي الحركة", + "رد ما سويت", "تراجع عن آخر حركة", "اعكس آخر حركة", + "اعمل عكس", "اعمل العكس" + ] + }, + "bot_phrases": { + "english": [ + "reversing that", "undoing that", "reversing the last action", + "undoing the last action", "going back to before" + ], + "arabic": [ + "أتراجع عن ذلك", "أعكس آخر حركة", "ألغي آخر حركة", + "أتراجع عن آخر حركة", "أرجع للحالة السابقة", "أعكسها" + ] + } } }, @@ -489,6 +725,21 @@ "regex": "(?:turn(?:ing)?|spin(?:ning)?|rotat(?:e|ing))\\s+(left|right)\\s+(\\d+)\\s*deg(?:ree(?:s)?)?", "command_template": "turn $1 $2 degrees" }, + { + "_": "turn left/right N steps (English) — 'Turning left 1 steps.' Maps to brain's _RE_TURN_STEP single-motion canonical 'turn left N steps' (NOT compound). User says 'turn left one step' — meaning one step's worth of left rotation.", + "regex": "(?:turn(?:ing)?|spin(?:ning)?|rotat(?:e|ing))\\s+(left|right)\\s+(\\d+(?:\\.\\d+)?)\\s*steps?", + "command_template": "turn $1 $2 steps" + }, + { + "_": "Arabic أستدير يميناً/يساراً N خطوات — 'أستدير يساراً 1 خطوات' for 'لف يسار خطوة وحدة' single-motion intent. 
Parses as 'turn left N steps' brain canonical (not compound 'turn + walk').", + "regex": "(?:أستدير|استدر|لف+|ألف+|بلف|دور+|أدور+|بدور)\\s+(?:(?:على|ع|إلى)\\s+)?(?:يمين|يميناً|يمينا|اليمين|لليمين)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:خطوة|خطوات|خطوتين)", + "command_template": "turn right $1 steps" + }, + { + "_": "Arabic أستدير يساراً N خطوات — same as above, left side.", + "regex": "(?:أستدير|استدر|لف+|ألف+|بلف|دور+|أدور+|بدور)\\s+(?:(?:على|ع|إلى)\\s+)?(?:يسار|يساراً|يسارا|اليسار|لليسار|شمال|الشمال|للشمال)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:خطوة|خطوات|خطوتين)", + "command_template": "turn left $1 steps" + }, { "_": "turn N degrees — 'Turning 360 degrees.'", "regex": "(?:turn(?:ing)?|spin(?:ning)?|rotat(?:e|ing))\\s+(\\d+)\\s*deg(?:ree(?:s)?)?", @@ -496,17 +747,17 @@ }, { "_": "walk forward/back N steps — 'Walking forward 5 steps.'", - "regex": "(?:walk(?:ing)?|step(?:ping)?|mov(?:e|ing))\\s+(forward|back(?:ward)?)\\s+(\\d+)\\s+steps?", + "regex": "(?:walk(?:ing)?|step(?:ping)?|mov(?:e|ing))\\s+(forward|back(?:ward)?)\\s+(\\d+(?:\\.\\d+)?)\\s+steps?", "command_template": "walk $1 $2 steps" }, { "_": "walk N steps forward/back — 'Walking 5 steps forward.'", - "regex": "(?:walk(?:ing)?|step(?:ping)?|tak(?:e|ing))\\s+(\\d+)\\s+steps?\\s+(forward|back(?:ward)?)", + "regex": "(?:walk(?:ing)?|step(?:ping)?|tak(?:e|ing))\\s+(\\d+(?:\\.\\d+)?)\\s+steps?\\s+(forward|back(?:ward)?)", "command_template": "walk $2 $1 steps" }, { "_": "walk N steps (default forward) — 'Taking 5 steps.'", - "regex": "(?:walk(?:ing)?|step(?:ping)?|tak(?:e|ing))\\s+(\\d+)\\s+steps?", + "regex": "(?:walk(?:ing)?|step(?:ping)?|tak(?:e|ing))\\s+(\\d+(?:\\.\\d+)?)\\s+steps?", "command_template": "walk $1 steps" }, { @@ -525,34 +776,137 @@ "command_template": "walk $1 meters" }, { - "_": "Arabic: 'أستدير يميناً 90 درجة' / 'أستدير يمينا 90 درجة'", - "regex": "أستدير\\s+يمين[اًا]?ً?\\s+(\\d+)\\s*درجة", + "_": "Arabic turn right N degrees — covers MSA + Levantine/Syrian (لف يمين), Gulf/Iraqi (دور يمين), Egyptian (خش 
يمين). Verb roots: أستدير | استدر | لف | دور | خش. Direction tokens: يمين | يميناً | يمينا | اليمين (with optional على/ع prefix).", + "regex": "(?:أستدير|استدر|لف+|ألف+|دور+|أدور+|بدور|خش|أخش|بخش)\\s+(?:(?:على|ع)\\s+)?(?:يمين|يميناً|يمينا|اليمين|لليمين)\\s+(\\d+)\\s*درجة", "command_template": "turn right $1 degrees" }, { - "_": "Arabic: 'أستدير يساراً 90 درجة'", - "regex": "أستدير\\s+يسار[اًا]?ً?\\s+(\\d+)\\s*درجة", + "_": "Arabic turn left N degrees — also accepts شمال/الشمال (Lev/Egy 'left').", + "regex": "(?:أستدير|استدر|لف+|ألف+|دور+|أدور+|بدور|خش|أخش|بخش)\\s+(?:(?:على|ع)\\s+)?(?:يسار|يساراً|يسارا|اليسار|لليسار|شمال|الشمال|للشمال)\\s+(\\d+)\\s*درجة", "command_template": "turn left $1 degrees" }, { - "_": "Arabic: 'أستدير 360 درجة' (no direction)", - "regex": "أستدير\\s+(\\d+)\\s*درجة", + "_": "Arabic turn N degrees (no direction) — 'أستدير 360 درجة' / 'لف 180 درجة' / 'دور 90 درجة'.", + "regex": "(?:أستدير|استدر|لف+|دور+)\\s+(\\d+)\\s*درجة", "command_template": "turn $1 degrees" }, { - "_": "Arabic: 'أمشي/أتحرك 5 خطوات'", - "regex": "(?:أمشي|أتحرك)\\s+(\\d+)\\s*خطو(?:ات|ة|تين)?", + "_": "Arabic walk N steps forward — verbs: أمشي | امشي | أتحرك | تحرك | روح | اروح. 
Forward: للأمام | لقدام | قدام (Egy/Lev).", + "regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+(?:للأمام|لقدام|قدام)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:خطوة|خطوات|خطوتين)", + "command_template": "walk forward $1 steps" + }, + { + "_": "Arabic walk N steps back — Back: للخلف | لورا | ورا (Egy/Lev/Gulf).", + "regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+(?:للخلف|لورا|ورا)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:خطوة|خطوات|خطوتين)", + "command_template": "walk backward $1 steps" + }, + { + "_": "Arabic walk N steps forward — number BEFORE direction (also common in Egy/Lev).", + "regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:خطوة|خطوات|خطوتين)\\s+(?:للأمام|لقدام|قدام)", + "command_template": "walk forward $1 steps" + }, + { + "_": "Arabic walk N steps back — number BEFORE direction.", + "regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:خطوة|خطوات|خطوتين)\\s+(?:للخلف|لورا|ورا)", + "command_template": "walk backward $1 steps" + }, + { + "_": "Arabic walk N steps (default forward) — 'أمشي 5 خطوات' / 'امشي خمس خطوات' (digits only — Gemini is told to use digits).", + "regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:خطوة|خطوات|خطوتين)", "command_template": "walk $1 steps" }, { - "_": "Arabic: 'أتحرك للأمام مترين' / 'أمشي للأمام 2 متر'", - "regex": "(?:أمشي|أتحرك)\\s+للأمام\\s+(\\d+(?:\\.\\d+)?)\\s*متر", + "_": "Arabic walk N steps — REVERSED word order: verb + unit + number ('أمشي خطوة واحدة' → after normalise_numbers → 'أمشي خطوة 1'). Common natural-Arabic phrasing where the number-adjective follows the noun. Without this entry, this very natural phrasing fails to dispatch.", + "regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+(?:خطوة|خطوات|خطوتين)\\s+(\\d+)", + "command_template": "walk $1 steps" + }, + { + "_": "Arabic DUAL form 'خطوتين' (two steps) — single word, no separate digit. 
Without this entry the dispatcher would miss 'أمشي خطوتين' since the parametric regex needs \\d+. Maps directly to walk 2 steps.", + "regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+خطوتين", + "command_template": "walk 2 steps" + }, + { + "_": "Arabic DUAL form 'مترين' (two meters) — same dual-suffix pattern as above. 'أمشي مترين' / 'أتحرك مترين' → walk 2 meters.", + "regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+مترين", + "command_template": "walk 2 meters" + }, + { + "_": "Arabic DUAL form 'درجتين' (two degrees) — for completeness, though 2-degree turns are rare.", + "regex": "(?:أستدير|استدر|لف+|ألف+|دور+|أدور+|بدور)\\s+(?:يميناً|يمينا|اليمين|لليمين|يساراً|يسارا|اليسار|لليسار|شمال|الشمال|للشمال)?\\s*درجتين", + "command_template": "turn 2 degrees" + }, + { + "_": "Arabic walk N meters forward — direction first.", + "regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+(?:للأمام|لقدام|قدام)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:متر|مترين|أمتار)", "command_template": "walk forward $1 meters" }, { - "_": "Arabic: 'أتحرك للخلف مترين'", - "regex": "(?:أمشي|أتحرك)\\s+للخلف\\s+(\\d+(?:\\.\\d+)?)\\s*متر", + "_": "Arabic walk N meters back — direction first.", + "regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+(?:للخلف|لورا|ورا)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:متر|مترين|أمتار)", "command_template": "walk backward $1 meters" + }, + { + "_": "Arabic walk N meters forward — number first.", + "regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:متر|مترين|أمتار)\\s+(?:للأمام|لقدام|قدام)", + "command_template": "walk forward $1 meters" + }, + { + "_": "Arabic walk N meters back — number first.", + "regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:متر|مترين|أمتار)\\s+(?:للخلف|لورا|ورا)", + "command_template": "walk backward $1 meters" + }, + { + "_": "Arabic walk N meters (default forward).", + "regex": 
"(?:أمشي|امشي|امش|أتحرك|تحرك|اروح|روح|اروحي|روحي)\\s+(\\d+(?:\\.\\d+)?)\\s*(?:متر|مترين|أمتار)", + "command_template": "walk $1 meters" + }, + + { + "_": "Generic dialect Arabic 'turn right' WITHOUT number. Catches forms NOT in turn_right.bot_phrases.arabic, e.g. 'ألف على اليمين', 'دور لليمين', 'أستدير إلى اليمين', 'لف ع اليمين'. Verbs: لف/ألف/بلف/دور/أدور/بدور/خش/أخش/بخش/حوّد/أحوّد/أستدير/استدير. Optional prep على/ع/إلى/لـ in front of direction.", + "regex": "(?:أستدير|استدير|استدر|لف+|ألف+|بلف|دور+|أدور+|بدور|خش|أخش|بخش|حوّد|أحوّد)\\s+(?:(?:على|ع|إلى)\\s+)?(?:ال)?(?:يمين|يميناً|يمينا|لليمين)", + "command_template": "turn right" + }, + { + "_": "Generic dialect Arabic 'turn left' WITHOUT number — also catches شمال/الشمال.", + "regex": "(?:أستدير|استدير|استدر|لف+|ألف+|بلف|دور+|أدور+|بدور|خش|أخش|بخش|حوّد|أحوّد)\\s+(?:(?:على|ع|إلى)\\s+)?(?:ال)?(?:يسار|يساراً|يسارا|لليسار|شمال|الشمال|للشمال)", + "command_template": "turn left" + }, + { + "_": "Generic dialect Arabic 'turn around' WITHOUT number — back/wara variants.", + "regex": "(?:أستدير|استدير|استدر|لف+|ألف+|بلف|دور+|أدور+|بدور)\\s+(?:(?:إلى|لـ)\\s*)?(?:للخلف|الخلف|للوراء|الوراء|للورا|ورا|لورا)", + "command_template": "turn around" + }, + { + "_": "Generic dialect Arabic 'move forward' WITHOUT meters/steps — أمشي/أتحرك/أتقدم/أروح/بمشي/بتقدم + قدام/للأمام/الأمام variants.", + "regex": "(?:أمشي|امشي|امش|أتحرك|تحرك|أتقدم|بتقدم|أتقدّم|أروح|اروح|روح|براح|بمشي|بفوت|أفوت)\\s+(?:(?:إلى|لـ)\\s*)?(?:للأمام|الأمام|لقدام|قدام|للقدام)", + "command_template": "move forward" + }, + { + "_": "Generic dialect Arabic 'move backward' WITHOUT meters/steps — أرجع/أتراجع/برجع + للخلف/ورا/الوراء variants.", + "regex": "(?:أرجع|ارجع|برجع|أتراجع|بتراجع|أتحرك|تحرك|أمشي|امشي|امش)\\s+(?:(?:إلى|لـ)\\s*)?(?:للخلف|الخلف|لورا|ورا|للوراء|الوراء)", + "command_template": "move backward" + }, + + { + "_": "STEP-CLOSER (English). Note: normalise_numbers runs BEFORE this regex matches and converts 'one'→'1'. 
The regex must accept both forms ('one step closer' OR '1 step closer'). Maps to a deterministic 1-step walk; distinct from 'walking to X' (planner) and 'come here' (YOLO-tracked).", + "regex": "(?:stepping|moving|coming|edging)\\s+closer|(?:one|1)\\s+step\\s+closer|step(?:ping)?\\s+(?:a\\s+(?:bit|little)\\s+)?forward", + "command_template": "walk forward 1 steps" + }, + { + "_": "STEP-CLOSER (Arabic). Note: normalise_numbers converts 'واحدة'→'1' before this regex runs, so the optional 'one' part must match either واحدة or 1. Same single-step semantics as English.", + "regex": "أتقدم\\s+خطوة|أقترب\\s+خطوة|خطوة\\s+(?:(?:واحدة|1)\\s+)?(?:للأمام|قدام)|أقرب\\s+خطوة|أتقرّب\\s+خطوة|أقدّم\\s+خطوة|أخطو\\s+للأمام", + "command_template": "walk forward 1 steps" + }, + + { + "_": "WALK-TO-TARGET (English) — captures the named object after 'walking/heading/going to/towards'. Example bot phrases the dispatcher catches: 'Walking to the door.' / 'Heading to the chair.' / 'Walking towards the table.' / 'Going up to the wall.'. The captured target ($1) is plugged into the brain command 'walk to ', which Brain/command_parser._RE_WALK_TO routes into the LLaVA-grounded spatial planner. Constraints: (a) stops at sentence punctuation so 'walking to the door, then sitting' picks up just 'door', (b) the determiner-strip group includes 'that\\s+|this\\s+' so anaphora 'walking to that chair' captures 'chair' not 'that chair', (c) negative-lookahead before the target rejects pronouns (us|me|him|her|them|you|it|myself|yourself) so a Rule-4-violating description like 'I see a person walking towards us' does NOT fire 'walk to us'. Persona Rule 12 instructs Gemini to use these literal shapes. NOTE: target capture allows Arabic chars (U+0600–U+06FF) too — handles the field case where canonical_normalizer translated an Arabic verb+connective to English ('أتجه نحو X' → 'walking to X' with X still Arabic) and we want the planner to receive the Arabic target name. 
Qwen2.5-VL is multilingual; passing Arabic target to LLaVA prompt works.", + "regex": "(?:approaching|(?:walking|heading|going|moving)\\s+(?:over\\s+)?(?:to|towards?|up\\s+to)\\b)\\s+(?:the\\s+|a\\s+|an\\s+|that\\s+|this\\s+|ال)?(?!(?:us|me|him|her|them|you|it|myself|yourself|themselves|himself|herself|itself|هو|هي|هم|هما|هن|نحن|أنت|أنا)\\b)([A-Za-z\\u0600-\\u06FF][A-Za-z0-9\\u0600-\\u06FF ._-]{0,40}?)\\s*(?=[.!?،,]|$)", + "command_template": "walk to $1" + }, + { + "_": "WALK-TO-TARGET (Arabic) — أمشي / أتجه / أتحرك + (إلى | نحو | تجاه | باتجاه) + . Character class is the WHOLE Arabic block U+0600–U+06FF so vowel diacritics (fatha/damma/kasra/sukun, U+064B–U+065F) inside the target name don't truncate the capture (e.g. 'البَاب' with fatha was previously stopping at the fatha and capturing None). Determiner-strip includes هذا/هذه/ذلك/تلك (anaphora) plus the optional ال prefix. Pronoun negative-lookahead rejects هو/هي/هم/أنت/نحن/نا/ي etc as targets — handled by lookbehind avoiding common pronoun stems.", + "regex": "(?:أمشي|امشي|أتحرك|تحرك|أتجه|متجه|أروح|اروح|روح)\\s+(?:إلى|نحو|تجاه|باتجاه|لـ?ال?)\\s+(?:هذا|هذه|ذلك|تلك)?\\s*(?:ال)?(?!(?:هو|هي|هم|هما|هن|نحن|أنت|أنتم|أنا)\\b)([\\u0600-\\u06FF][\\u0600-\\u06FF 0-9_-]{0,40}?)\\s*(?=[.!?،,]|$)", + "command_template": "walk to $1" } ] } diff --git a/Config/language_tables.json b/Config/language_tables.json new file mode 100644 index 0000000..75e2574 --- /dev/null +++ b/Config/language_tables.json @@ -0,0 +1,222 @@ +{ + "_description": "Language tables for voice motion processing. Single source of truth for ALL vocabulary data — no Arabic/English motion words live in code anymore. 
Adding a new dialect / variant is a JSON-only edit.", + "_consumers": [ + "Voice/number_words.py — spelled-out numbers → digits (EN + AR)", + "Voice/canonical_normalizer.py — Arabic structural translation → English", + "Voice/marcus_voice.py — reverse-command map for memory operations", + "Voice/sequences.py — control commands that must NOT be captured into recordings" + ], + + "english_fractions": { + "_description": "English fractional words. Two semantic flavours: 'additive' (combines with a preceding integer — '3 and a half steps' → 3.5) and 'leading' (standalone — 'half a meter' → 0.5). Both forms map to the same numeric value; flavours are kept separate so the parser knows the syntactic context to apply.", + "additive": { + "half": 0.5, + "halves": 0.5, + "quarter": 0.25, + "quarters": 0.25, + "third": 0.3333, + "thirds": 0.3333 + }, + "leading": { + "half": 0.5, + "quarter": 0.25, + "third": 0.3333 + } + }, + + "arabic_fractions": { + "_description": "Arabic fractional words. Same additive vs leading split as English. 'Additive' applies after a digit + ون / و conjunction ('3 ونصف' → 3.5). 'Leading' applies before a unit ('نصف متر' → 0.5 meter). Multiple dialect spellings of half (نصف / نص).", + "additive": { + "نصف": 0.5, + "نص": 0.5, + "ربع": 0.25, + "ثلث": 0.3333 + }, + "leading": { + "نصف": 0.5, + "نص": 0.5, + "ربع": 0.25, + "ثلث": 0.3333 + } + }, + + "english_numbers": { + "_description": "English number-word parser tables. 
Used by number_words.py to convert 'ninety degrees' → '90 degrees' before regex matching.", + "ones": { + "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, + "five": 5, "six": 6, "seven": 7, "eight": 8, "nine": 9, + "ten": 10, "eleven": 11, "twelve": 12, "thirteen": 13, + "fourteen": 14, "fifteen": 15, "sixteen": 16, + "seventeen": 17, "eighteen": 18, "nineteen": 19 + }, + "tens": { + "twenty": 20, "thirty": 30, "forty": 40, "fifty": 50, + "sixty": 60, "seventy": 70, "eighty": 80, "ninety": 90 + }, + "scale": { + "hundred": 100 + }, + "glue": ["and", "-"] + }, + + "arabic_numbers": { + "_description": "Arabic spelled-out numbers. Order matters — longer multi-word phrases come first so they're matched before their shorter prefixes.", + "literals": [ + ["ثلاثمائة وستون", 360], + ["ثلاثمائة وستين", 360], + ["ثلاث مائة وستون", 360], + ["ثلاث مائة وستين", 360], + ["مائة وثمانون", 180], + ["مائة وثمانين", 180], + ["مية وثمانين", 180], + ["مائتان وسبعون", 270], + ["مائتان وسبعين", 270], + ["عشرون", 20], ["عشرين", 20], + ["ثلاثون", 30], ["ثلاثين", 30], + ["أربعون", 40], ["أربعين", 40], ["اربعون", 40], ["اربعين", 40], + ["خمسون", 50], ["خمسين", 50], + ["ستون", 60], ["ستين", 60], + ["سبعون", 70], ["سبعين", 70], + ["ثمانون", 80], ["ثمانين", 80], + ["تسعون", 90], ["تسعين", 90], + ["مائتان", 200], ["مائتين", 200], ["ميتين", 200], + ["ثلاثمائة", 300], ["ثلاث مائة", 300], + ["أربعمائة", 400], ["اربعمائة", 400], ["أربع مائة", 400], + ["خمسمائة", 500], ["خمس مائة", 500], + ["مائة", 100], ["مية", 100], ["ميه", 100], + ["أحد عشر", 11], ["احد عشر", 11], + ["اثنا عشر", 12], ["اثني عشر", 12], + ["ثلاثة عشر", 13], ["ثلاث عشرة", 13], + ["أربعة عشر", 14], ["اربعة عشر", 14], + ["خمسة عشر", 15], ["خمس عشرة", 15], + ["ستة عشر", 16], ["ست عشرة", 16], + ["سبعة عشر", 17], + ["ثمانية عشر", 18], + ["تسعة عشر", 19], + ["واحد", 1], ["واحدة", 1], + ["اثنان", 2], ["اثنين", 2], ["اثنتان", 2], ["اثنتين", 2], + ["ثلاثة", 3], ["ثلاث", 3], + ["أربعة", 4], ["أربع", 4], ["اربعة", 4], ["اربع", 4], + 
["خمسة", 5], ["خمس", 5], + ["ستة", 6], ["ست", 6], + ["سبعة", 7], ["سبع", 7], + ["ثمانية", 8], ["ثمان", 8], + ["تسعة", 9], ["تسع", 9], + ["عشرة", 10], ["عشر", 10] + ] + }, + + "arabic_verbs": { + "_description": "Arabic verb roots → English gerund. Gerund matches Gemini's English bot_phrases (e.g. 'walking forward'). Multiple Arabic dialect roots can map to the same English verb.", + "walking": [ + "أمشي", "امشي", "امش", + "أتحرك", "تحرك", + "اروح", "روح", "اروحي", "روحي", + "بمشي", "براح", "أراح", + "بفوت", "أفوت", + "سير", "اسير", + "أتجه", "اتجه", "يتجه", "نتجه", + "أتوجه", "اتوجه", "يتوجه", "توجه", + "متوجه", "متجه" + ], + "turning": [ + "أستدير", "استدر", "استدير", + "لفّ", "لف", "ألف", "بلف", + "دوّر", "دور", "أدور", "بدور", + "خش", "أخش", "بخش", + "حوّد", "أحوّد" + ], + "walking backward": [ + "أرجع", "ارجع", "برجع", + "أتراجع", "بتراجع", "تراجع" + ], + "walking forward": [ + "أتقدم", "بتقدم", "أتقدّم", "تقدم" + ] + }, + + "arabic_directions": { + "_description": "Arabic direction words → English directions. Includes definite-article variants (اليمين), preposition-prefixed forms (لليمين), and dialect alternatives (شمال = left in Levantine/Egyptian).", + "right": [ + "يميناً", "يمينا", + "اليمين", "لليمين", + "يمين" + ], + "left": [ + "يساراً", "يسارا", + "اليسار", "لليسار", + "يسار", + "الشمال", "للشمال", "شمال" + ], + "forward": [ + "للأمام", "الأمام", "أمام", + "لقدام", "للقدام", "قدام" + ], + "backward": [ + "للخلف", "الخلف", + "للوراء", "الوراء", + "لورا", "ورا", "للورا" + ] + }, + + "arabic_units": { + "_description": "Arabic unit nouns → English unit. Singular-vs-plural form is preserved so the dispatcher's regex layer (which accepts both) sees a natural form. 
Dual forms (خطوتين / مترين / درجتين) live in arabic_duals.", + "step": ["خطوة"], + "steps": ["خطوات"], + "meter": ["متر"], + "meters": ["أمتار"], + "degree": ["درجة"], + "degrees": ["درجات"] + }, + + "arabic_duals": { + "_description": "Arabic dual forms — single words containing both count and unit ('خطوتين' = 'two steps'). Substituted as a single unit BEFORE the verb/dir/unit pass. The English target form must be readable as 'N units' so the regex layer can pick up the count.", + "2 steps": ["خطوتين"], + "2 meters": ["مترين", "أمتارين"], + "2 degrees": ["درجتين"], + "2 seconds": ["ثانيتين"] + }, + + "arabic_conjunctions": { + "_description": "Arabic conjunctions translated to space-padded English glue. Subbed early so subsequent verb/dir passes don't accidentally treat the conjunction as a word.", + " then ": ["ثم", "وبعدين", "بعدين", "وبعد"] + }, + + "arabic_connectives": { + "_description": "Arabic prepositions / determiners / pronouns. Mostly pass-through; 'إلى' / 'نحو' map to 'to' so walk-to-target patterns work. 'ع' / 'على' get folded into nothing meaningful but are translated to 'on' to avoid leaving raw Arabic mid-string.", + "to": ["إلى", "نحو", "تجاه", "باتجاه"], + "on": ["على", "ع"], + "in": ["في"], + "this": ["هذا", "هذه"], + "that": ["ذلك", "تلك"] + }, + + "motion_inverses": { + "_description": "Pairwise inverse map for reverse_last memory operation. Used by Voice/marcus_voice.py::_reverse_command. Parametric forms (e.g. 'turn left 90 degrees' ↔ 'turn right 90 degrees') are derived in code via regex, NOT listed here — only fixed pairs.", + "turn right": "turn left", + "turn left": "turn right", + "move forward": "move backward", + "move backward": "move forward", + "sit down": "stand up", + "stand up": "sit down", + "raise arm": "lower arm", + "lower arm": "raise arm" + }, + + "sequence_never_record": { + "_description": "Canonical commands that must NEVER be captured into a recording session. 
Control commands (start/save/cancel/play recording, pause/resume, stop, repeat/reverse) would create absurd macros if captured. Used by Voice/sequences.py::record_command.", + "canonicals": [ + "start recording", + "save sequence", + "cancel recording", + "play sequence", + "list sequences", + "delete sequence", + "pause motion", + "resume motion", + "stop", + "repeat last", + "reverse last" + ] + } +} diff --git a/Core/motion_log.py b/Core/motion_log.py new file mode 100644 index 0000000..888e8ed --- /dev/null +++ b/Core/motion_log.py @@ -0,0 +1,218 @@ +""" +motion_log.py — structured per-motion event log. + +Drop-in: brain handlers and worker call log_motion(event, cmd, ...) +and the helper auto-parses direction/magnitude/unit from the canonical +string. The worker barely needs to know what it's logging. + +Every motion the brain or worker executes writes ONE line to +`logs/motion.log` capturing: + + timestamp | canonical | direction mag unit | target | actual | outcome | source + +So a single voice command produces a paired START + END line: + + 17:45:01 | turn left 19 degrees | dir=left mag=19 unit=deg | target=2.4s | - | start | voice + 17:45:03 | turn left 19 degrees | dir=left mag=19 unit=deg | target=2.4s | actual=2.41s | complete | voice + +Or for a parameterless motion: + + 17:46:00 | turn right | dir=right mag=- unit=- | target=2.0s | - | start | voice + 17:46:02 | turn right | dir=right mag=- unit=- | target=2.0s | actual=2.02s | complete | voice + +This is independent of voice.log and transcript.log: + - voice.log = per-thread debug detail (dispatch, worker, watchdog, + reconnect) + - transcript.log = user-facing dialogue (what user said vs. what + Gemini said vs. 
what canonical fired) + - motion.log = ONE LINE PER MOTION with parsed params + actual + outcome — for post-hoc auditing of physical movements + +API: + log_motion(event, canonical, direction=..., magnitude=..., unit=..., + target_sec=..., actual_sec=..., source="voice"|"text", + outcome="start"|"complete"|"interrupted"|"error", **extra) + +Events (event field): + 'start' — brain handler is about to issue motor commands. + target_sec is the planned duration; actual is None. + 'complete' — brain returned normally; actual_sec set. + 'interrupted' — motion_abort fired during execution. + 'error' — brain raised an exception; reason in extra. +""" +from __future__ import annotations + +import logging +import os +import re +from logging.handlers import RotatingFileHandler + + +# ─── canonical parser ───────────────────────────────────────────── +# +# Map a canonical command string to (direction, magnitude, unit) so +# the structured log auto-fills these fields when the caller doesn't +# pass them explicitly. + +_RE_TURN_DEG_DIR = re.compile(r"^turn\s+(left|right)\s+(\d+(?:\.\d+)?)\s*deg(?:rees?)?$", re.I) +_RE_TURN_DEG = re.compile(r"^turn\s+(\d+(?:\.\d+)?)\s*deg(?:rees?)?$", re.I) +_RE_TURN_STEP_DIR = re.compile(r"^turn\s+(left|right)\s+(\d+(?:\.\d+)?)\s*steps?$", re.I) +_RE_TURN_DIR = re.compile(r"^turn\s+(left|right|around)$", re.I) +_RE_WALK_STEP = re.compile(r"^walk\s+(?:(forward|back(?:ward)?)\s+)?(\d+(?:\.\d+)?)\s*steps?$", re.I) +_RE_WALK_M = re.compile(r"^walk\s+(?:(forward|back(?:ward)?)\s+)?(\d+(?:\.\d+)?)\s*m(?:eter(?:s)?)?$", re.I) +_RE_MOVE_DIR = re.compile(r"^move\s+(forward|back(?:ward)?|left|right)$", re.I) +_RE_WALK_TO = re.compile(r"^walk\s+to\s+(.+)$", re.I) + + +def parse_canonical(cmd: str) -> dict: + """Parse a canonical command into structured motion fields. + Returns {direction, magnitude, unit, kind}. 
Best-effort — if the + canonical doesn't match any known shape, all fields are '-' and + kind='other'.""" + out = {"direction": "-", "magnitude": "-", "unit": "-", "kind": "other"} + if not cmd: + return out + s = cmd.strip().lower() + + m = _RE_TURN_DEG_DIR.match(s) + if m: + return {"direction": m.group(1), "magnitude": m.group(2), + "unit": "deg", "kind": "turn"} + m = _RE_TURN_DEG.match(s) + if m: + return {"direction": "-", "magnitude": m.group(1), + "unit": "deg", "kind": "turn"} + m = _RE_TURN_STEP_DIR.match(s) + if m: + return {"direction": m.group(1), "magnitude": m.group(2), + "unit": "step", "kind": "turn"} + m = _RE_TURN_DIR.match(s) + if m: + return {"direction": m.group(1), "magnitude": "-", + "unit": "-", "kind": "turn"} + m = _RE_WALK_STEP.match(s) + if m: + d = (m.group(1) or "forward").lower() + if d.startswith("back"): d = "backward" + return {"direction": d, "magnitude": m.group(2), + "unit": "step", "kind": "walk"} + m = _RE_WALK_M.match(s) + if m: + d = (m.group(1) or "forward").lower() + if d.startswith("back"): d = "backward" + return {"direction": d, "magnitude": m.group(2), + "unit": "m", "kind": "walk"} + m = _RE_MOVE_DIR.match(s) + if m: + d = m.group(1).lower() + if d.startswith("back"): d = "backward" + return {"direction": d, "magnitude": "-", + "unit": "-", "kind": "move"} + m = _RE_WALK_TO.match(s) + if m: + return {"direction": "tracked", "magnitude": m.group(1).strip(), + "unit": "target", "kind": "walk_to"} + + # Bare canonicals + if s == "stop": return {"direction": "-", "magnitude": "-", "unit": "-", "kind": "stop"} + if s == "sit down": return {"direction": "-", "magnitude": "-", "unit": "-", "kind": "posture"} + if s == "stand up": return {"direction": "-", "magnitude": "-", "unit": "-", "kind": "posture"} + if s == "wave hello":return {"direction": "-", "magnitude": "-", "unit": "-", "kind": "arm"} + if s == "raise arm": return {"direction": "right", "magnitude": "-", "unit": "-", "kind": "arm"} + if s == "lower arm": return 
{"direction": "right", "magnitude": "-", "unit": "-", "kind": "arm"} + if s == "come here": return {"direction": "tracked", "magnitude": "-", "unit": "yolo", "kind": "come"} + if s == "follow me": return {"direction": "tracked", "magnitude": "-", "unit": "yolo", "kind": "follow"} + if s == "look around":return {"direction": "scan", "magnitude": "-", "unit": "-", "kind": "look"} + if s == "stay here": return {"direction": "-", "magnitude": "-", "unit": "-", "kind": "stay"} + if s == "go home": return {"direction": "home", "magnitude": "-", "unit": "-", "kind": "nav"} + if s == "patrol": return {"direction": "scan", "magnitude": "-", "unit": "-", "kind": "patrol"} + if s == "pause motion": return {"direction": "-", "magnitude": "-", "unit": "-", "kind": "pause"} + if s == "resume motion": return {"direction": "-", "magnitude": "-", "unit": "-", "kind": "resume"} + return out + +try: + from Core.env_loader import PROJECT_ROOT # type: ignore +except Exception: + PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + +_LOG_DIR = os.path.join(PROJECT_ROOT, "logs") +os.makedirs(_LOG_DIR, exist_ok=True) +_LOG_PATH = os.path.join(_LOG_DIR, "motion.log") + +_logger = logging.getLogger("marcus.motion") +_logger.setLevel(logging.INFO) +_logger.propagate = False +if not _logger.handlers: + _h = RotatingFileHandler( + _LOG_PATH, maxBytes=2_000_000, backupCount=3, encoding="utf-8", + ) + # Use pipe-separator format so the file is grep/awk friendly. + _h.setFormatter(logging.Formatter("%(asctime)s | %(message)s")) + _logger.addHandler(_h) + + +def log_motion(event: str, + canonical: str, + direction=None, + magnitude=None, + unit=None, + target_sec=None, + actual_sec=None, + source: str = "voice", + **extra) -> None: + """Emit one structured line describing a motion event. + + direction/magnitude/unit auto-fill from parse_canonical(canonical) + if not passed — so the worker just calls log_motion('start', cmd) + and the parser handles the rest. 
Override any field by passing it + explicitly (useful when the brain knows odom-measured actuals). + Fields render with fixed width for grep/awk friendliness. Extra + kwargs become trailing 'k=v' tokens. + """ + def _f(v): + if v is None: return "-" + if isinstance(v, float): return "{:.2f}".format(v) + return str(v) + + canonical = (canonical or "").strip() or "-" + parsed = parse_canonical(canonical) if canonical != "-" else {} + direction = direction if direction is not None else parsed.get("direction", "-") + magnitude = magnitude if magnitude is not None else parsed.get("magnitude", "-") + unit = unit if unit is not None else parsed.get("unit", "-") + + parts = [ + "{:35s}".format(canonical[:35]), + "dir={:8s}".format(_f(direction)[:8]), + "mag={:8s}".format(_f(magnitude)[:8]), + "unit={:6s}".format(_f(unit)[:6]), + "target={:>7s}".format(_f(target_sec) + "s" if target_sec is not None else "-"), + "actual={:>7s}".format(_f(actual_sec) + "s" if actual_sec is not None else "-"), + "{:11s}".format(event[:11]), + "{:5s}".format(source[:5]), + ] + if extra: + parts.append(" ".join("{}={}".format(k, _f(v)) for k, v in extra.items())) + _logger.info(" | ".join(parts)) + + +# Convenience tee so console + log show the same one-liner +def print_motion(event: str, canonical: str, **kw) -> None: + """Like log_motion but ALSO prints to stdout so the operator sees + the line in the terminal dashboard. 
Use for the high-traffic + 'start' and 'complete' events; use log_motion alone for the noisier + intermediate events.""" + log_motion(event, canonical, **kw) + direction = kw.get("direction", "-") + magnitude = kw.get("magnitude", "-") + unit = kw.get("unit", "-") + target = kw.get("target_sec") + actual = kw.get("actual_sec") + src = kw.get("source", "voice") + line = " [Motion {:9s}] {:30s} dir={} mag={} unit={}".format( + event, canonical[:30], direction, magnitude, unit, + ) + if target is not None: + line += " target={:.2f}s".format(float(target)) + if actual is not None: + line += " actual={:.2f}s".format(float(actual)) + print(line) diff --git a/Core/motion_state.py b/Core/motion_state.py new file mode 100644 index 0000000..40ac4c9 --- /dev/null +++ b/Core/motion_state.py @@ -0,0 +1,46 @@ +""" +motion_state.py — shared motion-control signals. + +Three module-level threading.Events used by Voice/marcus_voice.py to +control in-progress motion. Producers are VoiceModule's motion worker; +consumers are brain motion loops in Brain/command_parser.py, +Brain/executor.py, Navigation/marcus_odometry.py, Navigation/patrol.py. + + motion_abort — hard stop. Loops break out and run gradual_stop(). + Set by 'stop'/'cancel'/etc. Cleared by the worker + at the start of each new non-stop command, so a + stale set never kills the next motion. + + motion_pause — soft hold. Loops do NOT break; they hold zero + velocity and wait for the event to clear, then + resume the same loop iteration where they paused. + While paused they ALSO check motion_abort each + ~50ms — a pause-then-stop combo cleanly aborts. + +The helper `wait_while_paused` centralizes the polling loop so brain +sites don't repeat the same stanza: + + if motion_abort.is_set(): break + wait_while_paused() + if motion_abort.is_set(): break + send_vel(...) + time.sleep(0.05) + +Single source of truth — placed in Core/ so both Voice/* and Brain/* +can import without circular deps. 
+""" +from __future__ import annotations + +import threading +import time + +motion_abort: threading.Event = threading.Event() +motion_pause: threading.Event = threading.Event() + + +def wait_while_paused(poll_sec: float = 0.05) -> None: + """Block while motion_pause is set. Returns when pause clears OR + motion_abort fires (so the caller can immediately bail out via + its existing motion_abort check).""" + while motion_pause.is_set() and not motion_abort.is_set(): + time.sleep(poll_sec) diff --git a/Data/Brain/Sessions/session_003_2026-04-27/alerts.json b/Data/Brain/Sessions/session_003_2026-04-27/alerts.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_003_2026-04-27/alerts.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_003_2026-04-27/commands.json b/Data/Brain/Sessions/session_003_2026-04-27/commands.json new file mode 100644 index 0000000..90c47e3 --- /dev/null +++ b/Data/Brain/Sessions/session_003_2026-04-27/commands.json @@ -0,0 +1,434 @@ +[ + { + "time": "14:39:02", + "cmd": "turn around", + "response": "Turning left", + "duration_s": 4.94 + }, + { + "time": "14:40:32", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:40:53", + "cmd": "move backward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:41:17", + "cmd": "walk backward 1 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:41:20", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:41:23", + "cmd": "walk forward 1 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:41:26", + "cmd": "walk 1 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:41:29", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:41:32", + "cmd": "walk forward 1 steps", + "response": 
"local command", + "duration_s": 0.0 + }, + { + "time": "14:41:35", + "cmd": "walk forward 1 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:41:39", + "cmd": "walk 1 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:41:42", + "cmd": "walk forward 1 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:42:22", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:42:47", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:43:08", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:43:24", + "cmd": "turn right", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:43:37", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:43:40", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:44:02", + "cmd": "move backward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:44:08", + "cmd": "turn right 90 degrees", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:44:15", + "cmd": "turn right 90 degrees", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:44:21", + "cmd": "turn right 90 degrees", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:44:47", + "cmd": "turn left 90 degrees", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:45:29", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:45:48", + "cmd": "turn left 90 degrees", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:46:25", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:46:28", + "cmd": "move forward", + "response": "local command", + 
"duration_s": 0.0 + }, + { + "time": "14:46:32", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:46:43", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:46:46", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:47:36", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:47:49", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:48:16", + "cmd": "turn around", + "response": "Turning left", + "duration_s": 17.53 + }, + { + "time": "14:48:52", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:49:39", + "cmd": "walk forward 1 meters", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:49:43", + "cmd": "walk forward 1 meters", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:49:47", + "cmd": "walk forward 1 meters", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:49:52", + "cmd": "walk forward 1 meters", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:50:07", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:51:17", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:51:41", + "cmd": "turn right", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:52:13", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:52:50", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:53:15", + "cmd": "turn right", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:54:05", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:56:07", + "cmd": "go back o", + 
"response": "Turning left", + "duration_s": 15.61 + }, + { + "time": "14:56:59", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:57:00", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:57:04", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:57:33", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:57:37", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:57:40", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:57:45", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:58:00", + "cmd": "move backward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:58:05", + "cmd": "move backward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:58:23", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:58:26", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:58:30", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "14:58:33", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "15:00:23", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "15:00:25", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "15:01:58", + "cmd": "follow me", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "15:02:03", + "cmd": "walk 2 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "15:02:07", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "15:02:18", + "cmd": "stop", + "response": "local command", + 
"duration_s": 0.0 + }, + { + "time": "15:02:25", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "15:02:28", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "15:02:53", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "15:04:40", + "cmd": "turn left 90 degrees", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "15:04:46", + "cmd": "turn left 90 degrees", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "15:05:04", + "cmd": "turn right 90 degrees", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "15:05:10", + "cmd": "turn right 90 degrees", + "response": "local command", + "duration_s": 0.0 + } +] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_003_2026-04-27/detections.json b/Data/Brain/Sessions/session_003_2026-04-27/detections.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_003_2026-04-27/detections.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_003_2026-04-27/places.json b/Data/Brain/Sessions/session_003_2026-04-27/places.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/Data/Brain/Sessions/session_003_2026-04-27/places.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/Data/Brain/Sessions/session_004_2026-04-27/alerts.json b/Data/Brain/Sessions/session_004_2026-04-27/alerts.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_004_2026-04-27/alerts.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_004_2026-04-27/commands.json b/Data/Brain/Sessions/session_004_2026-04-27/commands.json new file mode 100644 index 0000000..85ac56d --- /dev/null +++ b/Data/Brain/Sessions/session_004_2026-04-27/commands.json @@ -0,0 +1,74 @@ +[ + { + "time": "16:40:07", + 
"cmd": "walk forward 1 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "16:40:22", + "cmd": "turn left 19 degrees", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "16:40:37", + "cmd": "walk backward 1 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "16:40:55", + "cmd": "walk forward 4 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "16:41:05", + "cmd": "walk backward 4 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "16:41:47", + "cmd": "come here", + "response": "Coming to you", + "duration_s": 0.0 + }, + { + "time": "16:42:02", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "16:42:18", + "cmd": "come here", + "response": "Coming to you", + "duration_s": 0.0 + }, + { + "time": "16:42:44", + "cmd": "walk 1 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "16:43:05", + "cmd": "walk backward 3 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "16:43:06", + "cmd": "turn right 0 degrees", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "16:43:49", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + } +] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_004_2026-04-27/detections.json b/Data/Brain/Sessions/session_004_2026-04-27/detections.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_004_2026-04-27/detections.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_004_2026-04-27/places.json b/Data/Brain/Sessions/session_004_2026-04-27/places.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/Data/Brain/Sessions/session_004_2026-04-27/places.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/Data/Brain/Sessions/session_005_2026-04-28/alerts.json 
b/Data/Brain/Sessions/session_005_2026-04-28/alerts.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_005_2026-04-28/alerts.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_005_2026-04-28/commands.json b/Data/Brain/Sessions/session_005_2026-04-28/commands.json new file mode 100644 index 0000000..6645381 --- /dev/null +++ b/Data/Brain/Sessions/session_005_2026-04-28/commands.json @@ -0,0 +1,8 @@ +[ + { + "time": "09:02:24", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + } +] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_005_2026-04-28/detections.json b/Data/Brain/Sessions/session_005_2026-04-28/detections.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_005_2026-04-28/detections.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_005_2026-04-28/places.json b/Data/Brain/Sessions/session_005_2026-04-28/places.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/Data/Brain/Sessions/session_005_2026-04-28/places.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/Data/Brain/Sessions/session_006_2026-04-28/alerts.json b/Data/Brain/Sessions/session_006_2026-04-28/alerts.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_006_2026-04-28/alerts.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_006_2026-04-28/commands.json b/Data/Brain/Sessions/session_006_2026-04-28/commands.json new file mode 100644 index 0000000..186d79b --- /dev/null +++ b/Data/Brain/Sessions/session_006_2026-04-28/commands.json @@ -0,0 +1,56 @@ +[ + { + "time": "09:14:00", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:14:04", + "cmd": "walk forward 1 steps", + "response": "local command", + "duration_s": 0.0 + }, + { 
+ "time": "09:15:07", + "cmd": "move backward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:16:07", + "cmd": "turn left 90 degrees", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:17:20", + "cmd": "walk forward 1 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:17:40", + "cmd": "walk back 1 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:18:27", + "cmd": "come here", + "response": "Coming to you", + "duration_s": 0.0 + }, + { + "time": "09:18:40", + "cmd": "walk forward 1 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:19:55", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + } +] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_006_2026-04-28/detections.json b/Data/Brain/Sessions/session_006_2026-04-28/detections.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_006_2026-04-28/detections.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_006_2026-04-28/places.json b/Data/Brain/Sessions/session_006_2026-04-28/places.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/Data/Brain/Sessions/session_006_2026-04-28/places.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/Data/Brain/Sessions/session_007_2026-04-28/alerts.json b/Data/Brain/Sessions/session_007_2026-04-28/alerts.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_007_2026-04-28/alerts.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_007_2026-04-28/commands.json b/Data/Brain/Sessions/session_007_2026-04-28/commands.json new file mode 100644 index 0000000..84c4372 --- /dev/null +++ b/Data/Brain/Sessions/session_007_2026-04-28/commands.json @@ -0,0 +1,26 @@ +[ + { + "time": "09:40:17", + "cmd": "turn 
left", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:40:20", + "cmd": "walk forward 1 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:40:37", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:40:40", + "cmd": "walk 1 steps", + "response": "local command", + "duration_s": 0.0 + } +] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_007_2026-04-28/detections.json b/Data/Brain/Sessions/session_007_2026-04-28/detections.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_007_2026-04-28/detections.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_007_2026-04-28/places.json b/Data/Brain/Sessions/session_007_2026-04-28/places.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/Data/Brain/Sessions/session_007_2026-04-28/places.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/Data/Brain/Sessions/session_008_2026-04-28/alerts.json b/Data/Brain/Sessions/session_008_2026-04-28/alerts.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_008_2026-04-28/alerts.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_008_2026-04-28/commands.json b/Data/Brain/Sessions/session_008_2026-04-28/commands.json new file mode 100644 index 0000000..7c2b439 --- /dev/null +++ b/Data/Brain/Sessions/session_008_2026-04-28/commands.json @@ -0,0 +1,20 @@ +[ + { + "time": "09:42:11", + "cmd": "turn around", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:42:20", + "cmd": "look around", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:42:34", + "cmd": "turn around", + "response": "local command", + "duration_s": 0.0 + } +] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_008_2026-04-28/detections.json 
b/Data/Brain/Sessions/session_008_2026-04-28/detections.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_008_2026-04-28/detections.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_008_2026-04-28/places.json b/Data/Brain/Sessions/session_008_2026-04-28/places.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/Data/Brain/Sessions/session_008_2026-04-28/places.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/Data/Brain/Sessions/session_009_2026-04-28/alerts.json b/Data/Brain/Sessions/session_009_2026-04-28/alerts.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_009_2026-04-28/alerts.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_009_2026-04-28/commands.json b/Data/Brain/Sessions/session_009_2026-04-28/commands.json new file mode 100644 index 0000000..cc9bb35 --- /dev/null +++ b/Data/Brain/Sessions/session_009_2026-04-28/commands.json @@ -0,0 +1,26 @@ +[ + { + "time": "09:49:11", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:49:39", + "cmd": "turn right", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:50:07", + "cmd": "turn right", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:50:28", + "cmd": "turn right 100 degrees", + "response": "local command", + "duration_s": 0.0 + } +] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_009_2026-04-28/detections.json b/Data/Brain/Sessions/session_009_2026-04-28/detections.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_009_2026-04-28/detections.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_009_2026-04-28/places.json b/Data/Brain/Sessions/session_009_2026-04-28/places.json new file mode 100644 index 
0000000..9e26dfe --- /dev/null +++ b/Data/Brain/Sessions/session_009_2026-04-28/places.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/Data/Brain/Sessions/session_010_2026-04-28/alerts.json b/Data/Brain/Sessions/session_010_2026-04-28/alerts.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_010_2026-04-28/alerts.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_010_2026-04-28/commands.json b/Data/Brain/Sessions/session_010_2026-04-28/commands.json new file mode 100644 index 0000000..609ae01 --- /dev/null +++ b/Data/Brain/Sessions/session_010_2026-04-28/commands.json @@ -0,0 +1,8 @@ +[ + { + "time": "11:24:20", + "cmd": "turn right", + "response": "local command", + "duration_s": 0.0 + } +] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_010_2026-04-28/detections.json b/Data/Brain/Sessions/session_010_2026-04-28/detections.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_010_2026-04-28/detections.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_010_2026-04-28/places.json b/Data/Brain/Sessions/session_010_2026-04-28/places.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/Data/Brain/Sessions/session_010_2026-04-28/places.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/Data/Brain/Sessions/session_011_2026-04-28/alerts.json b/Data/Brain/Sessions/session_011_2026-04-28/alerts.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_011_2026-04-28/alerts.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_011_2026-04-28/commands.json b/Data/Brain/Sessions/session_011_2026-04-28/commands.json new file mode 100644 index 0000000..6926616 --- /dev/null +++ b/Data/Brain/Sessions/session_011_2026-04-28/commands.json @@ -0,0 +1,248 @@ +[ + { + "time": 
"11:31:56", + "cmd": "walk 1 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:32:29", + "cmd": "move backward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:33:19", + "cmd": "walk forward 1 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:33:31", + "cmd": "move backward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:34:22", + "cmd": "come here", + "response": "Coming to you", + "duration_s": 0.0 + }, + { + "time": "11:34:57", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:35:02", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:35:28", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:35:37", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:36:13", + "cmd": "move backward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:36:28", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:36:47", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:36:50", + "cmd": "turn right", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:37:09", + "cmd": "move backward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:37:12", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:37:15", + "cmd": "walk forward 1 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:37:26", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:37:37", + "cmd": "move backward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:37:52", + "cmd": "come here", + "response": "Coming to you", + "duration_s": 
0.0 + }, + { + "time": "11:37:56", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:39:06", + "cmd": "come here", + "response": "Coming to you", + "duration_s": 0.0 + }, + { + "time": "11:39:33", + "cmd": "move backward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:40:00", + "cmd": "come here", + "response": "Coming to you", + "duration_s": 0.0 + }, + { + "time": "11:40:53", + "cmd": "stay here", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:41:15", + "cmd": "come here", + "response": "Coming to you", + "duration_s": 0.0 + }, + { + "time": "11:42:27", + "cmd": "walk 2 meters", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:42:50", + "cmd": "turn 90 degrees", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:43:22", + "cmd": "turn 360 degrees", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:43:42", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:44:15", + "cmd": "come here", + "response": "Coming to you", + "duration_s": 0.0 + }, + { + "time": "11:44:57", + "cmd": "walk forward 10 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:44:58", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:45:18", + "cmd": "walk backward 5 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:45:30", + "cmd": "turn right", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:45:56", + "cmd": "walk back 3 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:45:59", + "cmd": "turn right", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:46:19", + "cmd": "walk to person", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:47:19", + "cmd": "move backward", + "response": "local 
command", + "duration_s": 0.0 + }, + { + "time": "11:48:05", + "cmd": "walk forward 5 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:48:08", + "cmd": "turn right 1 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "11:48:13", + "cmd": "turn left 2 steps", + "response": "local command", + "duration_s": 0.0 + } +] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_011_2026-04-28/detections.json b/Data/Brain/Sessions/session_011_2026-04-28/detections.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_011_2026-04-28/detections.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_011_2026-04-28/places.json b/Data/Brain/Sessions/session_011_2026-04-28/places.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/Data/Brain/Sessions/session_011_2026-04-28/places.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/Data/Brain/Sessions/session_012_2026-04-28/alerts.json b/Data/Brain/Sessions/session_012_2026-04-28/alerts.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_012_2026-04-28/alerts.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_012_2026-04-28/commands.json b/Data/Brain/Sessions/session_012_2026-04-28/commands.json new file mode 100644 index 0000000..9d30a97 --- /dev/null +++ b/Data/Brain/Sessions/session_012_2026-04-28/commands.json @@ -0,0 +1,236 @@ +[ + { + "time": "12:54:29", + "cmd": "move backward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "12:54:46", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "12:55:28", + "cmd": "walk to حقيبة", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "12:56:36", + "cmd": "walk forward 1 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + 
"time": "12:56:39", + "cmd": "turn left 1 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "12:56:59", + "cmd": "walk backward 1 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "12:57:59", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "12:58:07", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "12:58:22", + "cmd": "come here", + "response": "Coming to you", + "duration_s": 0.0 + }, + { + "time": "12:59:14", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "12:59:17", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "12:59:30", + "cmd": "move backward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "12:59:58", + "cmd": "turn left 90 degrees", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:00:41", + "cmd": "come here", + "response": "Coming to you", + "duration_s": 0.0 + }, + { + "time": "13:01:52", + "cmd": "walk forward 5 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:02:09", + "cmd": "walk back 2 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:02:52", + "cmd": "walk 5 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:02:57", + "cmd": "walk forward 2 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:03:04", + "cmd": "turn left 3 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:03:07", + "cmd": "turn right 1 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:03:51", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:03:56", + "cmd": "walk forward 2 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:03:59", + 
"cmd": "move backward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:04:27", + "cmd": "walk backward 3 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:04:32", + "cmd": "walk forward 2 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:05:19", + "cmd": "walk backward 10 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:05:30", + "cmd": "walk forward 5 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:05:34", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:06:27", + "cmd": "turn right 180 degrees", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:06:30", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:06:33", + "cmd": "move backward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:07:15", + "cmd": "come here", + "response": "Coming to you", + "duration_s": 0.0 + }, + { + "time": "13:08:52", + "cmd": "turn right 90 degrees", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:08:55", + "cmd": "walk 1 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:08:58", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:10:28", + "cmd": "walk backward 20 steps", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:10:29", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:11:41", + "cmd": "stop", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "13:11:57", + "cmd": "come here", + "response": "Coming to you", + "duration_s": 0.0 + } +] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_012_2026-04-28/detections.json b/Data/Brain/Sessions/session_012_2026-04-28/detections.json new file mode 
100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_012_2026-04-28/detections.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_012_2026-04-28/places.json b/Data/Brain/Sessions/session_012_2026-04-28/places.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/Data/Brain/Sessions/session_012_2026-04-28/places.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/Data/Sequences/name-with-spaces.json b/Data/Sequences/name-with-spaces.json new file mode 100644 index 0000000..0a780b6 --- /dev/null +++ b/Data/Sequences/name-with-spaces.json @@ -0,0 +1,8 @@ +{ + "name": "name-with-spaces", + "created_at": 1777296029.0382416, + "saved_at": 1777296029.0382435, + "commands": [ + "turn right" + ] +} \ No newline at end of file diff --git a/Doc/architecture.md b/Doc/architecture.md index c556526..c956126 100644 --- a/Doc/architecture.md +++ b/Doc/architecture.md @@ -3,10 +3,27 @@ **Project**: Marcus | YS Lootah Technology **Hardware**: Unitree G1 EDU Humanoid (29 DOF) + Jetson Orin NX 16 GB **Robot persona**: **Sanad** (wake word + self-intro; project code still lives under `Marcus/`) -**Updated**: 2026-04-21 +**Updated**: 2026-04-28 --- +## Recent deltas (since 2026-04-25 — bilingual S2S + decimal/fraction motion + dispatcher hardening) + +- **Voice flipped from STT-only → full Gemini Live S2S** — Gemini now hears the mic AND replies through the G1 speaker (Sanad pattern). The Gemini WebSocket lives in a separate Python 3.10+ subprocess (`Voice/gemini_runner.py`, `gemini_sdk` env) that **owns the speaker** via `unitree_sdk2py`; Marcus's parent process (Python 3.8) forwards camera frames to it over stdin and reads JSON-line transcripts off its stdout via `Voice/gemini_script.py::GeminiBrain` (the manager). `TtsMaker` is no longer on the conversational path — it remains wired through `API/audio_api.py` for any non-Gemini brain announcement that ever needs to speak. 
+- **Wake-word gating moved from Marcus → Gemini persona** — Marcus does NOT check for "Sanad" / "سند" in Python any more. The dispatcher just listens to whatever Gemini speaks; if Gemini emits a motion-confirmation phrase, the matching motion fires. Discipline lives in `config_Voice.json::stt.gemini_system_prompt` (~21 KB persona with Rules 1–12). Field-tested rules include 1c (no hallucinated motion from silence/ambiguity), 1d (no fabricated compound chains), 4b (no destination words in motion confirmations), 5d (correct Arabic come-to-user grammar — `أتي إليك`, not `أتعال`), 5e (no zero/negative quantities), 5g (step-closer vs come-to-user), 5h + 5h-2 (act OR ask, never both — and never put motion verbs inside a clarifying question), 6 (parametric motions in canonical shapes), 6b (decimals + fractions), 7 (repeat/reverse memory ops), 8 (stop priority), 9 (state-marker awareness), 9b (never emit `[STATE-...]` yourself), 10 (pause vs stop), 11 (record/save/play sequences), 12 + 12b (walk-to-target spatial planner + anaphora resolution). +- **Bilingual support — Arabic is back**, in both directions, fluently. `Voice/canonical_normalizer.py` translates Arabic structural patterns ("أمشي يميناً 3 خطوات") to English canonical phrases ("walking right 3 steps") before regex scan; `Voice/number_words.py` converts spelled-out numbers and fractions in both languages to digit-decimal form ("ثلاث خطوات ونصف" → "3.5 steps", "three and a half steps" → "3.5 steps"). All vocabulary (Arabic verb roots / directions / units / duals / conjunctions / connectives / fractions / English number words / motion inverses) is externalised to **`Config/language_tables.json`** — single source of truth, JSON-only edit to add a dialect. +- **Decimal + fraction motion support across the pipeline** — `walk 1.79 meters`, `turn 22.5 degrees`, `walk 3.5 steps`, `walk 0.5 steps`, `أمشي 3 خطوات ونصف`, `نصف متر للأمام`, `مترين وربع` all dispatch correctly. 
Step regexes in `Brain/command_parser.py` widened from `(\d+)` to `(\d+(?:\.\d+)?)`; durations switched from `int(steps)` to `float(steps)`. Persona Rule 6b documents the conversion contract for Gemini. +- **Dispatcher strip-layer pipeline** in `Voice/marcus_voice.py` — every Gemini bot transcript now passes: `_STATE_ECHO_RE` (drop hallucinated `[STATE-...]` echoes) → `_QUOTED_RE` (drop quoted user mentions) → `_QUESTION_RE` (drop question clauses, even those containing motion verbs) → `normalise_numbers()` → `to_canonical_english()` → instruction regex scan. Per-turn fired-set + `command_cooldown_sec` dedup prevents streaming-partial double-fires. `_REVERSE_PAIRS` (used by `reverse_last`) is loaded from `language_tables.motion_inverses`, no longer a hardcoded dict. `Voice/sequences.py` loads its never-record list from `language_tables.sequence_never_record.canonicals`. +- **Motion-state primitives** — new `Core/motion_state.py` exposes `motion_abort` and `motion_pause` `threading.Event`s plus `wait_while_paused()`; consumed by the executor, command_parser fast-paths, odometry, autonomous mode, and goal_nav for clean voice-triggered abort/pause/resume. `Core/motion_log.py` adds `log_motion()` / `print_motion()` for actionable motion summaries (commanded vs target durations, step counts, rotation degrees) per the user's "log actual movement" request. +- **Three motion-execution modes**, dispatched by intent shape (see [Brain/command_parser.py](Brain/command_parser.py) and [Brain/marcus_brain.py](Brain/marcus_brain.py)): + 1. **Deterministic parametric** (no vision) — `walk N steps`, `walk N meters`, `turn N degrees`, bare directionals. Time × velocity OR closed-loop on dead-reckoned position. ±10 cm typical (no real position feedback today; see "open issue" below). + 2. 
**YOLO-tracked** — `come here` / `come to me` / `تعال` (smart approach, stops ~arm's length when person bbox fills ≥ 0.32 of frame); `follow me` / `اتبعني` (forward bursts while person visible and not too close). + 3. **LLaVA-grounded** — `walk to the door` / `walk to the chair` / `أمشي إلى الباب`. Spatial planner re-asks LLaVA every 2.5 s for bearing + distance; turn-toward + walk-forward bursts until `distance == "near"` or scan-attempts exhausted. +- **mic_gain bumped to 4.5** (was 2.5) — far-field G1 mic is quiet at default; field-tuned for ~1–2 m talking distance. +- **Holosoma plumbing observation (no code change)** — upstream `holosoma_inference` only accepts `velocity_input ∈ {keyboard, interface, joystick, ros2}`; the `--velocity-input zmq` shown in [Doc/note.txt](Doc/note.txt) implies a local fork or aspirational config. Marcus's `send_vel()` PUBs JSON over ZMQ regardless; whatever subscribes (a fork, the existing `Bridge/ros2_zmq_bridge.py`, or nothing) determines whether motion actually reaches the policy. Marcus's own dead-reckoning integrates the **commanded** velocity, not the robot's real motion — so `walk 1 meter` accuracy is policy-tracking-error bounded (±10 cm), not closed-loop. ROS2 `/dog_odom` is intentionally disabled in `Navigation/marcus_odometry.py:230-233` because in-process DDS init causes `bad_alloc` against Holosoma's ONNX arena and YOLO's CUDA allocator. **Open issue** — fix paths discussed but not implemented: sidecar DDS state publisher (option B), Holosoma state-tee fork (option A), or sidecar ROS2 republisher. +- **New files** — `Voice/canonical_normalizer.py`, `Voice/_language_tables.py`, `Voice/number_words.py`, `Voice/sequences.py`, `Voice/_probe_*.py` (smoke probes), `Core/motion_state.py`, `Core/motion_log.py`, `Config/language_tables.json`, `Config/instruction.json`. +- **YOLO device policy SOFTENED** — earlier docs said `yolo_device=cuda` was hard-required and `_resolve_device` raised `RuntimeError` on missing CUDA. 
That policy was relaxed: when Qwen2.5-VL is resident in VRAM (~11 GB), YOLO on `cuda` adds another ~2 GB and pushes the 16 GB Orin NX over budget. Current default is **`yolo_device: cpu`** in `Config/config_Vision.json` (1–3 fps on Orin CPU — sufficient for the YOLO fast-path use cases: `come here` arrival check at 1 Hz, `follow me` at 2 Hz, goal-detection one-shot). `Vision/marcus_yolo.py::_resolve_device` still raises if `yolo_device=cuda` is set without working CUDA. Use `cuda` only when `subsystems.vlm=false` (no Qwen) so YOLO has GPU headroom. The "Steady-state FPS on Orin" 21.9 fps figure in `environment.md` was measured with cuda+VLM-disabled — a different operating point than production. + ## Recent deltas (since 2026-04-06) - **GPU-only YOLO** — `_resolve_device()` raises `RuntimeError` if CUDA is missing. `yolo_device=cuda`, `yolo_half=true` by default. diff --git a/Doc/pipeline.md b/Doc/pipeline.md index f0450d1..20d349d 100644 --- a/Doc/pipeline.md +++ b/Doc/pipeline.md @@ -1,7 +1,7 @@ # Marcus — End-to-End Pipeline **Robot persona:** Sanad (wake word + self-intro) -**Updated:** 2026-04-21 +**Updated:** 2026-04-28 One map of every data path from sensor to motor, voice to speech. Cross-reference with `architecture.md` (what each file is), `functions.md` (exact function signatures — AST-generated), and `MARCUS_API.md` (usage examples + JSON schemas). diff --git a/Navigation/marcus_odometry.py b/Navigation/marcus_odometry.py index 07bc69e..585d650 100644 --- a/Navigation/marcus_odometry.py +++ b/Navigation/marcus_odometry.py @@ -50,6 +50,26 @@ try: except Exception: _cfg = {} +# Shared motion-abort signal — Voice/marcus_voice.py sets this when the +# user says 'stop'; we poll it inside walk/turn loops to break out +# early. See Core/motion_state.py. Import is wrapped so the odometry +# module can still be loaded standalone (e.g. for tests) where Core +# isn't on sys.path. 
+try: + from Core.motion_state import motion_abort, wait_while_paused +except Exception: + # Standalone-only fallback. Production Marcus always succeeds the + # import above (Core/motion_state.py is in the project root). The + # inert objects here exist solely so the odometry module can be + # imported in isolation for unit tests / standalone tooling without + # a Core/ dependency. If you see these get used at runtime, voice + # pause/abort won't propagate — but that's a deployment-error + # symptom, not normal operation. Bug #14 in the 14-bug audit. + import threading as _t + motion_abort = _t.Event() # inert fallback; never set + def wait_while_paused(_ps: float = 0.05) -> None: # type: ignore + return # no-op fallback + ZMQ_HOST = str(_cfg.get("zmq_host", "127.0.0.1")) ZMQ_PORT = int(_cfg.get("zmq_port", 5556)) ROS2_ODOM_TOPIC = str(_cfg.get("ros2_odom_topic", "/dog_odom")) @@ -436,6 +456,14 @@ class Odometry: try: while time.time() - t0 < timeout: + if motion_abort.is_set(): + print(f" [Odom] ⏸ Walk aborted by user (motion_abort)") + self._gradual_stop() + return False + wait_while_paused() + if motion_abort.is_set(): + self._gradual_stop() + return False with self._lock: dx = self._x - start_x dy = self._y - start_y @@ -474,6 +502,14 @@ class Odometry: t0 = time.time() try: while time.time() - t0 < duration: + if motion_abort.is_set(): + print(f" [Odom] ⏸ Time-based walk aborted by user") + self._gradual_stop() + return False + wait_while_paused() + if motion_abort.is_set(): + self._gradual_stop() + return False self._send_vel(vx=vx) time.sleep(0.05) except KeyboardInterrupt: @@ -543,6 +579,14 @@ class Odometry: try: while time.time() - t0 < timeout: + if motion_abort.is_set(): + print(f" [Odom] ⏸ Turn aborted by user (motion_abort)") + self._gradual_stop() + return False + wait_while_paused() + if motion_abort.is_set(): + self._gradual_stop() + return False with self._lock: current = self._heading @@ -577,6 +621,14 @@ class Odometry: t0 = time.time() try: 
while time.time() - t0 < duration: + if motion_abort.is_set(): + print(f" [Odom] ⏸ Time-based turn aborted by user") + self._gradual_stop() + return False + wait_while_paused() + if motion_abort.is_set(): + self._gradual_stop() + return False self._send_vel(vyaw=vyaw) time.sleep(0.05) except KeyboardInterrupt: diff --git a/Navigation/patrol.py b/Navigation/patrol.py index 2f8c98c..0caa781 100644 --- a/Navigation/patrol.py +++ b/Navigation/patrol.py @@ -10,6 +10,7 @@ from API.llava_api import ask_patrol from API.memory_api import mem, log_detection from Brain.executor import move_step from Core.config_loader import load_config +from Core.motion_state import motion_abort # Persist patrol events to logs/navigation.log so a long unattended patrol # session leaves a usable post-mortem audit (start/finish, every step's @@ -63,6 +64,13 @@ def patrol(duration_minutes: float = 0.0, alert_callback=None): try: while time.time() < end_time: + # User said 'stop' (or any other interrupt). Bail out cleanly + # — gradual_stop is in the finally block, so the robot + # decelerates safely. The patrol's outer 'duration' is just + # the upper bound; honouring abort means we exit early. + if motion_abort.is_set(): + _plog("motion_abort set — ending patrol early", "warn") + break step += 1 # ----- 1. YOLO PPE violations ----- diff --git a/Voice/_language_tables.py b/Voice/_language_tables.py new file mode 100644 index 0000000..72d30f1 --- /dev/null +++ b/Voice/_language_tables.py @@ -0,0 +1,198 @@ +""" +_language_tables.py — load Config/language_tables.json once and cache. + +All Voice/* modules that need vocabulary data import from here instead +of hardcoding tables. The JSON file is the single source of truth for +Arabic verbs / directions / units / numbers / English number words / +motion inverses / sequence-never-record list. + +Loud failure on missing/malformed file (matches Marcus's 'no silent +config degradation' policy from earlier rounds). 
+ +Public API: + LANG = load() ← dict with all top-level sections + LANG["english_numbers"]["ones"] → {word: int} + LANG["arabic_verbs"]["walking"] → list of Arabic root strings + LANG["arabic_duals"]["2 steps"] → list of dual-form strings + LANG["motion_inverses"] → flat dict of pairs + LANG["sequence_never_record"]["canonicals"] → list of canonical names + +Convenience flatteners (build the inverse maps consumers usually want): + flat_arabic_verbs() → {ar_root: en_gerund} + flat_arabic_directions() → {ar_word: en_direction} + flat_arabic_units() → {ar_unit: en_unit} + flat_arabic_duals() → {ar_dual_word: 'N units'} + flat_arabic_conjunctions()→ {ar_conj: ' english_glue '} + flat_arabic_connectives() → {ar_word: en_word} +""" +from __future__ import annotations + +import json +import os +import sys +from typing import Dict, List + +_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if _PROJECT_DIR not in sys.path: + sys.path.insert(0, _PROJECT_DIR) + +_CONFIG_PATH = os.path.join(_PROJECT_DIR, "Config", "language_tables.json") +_CACHE: dict = {} + + +def load() -> dict: + """Load and cache the language tables. Raises RuntimeError on + missing/malformed config — voice processing is unusable without it, + so loud failure surfaces the problem at startup rather than during + a user demo.""" + if _CACHE: + return _CACHE + if not os.path.isfile(_CONFIG_PATH): + raise RuntimeError( + "Config/language_tables.json missing at {} — voice motion " + "vocabulary cannot load without it. 
Restore from git or " + "rebuild from the documented schema.".format(_CONFIG_PATH) + ) + try: + with open(_CONFIG_PATH, "r", encoding="utf-8") as f: + data = json.load(f) or {} + except Exception as e: + raise RuntimeError( + "Config/language_tables.json malformed: {}".format(e), + ) from e + # Quick sanity — make sure the top-level keys we care about exist + required = ( + "english_numbers", "arabic_numbers", + "english_fractions", "arabic_fractions", + "arabic_verbs", "arabic_directions", "arabic_units", + "arabic_duals", "arabic_conjunctions", "arabic_connectives", + "motion_inverses", "sequence_never_record", + ) + missing = [k for k in required if k not in data] + if missing: + raise RuntimeError( + "Config/language_tables.json missing top-level sections: {}" + .format(missing) + ) + _CACHE.update(data) + return _CACHE + + +def _invert_grouped(grouped: dict, drop_keys: tuple = ("_description",)) -> Dict[str, str]: + """Helper: flip a {english_label: [arabic_word, ...]} dict into + {arabic_word: english_label}. 
Skips keys named in drop_keys (used + to skip the '_description' annotation in the JSON).""" + out: Dict[str, str] = {} + for en, ar_list in grouped.items(): + if en in drop_keys: + continue + if not isinstance(ar_list, list): + continue + for ar in ar_list: + if isinstance(ar, str) and ar.strip(): + out[ar] = en + return out + + +def flat_arabic_verbs() -> Dict[str, str]: + return _invert_grouped(load()["arabic_verbs"]) + + +def flat_arabic_directions() -> Dict[str, str]: + return _invert_grouped(load()["arabic_directions"]) + + +def flat_arabic_units() -> Dict[str, str]: + return _invert_grouped(load()["arabic_units"]) + + +def flat_arabic_duals() -> Dict[str, str]: + return _invert_grouped(load()["arabic_duals"]) + + +def flat_arabic_conjunctions() -> Dict[str, str]: + return _invert_grouped(load()["arabic_conjunctions"]) + + +def flat_arabic_connectives() -> Dict[str, str]: + return _invert_grouped(load()["arabic_connectives"]) + + +def english_numbers_ones() -> Dict[str, int]: + return {k: int(v) for k, v in load()["english_numbers"]["ones"].items() + if k != "_description"} + + +def english_numbers_tens() -> Dict[str, int]: + return {k: int(v) for k, v in load()["english_numbers"]["tens"].items() + if k != "_description"} + + +def english_numbers_scale() -> Dict[str, int]: + return {k: int(v) for k, v in load()["english_numbers"]["scale"].items() + if k != "_description"} + + +def english_numbers_glue() -> set: + return set(load()["english_numbers"]["glue"]) + + +def arabic_number_literals() -> List[tuple]: + """List of (arabic_word, integer_value) pairs, in declaration order. 
+ Caller is expected to keep ordering (longest-first) when applying.""" + out: List[tuple] = [] + for entry in load()["arabic_numbers"].get("literals", []): + if isinstance(entry, list) and len(entry) == 2: + ar, val = entry + if isinstance(ar, str) and isinstance(val, int): + out.append((ar, val)) + return out + + +def motion_inverses() -> Dict[str, str]: + return {k: v for k, v in load()["motion_inverses"].items() + if k != "_description"} + + +def sequence_never_record() -> set: + return set(load()["sequence_never_record"].get("canonicals", [])) + + +def english_fractions_additive() -> Dict[str, float]: + """English fraction words that COMBINE with a preceding integer + ('3 and a half' → 3 + 0.5).""" + return {k: float(v) for k, v in load()["english_fractions"]["additive"].items() + if k != "_description"} + + +def english_fractions_leading() -> Dict[str, float]: + """English fraction words that STAND ALONE before a unit + ('half a meter' → 0.5 meter).""" + return {k: float(v) for k, v in load()["english_fractions"]["leading"].items() + if k != "_description"} + + +def arabic_fractions_additive() -> Dict[str, float]: + """Arabic fraction words that COMBINE with a preceding digit + via و conjunction ('3 ونصف' → 3.5).""" + return {k: float(v) for k, v in load()["arabic_fractions"]["additive"].items() + if k != "_description"} + + +def arabic_fractions_leading() -> Dict[str, float]: + """Arabic fraction words STANDING ALONE before a unit + ('نصف متر' → 0.5 meter).""" + return {k: float(v) for k, v in load()["arabic_fractions"]["leading"].items() + if k != "_description"} + + +def arabic_unit_words() -> set: + """Set of all Arabic unit words from arabic_units. 
Used by the + fraction parser to detect 'N ونصف' patterns.""" + out = set() + for vals in load()["arabic_units"].values(): + if isinstance(vals, list): + for v in vals: + if isinstance(v, str) and v.strip(): + out.add(v) + return out diff --git a/Voice/canonical_normalizer.py b/Voice/canonical_normalizer.py new file mode 100644 index 0000000..861b843 --- /dev/null +++ b/Voice/canonical_normalizer.py @@ -0,0 +1,276 @@ +""" +canonical_normalizer.py — translate Arabic / dialect motion phrasings +to canonical English form before regex matching. + +Runs in the dispatcher pipeline AFTER normalise_numbers (which handles +spelled-out numbers in both languages) and BEFORE the parametric regex +scan. The output is English-shaped text the existing English parametric +regexes can match — replacing the previous approach of maintaining 14+ +dialect-specific Arabic regexes per command type. + +Pipeline: + + raw chunk ──► _STATE_ECHO_RE strip ──► _QUOTED_RE strip + ──► _QUESTION_RE strip ──► normalise_numbers + ──► to_canonical_english ←──── HERE + ──► English regex scan ──► dispatch + +Input/output examples: + + أمشي خطوة واحدة. → walk 1 step. + أمشي خطوتين. → walk 2 steps. + لف يسار 90 درجة. → turn left 90 degree. + أستدير يميناً 100 درجة. → turn right 100 degree. + أمشي للأمام 5 خطوات. → walk forward 5 steps. + أمشي للخلف 3 خطوات. → walk backward 3 steps. + أرجع 3 خطوات. → walk backward 3 steps. + أتي إليك. → (unchanged — bot_phrase fixed canonical) + +Compound chains (مع conjunctions) are translated in place: + + أمشي للخلف 3 خطوات، ثم أستدير يميناً 90 درجة. + → walk backward 3 steps, then turn right 90 degree. + +Idempotent for already-English text: + + Walking forward 5 steps. → Walking forward 5 steps. + +Ambiguity note: tokens like 'left' / 'right' have surface ambiguity +('I left the room' vs 'turn left'). The current normalizer uses +context-free token substitution and accepts that risk — same +behaviour as today's regex layer. 
A future memory + LiDAR-aware +disambiguation hook (using prior dialogue context and current +spatial state) belongs ABOVE this layer; see TODO at the bottom. +""" +from __future__ import annotations + +import re + +# ─── Vocabulary tables loaded from Config/language_tables.json ──── +# +# Single source of truth for verb / direction / unit / dual / conj / +# connective dictionaries. Adding a new dialect = JSON edit. See +# Voice/_language_tables.py for the loader and flattening helpers. +from Voice._language_tables import ( + flat_arabic_verbs, flat_arabic_directions, flat_arabic_units, + flat_arabic_duals, flat_arabic_conjunctions, flat_arabic_connectives, +) + + +# Verb roots: Arabic → English gerund. Loaded once at module import. +_AR_VERB = flat_arabic_verbs() +# Direction words: Arabic → English direction. +_AR_DIR = flat_arabic_directions() +# Units: Arabic → English unit (singular/plural preserved). +_AR_UNIT = flat_arabic_units() +# Dual forms: single Arabic word → 'N units' English target. +_AR_DUAL = flat_arabic_duals() +# Conjunctions for compound chains: Arabic → space-padded English. +_AR_CONJ = flat_arabic_conjunctions() +# Connectives (prepositions / determiners): Arabic → English. +_AR_CONNECTIVES = flat_arabic_connectives() + + + +_HAS_ARABIC_RE = re.compile(r"[؀-ۿ]") + + +def _ar_word_sub(text: str, mapping: dict) -> str: + """Substitute Arabic tokens from `mapping` keys to their English + values. Uses Arabic-LETTER boundary on both sides (NOT the full + Arabic block — Arabic punctuation '،' / '؟' / '؛' is U+060C/061F/061B + and is INSIDE the Arabic block but should be treated as a boundary + so 'خطوات،' substitutes correctly). Letter range is U+0621-U+064A + (ء to ي); also include diacritics U+064B-U+065F as in-word so + 'البَاب' (with fatha) isn't split. + Longest-first ordering resolves substring conflicts (e.g. 
match + 'ألف' before its substring 'لف').""" + keys = sorted(mapping.keys(), key=len, reverse=True) + for ar in keys: + en = mapping[ar] + # In-word characters: Arabic letters + diacritics. Outside this + # class is a boundary (whitespace, ASCII, Arabic punctuation). + pattern = ( + r"(? str: + """Translate Arabic / dialect structural motion phrasings to + English canonical form so the existing English parametric regex + layer can match them. + + Order of operations (each is idempotent on its output): + 1. DUAL substitution (single-word counts: خطوتين → 2 steps) + 2. CONJ substitution (ثم → then) — done early so subsequent + passes don't accidentally treat conjunctions as words. + 3. VERB substitution (أمشي → walk, لف → turn, ...) + 4. DIR substitution (يمين → right, ...) + 5. UNIT substitution (خطوة → step, متر → meter, ...) + 6. CONNECTIVES (إلى → to, etc.) + 7. Arabic comma → English comma (preserves compound parsing) + 8. Word-order fix: 'walk step 1' → 'walk 1 step' + 'walk forward step 1' → 'walk forward 1 step' + 9. Whitespace collapse. + + Bails out early if the input contains no Arabic (idempotent for + pure English). + """ + if not text or not _HAS_ARABIC_RE.search(text): + return text + + # 1. Dual forms first — these are single Arabic words that carry + # both count and unit ('خطوتين' = '2 steps'). + text = _ar_word_sub(text, _AR_DUAL) + + # 2. Conjunctions — translate before verbs so 'ثم' between two + # motions becomes ' then ' for the regex layer's compound parser. + text = _ar_word_sub(text, _AR_CONJ) + + # 3. Verbs. + text = _ar_word_sub(text, _AR_VERB) + + # 4. Directions. + text = _ar_word_sub(text, _AR_DIR) + + # 5. Units. + text = _ar_word_sub(text, _AR_UNIT) + + # 6. Connectives. + text = _ar_word_sub(text, _AR_CONNECTIVES) + + # 7. Arabic comma → ASCII comma (so the English regex's compound + # detection sees the boundary). + text = text.replace("،", ",") + + # 8. Word-order fix. After token substitution we may have + # 'walk [direction]? 
step 1' / 'walk [direction]? meter 2' + # which has the unit before the number. The English regex expects + # '[verb] [direction]? '. Swap them. + text = re.sub( + r"\b(walk(?:ing)?|turn(?:ing)?|mov(?:e|ing))" + r"(\s+(?:forward|backward|back|left|right|around))?" + r"\s+(steps?|meters?|degrees?)" + r"\s+(\d+(?:\.\d+)?)", + r"\1\2 \4 \3", + text, + flags=re.IGNORECASE, + ) + + # 9a. Idiomatic post-fix: 'turning backward' (which is what we get + # from 'استدر للخلف' / 'لف ورا') should map to 'turning around'. + # 'turning backward' isn't a meaningful phrase in English motion + # vocab; the canonical is 'turn around' (180° rotation). + text = re.sub( + r"\bturn(?:ing)?\s+backward\b", + "turning around", + text, + flags=re.IGNORECASE, + ) + + # 9b. Double-direction collapse. After substitution we sometimes get + # 'walking backward backward 2 steps' because the Arabic verb + # ('أرجع' = walking-backward gerund) AND the explicit direction + # ('للخلف'/'لورا' = backward) both translated to 'backward'. Collapse + # repeated adjacent direction tokens so the parametric regex matches. + # Field-observed cases: + # 'أرجع للخلف خطوتين' → 'walking backward backward 2 steps' → + # 'walking backward 2 steps' + # 'أتقدم للأمام خطوة' → 'walking forward forward 1 step' → + # 'walking forward 1 step' + text = re.sub( + r"\b(forward|backward|back|left|right|around)(?:\s+\1)+\b", + r"\1", + text, + flags=re.IGNORECASE, + ) + + # 9c. Spin-in-place idiom — Arabic phrases for turning around oneself + # (`حول نفسي`, `حول نفسك`, `حوالين نفسي`, `حوالين نفسك`, `على نفسي`, + # `على نفسك`, `بنفسي`, `بنفسك`) all mean 360°/180° self-rotation. + # Translated to ' around ' so 'turning حول نفسي' becomes 'turning + # around' and the turn_around canonical regex catches it. + # Field case: user says 'لف حول نفسك' → Gemini replies 'أستدير حول + # نفسي' → after verb sub = 'turning حول نفسي' → after this pass = + # 'turning around' → dispatch turn_around. 
Without this, the bot + # phrase didn't dispatch and the robot stood still. + text = re.sub( + r"\s*(?:حوال?ين|حول|على)\s+(?:نفسي|نفسك|نفسه|نفسها|نفسنا)\b", + " around", + text, + ) + text = re.sub(r"\s*ب(?:نفسي|نفسك|نفسه|نفسها|نفسنا)\b", " around", text) + + # 9. Collapse extra whitespace introduced by substitutions. + text = re.sub(r"\s+", " ", text).strip() + + return text + + +# ──────────────────────────────────────────────────────────────── +# TODO (future) — AMBIGUITY DISAMBIGUATION HOOK +# +# 'left' / 'right' / 'forward' have surface ambiguity (direction vs +# past-tense / surname / general adverb). When LiDAR + dialogue memory +# are wired in, a higher-level resolver should sit ABOVE this module +# and decide: is the current chunk a motion intent or descriptive +# language? Inputs to that resolver could include: +# - command_history.json (recent commands — context for repeat/follow) +# - LiDAR snapshot (is the path clear? is something blocking?) +# - prior 1-2 Gemini turns (what was discussed? still on motion topic?) +# - mic energy + duration (long quiet vs decisive utterance) +# The resolver gates whether to call to_canonical_english at all OR +# returns the original text unchanged when context says 'this is chat, +# not motion'. +# +# Current behaviour: context-free token substitution. Same risk profile +# as today's regex layer. 
+# ──────────────────────────────────────────────────────────────── + + +# Standalone smoke check +if __name__ == "__main__": + # Apply normalise_numbers FIRST to match production pipeline + from Voice.number_words import normalise_numbers as _nn + + def pipeline(text): + return to_canonical_english(_nn(text)) + + cases = [ + ("أمشي خطوة واحدة.", "walk 1 step."), + ("أمشي خطوتين.", "walk 2 steps."), + ("لف يسار 90 درجة.", "turn left 90 degree."), + ("أستدير يميناً 100 درجة.", "turn right 100 degree."), + ("أمشي للأمام 5 خطوات.", "walk forward 5 steps."), + ("أمشي للخلف 3 خطوات.", "walk backward 3 steps."), + ("أرجع 3 خطوات.", "walk backward 3 steps."), + ("أمشي خطوة 1.", "walk 1 step."), + ("أستدير يساراً 1 خطوات.", "turn left 1 steps."), + ("أمشي للخلف 3 خطوات، ثم أستدير يميناً 90 درجة.", + "walk backward 3 steps, then turn right 90 degree."), + ("Walking forward 5 steps.", "Walking forward 5 steps."), + ("Turning right 90 degrees.", "Turning right 90 degrees."), + ("لف على اليمين.", "turn on right."), + # Spelled-out + structural together + ("أستدير يساراً تسعين درجة.", "turn left 90 degree."), + ("أمشي خمس خطوات.", "walk 5 steps."), + # Compound chain + ("أمشي خطوة واحدة، ثم أستدير يميناً.", + "walk 1 step, then turn right."), + ] + ok = bad = 0 + for inp, exp in cases: + got = pipeline(inp) + success = got == exp + mark = "✓" if success else "✗" + if success: ok += 1 + else: bad += 1 + print(f" {mark} {inp!r:55s}") + print(f" → {got!r}") + if not success: + print(f" expected: {exp!r}") + print(f"\n{ok}/{ok+bad} passed") diff --git a/Voice/gemini_runner.py b/Voice/gemini_runner.py index a15e260..b90042b 100644 --- a/Voice/gemini_runner.py +++ b/Voice/gemini_runner.py @@ -36,6 +36,16 @@ Stdin protocol (line-based): "flush\n" drop mic buffer (echo prevention) "frame:\n" forward a camera frame to Gemini Live (Marcus parent throttles to ~2 fps) + "state:\n" motion state update from Marcus's worker. 
+ JSON: {"event":"start|complete|interrupted|error", + "cmd":"", + "elapsed_sec": float (optional), + "reason": "" (error only)}. + Runner injects '[STATE-...] ' into the + live session as silent text context — Gemini + reads it for honest answers to "what are you + doing?" but does NOT speak about it unprompted + (per persona Rule 9). ──────────────────────────────────────────────────────────────────────── Env vars: @@ -107,6 +117,13 @@ _MIC_HOLDER: list = [] # [BuiltinMic] when active _LATEST_FRAME_LOCK = threading.Lock() _LATEST_FRAME: dict = {"bytes": None, "ts": 0.0} +# Pending motion-state updates to inject into Gemini Live as silent +# text context. The stdin watcher (running in a regular thread) drops +# parsed payloads here; an asyncio task in the main session loop +# drains them and calls session.send_realtime_input(text=...). +_STATE_LOCK = threading.Lock() +_STATE_PENDING: list = [] # list[str] of formatted '[STATE-...] ...' lines + def _stdin_watcher() -> None: try: @@ -136,6 +153,38 @@ def _stdin_watcher() -> None: with _LATEST_FRAME_LOCK: _LATEST_FRAME["bytes"] = data _LATEST_FRAME["ts"] = time.time() + elif line.startswith("state:"): + # Motion state update from Marcus's worker. Parse + format + # for Gemini Live; the actual session.send_realtime_input + # call happens in an asyncio task that drains _STATE_PENDING. 
+ try: + payload = json.loads(line[len("state:"):]) + except Exception: + continue + event = (payload.get("event") or "").strip().lower() + cmd = (payload.get("cmd") or "").strip() + if not event or not cmd: + continue + tag_map = { + "start": "[STATE-START]", + "complete": "[STATE-DONE]", + "interrupted": "[STATE-INTERRUPTED]", + "error": "[STATE-ERROR]", + "paused": "[STATE-PAUSED]", + "resumed": "[STATE-RESUMED]", + } + tag = tag_map.get(event) + if tag is None: + continue + msg = "{} {}".format(tag, cmd) + elapsed = payload.get("elapsed_sec") + if isinstance(elapsed, (int, float)): + msg += " ({:.1f}s)".format(float(elapsed)) + reason = payload.get("reason") + if reason and event == "error": + msg += " — {}".format(reason) + with _STATE_LOCK: + _STATE_PENDING.append(msg) except Exception: return @@ -429,6 +478,38 @@ async def _send_frame_loop(session, types_mod, done: asyncio.Event) -> None: # Keep going — frames are best-effort. +async def _send_state_loop(session, done: asyncio.Event) -> None: + """Drain _STATE_PENDING and inject each line into Gemini Live as + silent text context. This is what makes Gemini aware of the + robot's actual motion state — start/complete/interrupted/error + events from Marcus's motion worker. Persona Rule 9 instructs + Gemini to read these for context but NOT speak about them + unprompted; mention only when the user asks 'what are you doing?'. + + Polls at 10 Hz which is plenty — state events arrive at most a + few per second and they're tiny. send_realtime_input(text=...) 
+ is fire-and-forget; we log+swallow errors and keep draining.""" + while not done.is_set() and not _STOP_REQUESTED.is_set(): + await asyncio.sleep(0.1) + with _STATE_LOCK: + if not _STATE_PENDING: + continue + pending = list(_STATE_PENDING) + _STATE_PENDING.clear() + for msg in pending: + try: + await session.send_realtime_input(text=msg) + log("info", f"STATE injected: {msg}") + except asyncio.CancelledError: + return + except Exception as e: + # Some Gemini Live SDK versions may not accept text on + # send_realtime_input. Log once and drop — motion still + # works fine, only the live-state-to-Gemini channel is + # missing. Marcus side and dispatcher are unaffected. + log("warn", f"state inject failed: {e}") + + async def _receive_loop(session, speaker, recorder, sess: _Session, done: asyncio.Event) -> None: loop = asyncio.get_event_loop() last_recv = time.time() @@ -601,6 +682,7 @@ async def main_async() -> int: asyncio.gather( _send_mic_loop(session, types, mic, speaker, recorder, sess, done), _send_frame_loop(session, types, done), + _send_state_loop(session, done), _receive_loop(session, speaker, recorder, sess, done), ), timeout=_SESSION_TIMEOUT, diff --git a/Voice/gemini_script.py b/Voice/gemini_script.py index 8766b52..0c19762 100644 --- a/Voice/gemini_script.py +++ b/Voice/gemini_script.py @@ -211,6 +211,36 @@ class GeminiBrain: return self._send_stdin("frame:" + b64 + "\n") + def send_state(self, event: str, cmd: str, + elapsed_sec: Optional[float] = None, + reason: Optional[str] = None) -> None: + """ + Push a motion state update to the runner so Gemini Live can + track what the robot is actually doing. Events: + 'start' — worker dequeued cmd, brain about to run + 'complete' — brain returned and motion_abort was clear + 'interrupted' — brain returned but motion_abort was set + 'error' — brain raised an exception (`reason` set) + The runner formats this as '[STATE-...] ' and injects it + into the live session as silent text context. 
Best-effort — + no-op if runner not yet started or stdin closed. + """ + proc = self._proc + if proc is None or proc.stdin is None: + return + if not event or not cmd: + return + payload = {"event": event, "cmd": cmd} + if elapsed_sec is not None: + payload["elapsed_sec"] = round(float(elapsed_sec), 2) + if reason: + payload["reason"] = str(reason)[:200] + try: + line = "state:" + json.dumps(payload, ensure_ascii=False) + "\n" + except Exception: + return + self._send_stdin(line) + def _send_stdin(self, line: str) -> None: """Serialised stdin write — multiple threads (frame sender + flush) can call safely.""" proc = self._proc diff --git a/Voice/marcus_voice.py b/Voice/marcus_voice.py index af27468..b40b80f 100644 --- a/Voice/marcus_voice.py +++ b/Voice/marcus_voice.py @@ -32,10 +32,12 @@ from __future__ import annotations import logging import os +import queue import re import sys import threading import time +from collections import deque from difflib import SequenceMatcher from logging.handlers import RotatingFileHandler from typing import Callable, Optional @@ -47,6 +49,11 @@ if _PROJECT_DIR not in sys.path: sys.path.insert(0, _PROJECT_DIR) from Core.env_loader import PROJECT_ROOT from Core.config_loader import load_config +from Core.motion_state import motion_abort, motion_pause +from Core.motion_log import log_motion as _log_motion +from Voice.number_words import normalise_numbers +from Voice.canonical_normalizer import to_canonical_english +from Voice.sequences import get_sequences LOG_DIR = os.path.join(PROJECT_ROOT, "logs") os.makedirs(LOG_DIR, exist_ok=True) @@ -95,18 +102,34 @@ import json as _json def _load_instructions() -> dict: + """Load Config/instruction.json. Raises RuntimeError if the file is + missing, empty, or has no actions — voice dispatch is unusable + without it (every Gemini phrase gets rejected, robot never moves, + operator gets a confused user complaint hours later). 
Loud failure + surfaces in the brain init banner; quiet degradation does not.""" path = os.path.join(PROJECT_ROOT, "Config", "instruction.json") try: with open(path, "r", encoding="utf-8") as f: - return _json.load(f) or {} + data = _json.load(f) or {} + except FileNotFoundError as e: + raise RuntimeError( + "instruction.json missing at {} — voice dispatch cannot run " + "without it (this file holds every wake-word + bot phrase " + "the dispatcher matches against). Restore from git or " + "rebuild from Config/instruction.example.json.".format(path) + ) from e except Exception as e: - # Fail soft — empty tables mean the dispatch gate just rejects - # everything and is_running stays True. Better than a crash. - try: - log.error("instruction.json not loaded: %s", e) - except Exception: - pass - return {} + raise RuntimeError( + "instruction.json at {} is malformed JSON: {}".format(path, e) + ) from e + actions = data.get("actions") + if not isinstance(actions, dict) or not actions: + raise RuntimeError( + "instruction.json at {} has no actions — every dispatch " + "would silently fail. Add at least one action with " + "canonical/user_phrases/bot_phrases.".format(path) + ) + return data _INSTRUCTIONS = _load_instructions() @@ -184,14 +207,179 @@ def _build_parametric_actions(data: dict) -> list: return out +# ─── quoted-phrase stripper ───────────────────────────────────── +# +# When Gemini quotes the user's phrase back ("Sorry, I don't understand +# the phrase 'turn left'") the dispatcher would otherwise match the +# canonical name INSIDE the quoted string and false-fire. Strip every +# matched pair of quotes — straight, smart, Arabic — before scanning. +# Pattern handles ASCII '...' "..." plus typographic “…” ’…’ «…» 「…」. 
+_QUOTED_RE = re.compile( + r"\".*?\"" + r"|\'.*?\'" + r"|“.*?”" + r"|‘.*?’" + r"|«.*?»" + r"|「.*?」" + r"|『.*?』", + re.DOTALL, +) + + +# ─── STATE echo stripper ──────────────────────────────────────── +# +# When the runner injects '[STATE-DONE] walk 1 steps (3.0s)' into +# Gemini Live as silent context, Gemini occasionally echoes that text +# back through its output transcription. A naive dispatcher scan +# would re-fire 'walk 1 steps' from the embedded canonical, creating +# a feedback loop (one voice command → 3-4× motions). This regex +# strips: +# '[STATE-START] ' +# '[STATE-DONE] (1.5s)' +# '[STATE-INTERRUPTED] (2.3s)' +# '[STATE-ERROR] ' +# '[STATE-PAUSED] ' / '[STATE-RESUMED] ' +# The match consumes the bracketed tag and the canonical/duration +# tail, but stops at sentence boundaries (./!/?/؟/،) so a coexisting +# real motion phrase in the same chunk survives. +_STATE_ECHO_RE = re.compile( + r"\[STATE-(?:START|DONE|INTERRUPTED|ERROR|PAUSED|RESUMED)\]" + r"\s*[^.!?؟،,\n(]*" # canonical name; stops before sentence-end OR '(' + r"(?:\s*\([^)]*\))?" # optional parenthesised duration like (1.5s) + r"(?:\s+—\s+[^.!?؟،\n]*)?", # optional ' — ' suffix on errors + re.IGNORECASE, +) + + +# ─── question-clause stripper ─────────────────────────────────── +# +# CRITICAL safety filter. Sentences ending in '?' or '؟' (Arabic +# question mark) are CLARIFICATION QUESTIONS Gemini asks the user — +# they are NEVER motion confirmations. Without this filter, a question +# like 'Do you mean turn left or walk one step forward?' would match +# BOTH 'turn left' and 'walk one step' canonicals inside, dispatching +# contradictory motions while Gemini thinks it's still waiting for +# clarification. Production failure observed at 09:13:57. +# +# Match: from previous sentence terminator (or string start) up to and +# including the question mark. Greedy on the left so consecutive +# questions are stripped one at a time and statements between them +# survive in the residual text. 
+_QUESTION_RE = re.compile( + r"(?:^|(?<=[.!؟?\n]))[^.!؟?\n]*[?؟]", + re.DOTALL, +) + + +# ─── sequence-name extraction ─────────────────────────────────── +# +# Sequence canonicals carry a NAME ('save sequence as my-greet', +# 'play my-greet'). Gemini's bot_phrase contains it; we extract from +# the surrounding spoken text. Two extractor patterns: +# - 'as ' — for save: 'saved as my-greet' +# - 'play(ing)? ' / 'run(ning)? ' — for play +# Names follow the same shape Sequences validates: lowercase letters, +# digits, hyphens, underscores; 1–31 chars. Gemini occasionally adds +# a period; we strip trailing punctuation. +_SEQ_NAME_AS = re.compile( + r"\bas\s+(?:my\s+)?([a-z0-9][a-z0-9_-]{0,30})\b", re.I, +) +_SEQ_NAME_PLAY = re.compile( + r"\b(?:play(?:ing)?|run(?:ning)?|perform(?:ing)?|do(?:ing)?)\s+" + r"(?:my\s+|the\s+|sequence\s+)*" + r"([a-z0-9][a-z0-9_-]{0,30})\b", + re.I, +) + + +def _extract_sequence_name(text: str, mode: str) -> Optional[str]: + """Pull a sequence name out of Gemini's spoken confirmation. + `mode` is 'as' (save) or 'play' (start_recording / play). Returns + None if no plausible name is found — caller decides what to do.""" + if not text: + return None + t = text.strip().rstrip(".!?,").strip() + rx = _SEQ_NAME_AS if mode == "as" else _SEQ_NAME_PLAY + m = rx.search(t) + if not m: + return None + name = m.group(1).strip().lower() + # Reject likely false positives — bare 'sequence' / 'recording' / + # 'this' shouldn't be a name. + if name in {"sequence", "recording", "macro", "this", "that", + "it", "the", "my"}: + return None + return name + + +# ─── memory helpers ───────────────────────────────────────────── +# +# Static map of pairwise inverses for fixed canonicals. Parametric forms +# (turn left N degrees ↔ turn right N degrees, walk forward N → walk +# backward N) are handled by _reverse_command via regex below. 
+# Non-reversible canonicals (wave, look around, follow me, patrol, stop, +# come here, stay here, go home, turn around) intentionally absent — +# _reverse_command returns None for those and the dispatcher logs & +# skips with no harm done. +# Pairs loaded from Config/language_tables.json::motion_inverses. +# Adding a new reversible-pair = JSON edit. See Voice/_language_tables.py. +from Voice._language_tables import motion_inverses as _load_motion_inverses +_REVERSE_PAIRS = _load_motion_inverses() + + +def _reverse_command(cmd: str) -> Optional[str]: + """Return the inverse of `cmd` if it is reversible, else None. + + Handles fixed pairs via _REVERSE_PAIRS and parametric forms via + regex (turn left N deg ↔ turn right N deg, walk forward N steps/m + ↔ walk backward N steps/m, default-direction walks become explicit + backward). Bare 'turn N degrees' (no direction) is treated as + self-inverse — turning N degrees is geometrically reversible by + repeating it, but more importantly the brain accepts it unchanged. + """ + if not cmd: + return None + s = cmd.strip().lower() + if s in _REVERSE_PAIRS: + return _REVERSE_PAIRS[s] + m = re.match(r"^turn\s+(left|right)\s+(\d+(?:\.\d+)?)\s*deg(?:rees?)?$", s) + if m: + flip = "right" if m.group(1) == "left" else "left" + return "turn {} {} degrees".format(flip, m.group(2)) + m = re.match(r"^turn\s+(\d+(?:\.\d+)?)\s*deg(?:rees?)?$", s) + if m: + return s # un-directional turn — self-inverse for the brain + m = re.match( + r"^walk\s+(forward|back|backward)\s+(\d+(?:\.\d+)?)\s*(steps?|meters?)$", s, + ) + if m: + flip = "backward" if m.group(1) in ("forward",) else "forward" + return "walk {} {} {}".format(flip, m.group(2), m.group(3)) + m = re.match(r"^walk\s+(\d+(?:\.\d+)?)\s*(steps?|meters?)$", s) + if m: + # Implicit-direction walk → defaults to forward in the brain, so + # the inverse is an explicit backward walk. 
+ return "walk backward {} {}".format(m.group(1), m.group(2)) + return None + + def _format_param_template(template: str, groups: tuple) -> str: """Substitute $1, $2, … in a parametric command template with the matching regex groups. None groups (optional captures that didn't fire) are treated as empty strings; surrounding whitespace is - collapsed so 'turn 90 degrees' becomes 'turn 90 degrees'.""" + collapsed so 'turn 90 degrees' becomes 'turn 90 degrees'. + + Returns an EMPTY string if any captured numeric group is a literal + '0' (or '0.0') — a 0-quantity motion is a no-op and the dispatcher + skips empty results. This protects against Gemini hallucinating + 'turn 0 degrees' when the user didn't actually request a turn but + Gemini misheard 'صفر' or similar. Bug #3 in the 16:42:58 audit. + """ out = template for i, g in enumerate(groups, 1): token = "$" + str(i) + if g is not None and re.match(r"^0+(?:\.0+)?$", str(g).strip()): + return "" # zero-quantity motion → discard out = out.replace(token, "" if g is None else str(g)) out = re.sub(r"\s+", " ", out).strip() return out @@ -399,6 +587,54 @@ class VoiceModule: self._gemini_say_lock = threading.Lock() self._gemini_say_last_chunk_at = 0.0 + # CUMULATIVE per-turn text used by the motion dispatcher. Gemini + # frequently splits a single phrase across chunks ('Turning' / + # 'right.'), and a per-chunk scan would miss 'turning right' + # because neither chunk alone contains the full needle. We + # instead append every chunk to this string and re-scan the + # WHOLE accumulated text on each new chunk; the per-turn + # _fired_canons_this_turn set prevents the same motion from + # dispatching twice. Reset on turn_end. + self._gemini_turn_text = "" + + # Short-term motion memory. Every REAL command dispatched to the + # brain is appended here (the memory canonicals 'repeat last' / + # 'reverse last' are NOT — only their resolved targets). 
Used + # by the dispatcher to handle 'do that again' / 'undo' triggers + # locally without bothering the brain. Cap = 10; older commands + # roll off automatically. PERSISTS to disk across Marcus + # restarts AND Gemini Live session reconnects so the user can + # ask 'what did you do last hour' without losing context (Bug + # #12 in the 14-bug audit). + self._command_history: deque = deque(maxlen=10) + self._history_path = os.path.join( + PROJECT_ROOT, "logs", "command_history.json", + ) + self._load_command_history() + + # ─── motion worker (async dispatch) ─────────────────────── + # + # The dispatcher used to call self._on_command directly on the + # IPC reader thread. That thread blocks for the duration of + # the brain's motion (a 10-step walk = ~10s of dead silence + # for Gemini chunks). We now route every command through a + # Queue + dedicated worker thread: + # - dispatcher pushes commands and returns immediately + # - worker pulls FIFO and calls self._on_command serially + # - 'stop' has priority: it sets motion_abort (interrupts + # the running brain loop) AND drains the queue (cancels + # pending motions), then a stop is enqueued so the brain + # runs its proper stop handler. + # - worker tracks self._current_motion for status queries + # and clears motion_abort at the start of each new (non- + # stop) command so a stale set doesn't kill the next one. + self._motion_queue: "queue.Queue" = queue.Queue(maxsize=20) + self._motion_worker: Optional[threading.Thread] = None + self._current_motion: Optional[str] = None + self._current_motion_at: float = 0.0 + # Sentinel used to politely shut the worker down on stop(). + self._MOTION_SENTINEL = object() + self._running = False self._thread = None @@ -558,11 +794,33 @@ class VoiceModule: that prints once on turn_end (or sooner if the chunk ends with sentence punctuation). """ - # Motion side-channel — chunk-level so dispatch is fast. 
- try: - self._dispatch_gemini_bot(text) - except Exception: - pass + # Motion side-channel. Dispatch on the CUMULATIVE per-turn text + # so split chunks ('Turning' + ' right.') still match the full + # 'turning right' needle. We DEFER dispatch until the cumulative + # text settles at a sentence boundary (./!/?/؟) — otherwise a + # bare canonical like 'turn right' would fire on chunk 1 + # ('Turning right') before chunk 2 (' 90 degrees.') brings in + # the parametric extension, and the robot would do BOTH a small + # turn and a 90° turn. A leftover-flush on turn_end catches the + # rare case where a phrase ends with no terminal punctuation. + # + # Word-boundary fix: Gemini Live often emits chunks WITHOUT + # leading whitespace (e.g. ["Turning right,", "then", "walking"] + # → naive `+=` produces 'Turning right,thenwalking' and breaks + # every \s+ in our regexes). Insert a space whenever the previous + # buffer doesn't already end in whitespace and the new chunk + # doesn't start with one. This mirrors what _flush_gemini_say + # does via " ".join(...) for the [Sanad] said: line. + if (self._gemini_turn_text + and not self._gemini_turn_text[-1:].isspace() + and text[:1] and not text[:1].isspace()): + self._gemini_turn_text += " " + self._gemini_turn_text += text + if self._gemini_turn_text.rstrip().endswith((".", "!", "?", "؟")): + try: + self._dispatch_gemini_bot(self._gemini_turn_text) + except Exception: + pass with self._gemini_say_lock: self._gemini_say_buf.append(text) @@ -574,11 +832,22 @@ class VoiceModule: def _on_gemini_turn_end(self) -> None: """Flush any pending Gemini output chunks at turn boundary, and - reset the per-turn motion dedup set so the next user→Gemini - exchange starts fresh.""" + reset the per-turn motion dedup state (fired-set + cumulative + text buffer) so the next user→Gemini exchange starts fresh. 
+ + IMPORTANT: also runs a final dispatch pass on the cumulative + text in case the last chunk ended without terminal punctuation + (rare — Gemini usually finishes with '.', but the model can + occasionally emit a turn that ends mid-clause).""" with self._gemini_say_lock: self._flush_gemini_say_locked() + if self._gemini_turn_text.strip(): + try: + self._dispatch_gemini_bot(self._gemini_turn_text) + except Exception: + pass self._fired_canons_this_turn.clear() + self._gemini_turn_text = "" def _flush_gemini_say_locked(self) -> None: """Caller MUST hold self._gemini_say_lock. Prints one [Sanad] said: line.""" @@ -658,6 +927,47 @@ class VoiceModule: if not text: return + # Strip [STATE-...] markers we previously injected ourselves — + # Gemini Live often echoes them through its output transcription + # ('[STATE-DONE] walk 1 steps') and a naive scan picks up the + # canonical name inside, re-firing the motion. The state markers + # are NOT user intent; they're our own bookkeeping coming back. + # Pattern matches the tag, the optional " (X.Ys)" duration, and + # any optional " — reason" suffix on errors. Leaves the rest of + # the chunk intact so a real user-intent phrase coexisting with + # an echoed state event still gets dispatched. + text = _STATE_ECHO_RE.sub("", text) + # Strip quoted phrases. Gemini sometimes quotes the user's phrase + # back ('Sorry, I don't understand the phrase "turn left"') and + # the dispatcher would otherwise match the embedded canonical + # and false-fire. Quoted phrases are NEVER motion confirmations. + text = _QUOTED_RE.sub(" ", text) + # Strip question clauses. A clarification like 'Do you mean turn + # left or walk a step?' contains motion phrases but is NOT a + # confirmation — Gemini is awaiting an answer, not acting. + # Without this filter, both motions would dispatch contradictorily + # while Gemini thinks the robot is idle. 
(Production failure at + # 09:13:57 — Gemini said 'هل تقصد أن أستدير لليسار أم أمشي خطوة + # للأمام؟' and the dispatcher fired BOTH turn_left AND walk_forward_1.) + text = _QUESTION_RE.sub(" ", text) + if not text.strip(): + return + + # Normalise spelled-out numbers BEFORE the lower-cased scan so + # 'turn ninety degrees' / 'أستدير تسعين درجة' get matched by + # the parametric (\d+) regex. normalise_numbers is idempotent + # and word-boundary-aware ('someone' / 'often' are unaffected). + text = normalise_numbers(text) + + # Translate Arabic / dialect structural motion phrasings to + # English canonical form so the existing English parametric + # regexes can match them. Replaces 14+ dialect-specific Arabic + # regexes with one principled translator. Idempotent on + # already-English input (passes through unchanged). See + # Voice/canonical_normalizer.py for the verb/dir/unit/dual + # dictionaries and the future memory+LiDAR disambiguation hook. + text = to_canonical_english(text) + # .lower() preserves character-by-character indexing for both # ASCII and Arabic, so positions in `low` correspond to positions # in `text` for ordering. Arabic patterns are stored as-is and @@ -691,6 +1001,12 @@ class VoiceModule: continue cmd_text = _format_param_template(template, m.groups()) if not cmd_text: + # Parametric matched but produced an empty command + # (e.g. 0-quantity rejection). Claim the span so the + # bare-canonical pass below doesn't double-match + # 'turning right' inside what was 'turning right 0 + # degrees' — which would defeat the 0-rejection. + claimed_spans.append((j, end)) continue matches.append((j, cmd_text, "param")) claimed_spans.append((j, end)) @@ -732,16 +1048,427 @@ class VoiceModule: self._last_gemini_canon = cmd self._last_gemini_dispatch_at = now + # Memory intercepts. 'repeat last' and 'reverse last' are + # synthetic canonicals — they NEVER reach the brain. Resolve + # them against _command_history and dispatch the resolved + # command instead. 
Empty history → log and skip silently + # (no-op rather than an awkward error). Non-reversible + # commands → log and skip. + resolved = cmd + if cmd == "repeat last": + if not self._command_history: + log.info("memory: 'repeat last' but history is empty — skip") + continue + resolved = self._command_history[-1] + log.info("memory: repeat → %r", resolved) + elif cmd == "reverse last": + if not self._command_history: + log.info("memory: 'reverse last' but history is empty — skip") + continue + last = self._command_history[-1] + resolved = _reverse_command(last) + if resolved is None: + log.info("memory: %r is not reversible — skip", last) + continue + log.info("memory: reverse %r → %r", last, resolved) + + # Push the dispatched (resolved) command to history. This + # means 'again' followed by 'again' replays the original + # motion; 'turn right' followed by 'undo' followed by 'undo' + # toggles right/left. Consistent with what users intuit. + # Persisted to disk so it survives reconnects and restarts. + self._command_history.append(resolved) + self._save_command_history() + log.info( "dispatch (gemini-bot, %s): %s (heard: %r)", - label, cmd, text[:80], + label, resolved, text[:80], ) - _log_transcript("CMD-BOT", cmd) - if self._on_command: + _log_transcript("CMD-BOT", resolved) + + # Pause/resume intercepts. These take effect IMMEDIATELY — + # not after the queue drains — because the user expects + # 'pause' to halt mid-walk right now, not after the brain + # finishes its current command. Setting motion_pause makes + # all interrupt-aware brain loops hold at zero velocity + # (see wait_while_paused in Core/motion_state.py). Resume + # clears the event so the running loop continues. We DO + # NOT enqueue these — the worker never sees them, the + # brain doesn't know about them, no [STATE-...] event is + # emitted (motion is still running, just paused). 
+ if resolved == "pause motion": + motion_pause.set() + # Tag the state event with the motion that's actually + # being held (current_motion), not the literal 'pause + # motion' canonical — Gemini Rule 9 reads this to + # answer 'are you still moving?' honestly. Bug #13. + held = self._current_motion or "no motion in flight" + log.info("memory: motion_pause SET — holding %r", held) try: + if (self._brain is not None + and self._stt.get("gemini_send_state_events", False)): + self._brain.send_state("paused", held) + except Exception: + pass + continue + if resolved == "resume motion": + was_paused = motion_pause.is_set() + if was_paused: + motion_pause.clear() + held = self._current_motion or "no motion in flight" + log.info("memory: motion_pause CLEARED — resuming %r", held) + else: + held = "(no pause was active)" + log.info("memory: resume requested but nothing paused") + try: + if (self._brain is not None + and self._stt.get("gemini_send_state_events", False)): + self._brain.send_state("resumed", held) + except Exception: + pass + continue + + # ─── sequence-control intercepts ──────────────────── + # These never reach the brain. They mutate the in-process + # Sequences singleton and (for play) push commands onto + # this same dispatcher path again. + if resolved == "start recording": + # Try to extract a pre-name from Gemini's text: + # 'recording your sequence as my-greet'. + preset_name = _extract_sequence_name(text, mode="as") + seqs = get_sequences() + rs = seqs.start_recording(name=preset_name) + log.info("sequence: start_recording(%r) -> %s", + preset_name, rs) + continue + if resolved == "save sequence": + # Name MUST come from the bot phrase ('saved as '). 
+ save_name = _extract_sequence_name(text, mode="as") + seqs = get_sequences() + rs = seqs.save_recording(name=save_name) + log.info("sequence: save_recording(%r) -> %s", + save_name, rs) + if not rs.get("ok"): + try: + if (self._brain is not None + and self._stt.get("gemini_send_state_events", False)): + self._brain.send_state( + "error", "save sequence", + reason=rs.get("reason") or "save failed", + ) + except Exception: + pass + continue + if resolved == "cancel recording": + seqs = get_sequences() + rs = seqs.cancel_recording() + log.info("sequence: cancel_recording -> %s", rs) + continue + if resolved == "play sequence": + play_name = _extract_sequence_name(text, mode="play") + seqs = get_sequences() + cmds = seqs.play(play_name) if play_name else None + if cmds is None: + # Two failure modes worth distinguishing for the + # operator AND for Gemini (which uses send_state + # to give honest answers when the user later asks + # 'did you do it?'). Surface both as state errors. + if not play_name: + reason = "no sequence name in your phrase" + else: + reason = "no sequence saved as %r" % play_name + log.warning("sequence: play(%r) failed — %s", + play_name, reason) + try: + if (self._brain is not None + and self._stt.get("gemini_send_state_events", False)): + self._brain.send_state( + "error", "play sequence", + reason=reason, + ) + except Exception: + pass + else: + log.info("sequence: play(%r) — enqueueing %d commands", + play_name, len(cmds)) + for sub in cmds: + self._enqueue_motion(sub) + continue + + # Append to recording buffer if a session is active. We do + # this BEFORE enqueueing so a control flag flip mid-run + # doesn't change the captured macro shape. record_command + # silently skips control canonicals (stop / pause / etc.). + try: + get_sequences().record_command(resolved) + except Exception: + pass + + # Enqueue for the motion worker — does NOT block here. + # Worker calls self._on_command(resolved, "en") when it + # picks the command up. 
'stop' sets motion_abort + drains + # the queue inside _enqueue_motion before requeueing. + self._enqueue_motion(resolved) + + # ─── command history persistence ────────────────────── + + def _load_command_history(self) -> None: + """Restore _command_history from disk on startup. Best-effort — + any read/parse error leaves history empty without crashing.""" + try: + if not os.path.isfile(self._history_path): + return + with open(self._history_path, "r", encoding="utf-8") as f: + payload = _json.load(f) + cmds = payload.get("commands", []) + if isinstance(cmds, list): + for c in cmds[-10:]: + if isinstance(c, str) and c.strip(): + self._command_history.append(c.strip()) + log.info( + "command history restored: %d entries from %s", + len(self._command_history), self._history_path, + ) + except Exception as e: + log.warning("could not load command history: %s", e) + + def _save_command_history(self) -> None: + """Write _command_history to disk after every dispatch. + Atomically (write to .tmp + rename) so a crash mid-write doesn't + corrupt the file.""" + try: + os.makedirs(os.path.dirname(self._history_path), exist_ok=True) + tmp = self._history_path + ".tmp" + with open(tmp, "w", encoding="utf-8") as f: + _json.dump( + {"commands": list(self._command_history), + "saved_at": time.time()}, + f, indent=2, ensure_ascii=False, + ) + os.replace(tmp, self._history_path) + except Exception: + # Persistence failure is non-fatal — history still works + # in-memory for the current process lifetime. + pass + + # ─── motion worker ──────────────────────────────────── + + def _motion_worker_loop(self) -> None: + """ + Pull commands FIFO from self._motion_queue and run them on the + brain serially. Clears motion_abort at the start of each new + (non-stop) command so a stale set from a previous interrupt + doesn't kill the next motion. + + Also notifies Gemini via send_state(...) 
at start / complete / + interrupted / error so the model can give honest answers when + the user asks 'what are you doing?'. + + A brain exception is logged and the worker keeps running; + the queue must drain even if one motion crashes. + """ + while self._running: + try: + item = self._motion_queue.get() + except Exception: + continue + if item is self._MOTION_SENTINEL: + log.info("motion worker received sentinel — exiting") + return + cmd = item + if not cmd: + continue + + is_stop = (cmd.strip().lower() == "stop") + # For non-stop commands, clear any leftover abort flag from + # a previous interrupt so the brain doesn't immediately + # break out of the new motion's loop. Also clear any stale + # pause — a fresh command should not start in a paused + # state just because the previous command was paused at + # the moment it was aborted/stopped. + if not is_stop: + motion_abort.clear() + motion_pause.clear() + + self._current_motion = cmd + self._current_motion_at = time.time() + + # Tell Gemini what's starting (best effort — runner may not + # be ready yet, e.g. first command before connection). + try: + if self._brain is not None and self._stt.get( + "gemini_send_state_events", False): + self._brain.send_state("start", cmd) + except Exception: + pass + + # Structured per-motion log — one line per START with the + # parsed direction/magnitude/unit. Pairs with the END line + # below to give a complete audit of what each command + # actually did. See logs/motion.log. + try: + _log_motion("start", cmd, source="voice") + except Exception: + pass + + # Watchdog: if a brain motion call exceeds the configured + # max RUNNING duration, fire motion_abort so its polling + # loops break out (gradual_stop runs as part of the exit + # path). 'Running' excludes time spent in motion_pause — + # a user who deliberately holds for 90s shouldn't trip the + # watchdog. We use a poller thread that ticks 100ms only + # while motion_pause is clear; pause-aware. 
Skip for + # commands listed in motion_watchdog_skip_canonicals + # (e.g. 'patrol' has its own 5-minute outer duration). + # 0 disables. + wd_sec = float(self._stt.get("motion_watchdog_sec", 0.0)) + wd_skip = set(self._stt.get( + "motion_watchdog_skip_canonicals", [] + )) + wd_canon = cmd.strip().lower().split()[0] if cmd.strip() else "" + wd_active = (wd_sec > 0 and wd_canon and + wd_canon not in wd_skip and not is_stop) + wd_stop_evt = threading.Event() + + def _watchdog_loop(_cmd=cmd, _budget=wd_sec, _stop=wd_stop_evt): + # Tick every 100ms while motion is RUNNING (not paused + # and not aborted). When elapsed running-time exceeds + # budget, fire motion_abort. Exits when _stop is set + # by the worker (motion completed normally). + ticks_at_100ms = 0 + budget_ticks = int(_budget / 0.1) + while not _stop.is_set(): + if motion_abort.is_set(): + return # worker / external abort already fired + if not motion_pause.is_set(): + ticks_at_100ms += 1 + if ticks_at_100ms >= budget_ticks: + log.warning( + "watchdog: %r exceeded %.1fs running time " + "— firing motion_abort", _cmd, _budget, + ) + motion_abort.set() + return + _stop.wait(0.1) + + wd_thread = None + if wd_active: + wd_thread = threading.Thread( + target=_watchdog_loop, daemon=True, + name="motion-watchdog", + ) + wd_thread.start() + + t0 = time.time() + had_error = False + err_reason = "" + try: + if self._on_command: self._on_command(cmd, "en") - except Exception as e: - log.error("on_command error: %s", e, exc_info=True) + except Exception as e: + had_error = True + err_reason = str(e) + log.error("worker on_command(%r) failed: %s", cmd, e, exc_info=True) + finally: + if wd_thread is not None: + wd_stop_evt.set() + # Best-effort join — the watchdog wakes on its + # 100ms tick, so this returns within ~100ms. + wd_thread.join(timeout=0.3) + elapsed = time.time() - t0 + + # Determine outcome. Stop itself never reports interrupted + # (it IS the interrupt). 
For other commands, an abort flag + # set at any point during execution means we were halted. + if had_error: + outcome = "error" + elif is_stop: + outcome = "complete" + elif motion_abort.is_set(): + outcome = "interrupted" + else: + outcome = "complete" + + try: + if (self._brain is not None + and self._stt.get("gemini_send_state_events", False)): + if outcome == "error": + self._brain.send_state( + "error", cmd, elapsed_sec=elapsed, reason=err_reason, + ) + else: + self._brain.send_state( + outcome, cmd, elapsed_sec=elapsed, + ) + except Exception: + pass + + log.info( + "motion worker: %s [%s] in %.2fs", cmd, outcome, elapsed, + ) + + # Structured per-motion END log — pairs with the START line + # above. Captures actual elapsed and outcome alongside the + # auto-parsed direction/magnitude/unit. + try: + _log_motion( + outcome, cmd, source="voice", + actual_sec=elapsed, + **({"reason": err_reason} if had_error else {}), + ) + except Exception: + pass + + self._current_motion = None + self._current_motion_at = 0.0 + + def _enqueue_motion(self, cmd: str) -> None: + """ + Queue a command for the motion worker. 'stop' has priority: + it sets motion_abort (interrupting the in-flight brain loop) + AND drains pending commands so they don't fire after the user + explicitly asked to halt. Non-stop commands are simply enqueued. + """ + if not cmd: + return + if cmd.strip().lower() == "stop": + # 1. Cut the in-flight motion immediately. Brain motion + # loops poll motion_abort.is_set() and break early. + # Also clear any pause — otherwise wait_while_paused + # would keep holding the brain inside its loop and the + # abort check wouldn't fire until pause clears. + motion_pause.clear() + motion_abort.set() + # 2. Drain pending — anything queued behind a stop is + # cancelled. We log the count for ops visibility. 
+ drained = 0 + try: + while True: + self._motion_queue.get_nowait() + drained += 1 + except queue.Empty: + pass + if drained: + log.info("stop drained %d pending motion(s)", drained) + # 3. Enqueue the stop itself so brain.process_command runs + # its proper stop handler (gradual_stop, neutral pose). + try: + self._motion_queue.put_nowait("stop") + except queue.Full: + log.warning("motion queue full — stop dropped") + return + + try: + self._motion_queue.put_nowait(cmd) + except queue.Full: + log.warning("motion queue full — dropping %r", cmd) + + @property + def current_motion(self) -> Optional[str]: + """The canonical command currently being executed by the brain, + or None if idle. Worker sets it before calling on_command and + clears it when the brain returns. Safe to read from any thread.""" + return self._current_motion # ─── start / stop ───────────────────────────────────── @@ -750,6 +1477,13 @@ class VoiceModule: log.warning("VoiceModule already running") return self._running = True + # Spawn the motion worker FIRST so any early dispatch (e.g. + # from a quick wake-on-startup) has somewhere to go. + self._motion_worker = threading.Thread( + target=self._motion_worker_loop, + daemon=True, name="motion-worker", + ) + self._motion_worker.start() self._thread = threading.Thread( target=self._voice_loop, daemon=True, name="voice", ) @@ -758,6 +1492,14 @@ class VoiceModule: def stop(self): self._running = False + # Wake the worker so it can see _running=False and exit. + try: + self._motion_queue.put_nowait(self._MOTION_SENTINEL) + except Exception: + pass + if self._motion_worker: + self._motion_worker.join(timeout=3) + self._motion_worker = None if self._thread: self._thread.join(timeout=5) self._thread = None diff --git a/Voice/number_words.py b/Voice/number_words.py new file mode 100644 index 0000000..5e4eb46 --- /dev/null +++ b/Voice/number_words.py @@ -0,0 +1,307 @@ +""" +number_words.py — convert spelled-out numbers to digits. 
+ +Used by Voice/marcus_voice.py to preprocess Gemini's spoken text +before the dispatcher scans it for parametric motion phrases. Reason: +some Gemini Live voices occasionally speak 'ninety degrees' instead +of '90 degrees' even when the persona prompt asks for digits, and +the parametric regexes only recognise digits. + +Scope (intentional): + - English compound numbers up to 999 ('three hundred sixty', 'one + hundred and eighty', 'forty-five'). + - Arabic ones/tens/hundreds with و-conjunction ('تسعون', 'تسعين', + 'مائة وثمانون', 'ثلاثمائة وستون', 'خمس', 'خمسة'). + - Word-boundary-aware so it does NOT eat the substring 'one' inside + 'someone' or 'thirty' inside 'thirtysomething'. + +Out-of-scope: + - Decimals ('two point five'). + - Numbers above 999 (motion vocab maxes around 360 degrees / 30 + meters / 100 steps; 999 is plenty of headroom). + - Ordinals ('first', 'الأول') — those don't appear in motion text. + +API: + >>> normalise_numbers("Walking forward five steps.") + 'Walking forward 5 steps.' + >>> normalise_numbers("أستدير تسعين درجة") + 'أستدير 90 درجة' + >>> normalise_numbers("turn three hundred sixty degrees") + 'turn 360 degrees' +""" +from __future__ import annotations + +import re + +# ─── English ───────────────────────────────────────────────────── +# +# Number-word tables loaded from Config/language_tables.json. Adding +# a new word (e.g. 'twenty-five' as a single hyphenated token, or a +# new dialectal form) is a JSON edit, not a Python change. +from Voice._language_tables import ( + english_numbers_ones, english_numbers_tens, + english_numbers_scale, english_numbers_glue, +) + +_EN_ONES = english_numbers_ones() +_EN_TENS = english_numbers_tens() +_EN_SCALE = english_numbers_scale() +_EN_GLUE = english_numbers_glue() + +_EN_NUMBER_TOKEN = ( + set(_EN_ONES.keys()) + | set(_EN_TENS.keys()) + | set(_EN_SCALE.keys()) + | _EN_GLUE +) + + +def _en_words_to_int(tokens: list) -> int: + """Convert a list of English number-word tokens to an integer. 
+ Tokens are lowercase and stripped. Glue ('and', '-') is ignored. + Returns the parsed int. Caller guarantees tokens are all in + _EN_NUMBER_TOKEN.""" + total = 0 + current = 0 + for t in tokens: + if t in _EN_GLUE: + continue + if t in _EN_ONES: + current += _EN_ONES[t] + elif t in _EN_TENS: + current += _EN_TENS[t] + elif t in _EN_SCALE: + current = max(current, 1) * _EN_SCALE[t] + total += current + current = 0 + return total + current + + +# Match a maximal run of English number-word tokens. Word-boundary on +# each side so we don't eat 'one' inside 'someone' or 'ten' inside +# 'often'. Allow hyphens between (twenty-five) and 'and' between scales. +_EN_RUN = re.compile( + r"(?:(?<=^)|(?<=[\s\.,!?;:\"\'\(\)\[\]\{\}\-]))" + r"((?:" + "|".join(re.escape(w) for w in sorted(_EN_NUMBER_TOKEN, key=len, reverse=True)) + + r")(?:[\s\-]+(?:" + "|".join(re.escape(w) for w in sorted(_EN_NUMBER_TOKEN, key=len, reverse=True)) + + r"))*)" + r"(?=$|[\s\.,!?;:\"\'\(\)\[\]\{\}\-])", + re.IGNORECASE, +) + + +def _normalise_english(text: str) -> str: + def _sub(match): + run = match.group(1) + # Tokenise the run, splitting on whitespace and hyphens. + toks = [t for t in re.split(r"[\s\-]+", run.lower()) if t] + # If the run is JUST a single glue word ('and'), don't replace. + digit_toks = [t for t in toks if t not in _EN_GLUE] + if not digit_toks: + return run + # If the run is exactly one ones-word and the surrounding context + # likely doesn't refer to a count (e.g. 'one' in 'one of the + # things'), still replace — it's safer to over-digitize for the + # parametric scan than under-digitize. The dispatcher only fires + # when the digit appears in a parametric pattern context anyway. + try: + value = _en_words_to_int(digit_toks) + except Exception: + return run + return str(value) + + return _EN_RUN.sub(_sub, text) + + +# ─── Arabic ────────────────────────────────────────────────────── +# +# Arabic spelled-out numbers. 
The list lives in +# Config/language_tables.json under arabic_numbers.literals. Adding +# new dialectal variants (Egyptian / Maghrebi specific forms) is a +# JSON edit, not a Python change. Order in the JSON is preserved here +# (longest-first so multi-token phrases match before their prefixes). + +from Voice._language_tables import arabic_number_literals + +_AR_LITERALS = arabic_number_literals() + + +def _normalise_arabic(text: str) -> str: + # Apply longest first so multi-word values claim before singles. + sorted_lits = sorted(_AR_LITERALS, key=lambda p: len(p[0]), reverse=True) + for word, value in sorted_lits: + # Substring replacement is safe for Arabic here because our + # literals are full Arabic tokens with characters outside + # English wordspace; no risk of eating into other words. Still + # use word boundaries (whitespace OR start/end OR punctuation) + # to avoid eating 'خمس' inside 'خمسة'. + pattern = ( + r"(? str: + """Convert spelled-out numbers to digits in `text`. Idempotent — + running it twice produces the same result. Returns the original + text unchanged if it contains no recognisable number words. + + Order: integer-words pass FIRST (so 'three and a half' becomes + '3 and a half' before fraction handling), then fractions ('3 and + a half' → '3.5'). Without that order the fraction pass wouldn't + see digits to attach to.""" + if not text: + return text + out = _normalise_english(text) + out = _normalise_arabic(out) + out = _apply_fractions(out) + return out + + +# ─── Fraction parser (English + Arabic) ────────────────────────── +# +# Tables loaded from Config/language_tables.json. Two flavours: +# - additive : combines with a preceding digit ('3 and a half' +# / '3 ونصف' → 3.5). Includes a special-case +# handler for Arabic 'N ونصف' where the +# fraction trails after the unit noun. +# - leading : standalone before a unit ('half a meter' / 'نصف +# متر' → 0.5 meter / 0.5 متر). 
+ +from Voice._language_tables import ( + english_fractions_additive, english_fractions_leading, + arabic_fractions_additive, arabic_fractions_leading, + arabic_unit_words, +) + +_EN_FRAC_ADD = english_fractions_additive() +_EN_FRAC_LEADING = english_fractions_leading() +_AR_FRAC_ADD = arabic_fractions_additive() +_AR_FRAC_LEADING = arabic_fractions_leading() +_AR_UNITS = arabic_unit_words() + + +def _fmt_decimal(v: float) -> str: + """Format a float without trailing zeros: 3.0 → '3', 3.5 → '3.5', + 1.79 → '1.79'.""" + if v == int(v): + return str(int(v)) + s = "{:.4f}".format(v).rstrip("0").rstrip(".") + return s + + +def _apply_fractions(text: str) -> str: + """Convert fractional expressions to decimal digits. + + Patterns handled (all idempotent — leaves digit-only text alone): + + EN '3 and a half steps' / '3 and half' → '3.5 steps' + EN 'half a meter' / 'half meter' → '0.5 meter' + EN 'a quarter step' → '0.25 step' + + AR '3 ونصف خطوات' → '3.5 خطوات' + AR '3 خطوات ونصف' (trailing fraction) → '3.5 خطوات' + AR 'نصف متر' → '0.5 متر' + AR 'متر ونصف' (no preceding count) → '0.5 متر' (interpreted as 1.5 + if a leading 1 is implicit; we + treat as 0.5 — explicit form + '1 ونصف متر' is preferred). + """ + if not text: + return text + + # ── ENGLISH additive: ' and (a/an)? ' ───────────────── + en_add_alt = "|".join(re.escape(k) for k in sorted(_EN_FRAC_ADD.keys(), key=len, reverse=True)) + if en_add_alt: + def _en_add(m): + n = float(m.group(1)) + frac_word = m.group(2).lower() + return _fmt_decimal(n + _EN_FRAC_ADD[frac_word]) + text = re.sub( + r"\b(\d+(?:\.\d+)?)\s+and\s+(?:an?\s+)?(" + en_add_alt + r")\b", + _en_add, text, flags=re.IGNORECASE, + ) + + # ── ENGLISH leading: '(a/an)? (a/an)? 
' ─────────── + en_lead_alt = "|".join(re.escape(k) for k in sorted(_EN_FRAC_LEADING.keys(), key=len, reverse=True)) + if en_lead_alt: + def _en_lead(m): + frac_word = m.group(1).lower() + return _fmt_decimal(_EN_FRAC_LEADING[frac_word]) + " " + text = re.sub( + r"\b(?:an?\s+)?(" + en_lead_alt + r")\s+(?:an?\s+)?(?=[A-Za-z])", + _en_lead, text, flags=re.IGNORECASE, + ) + + # ── ARABIC trailing-fraction with unit: 'N ونصف' ───────── + if _AR_FRAC_ADD and _AR_UNITS: + unit_alt = "|".join(re.escape(u) for u in sorted(_AR_UNITS, key=len, reverse=True)) + ar_add_alt = "|".join(re.escape(k) for k in sorted(_AR_FRAC_ADD.keys(), key=len, reverse=True)) + def _ar_trail(m): + n = float(m.group(1)) + unit = m.group(2) + frac_word = m.group(3) + return "{} {}".format(_fmt_decimal(n + _AR_FRAC_ADD[frac_word]), unit) + text = re.sub( + r"(\d+(?:\.\d+)?)\s+(" + unit_alt + r")\s+و(" + ar_add_alt + r")\b", + _ar_trail, text, + ) + + # ── ARABIC additive: 'N ونصف' / 'N و نصف' ────────────────────── + if _AR_FRAC_ADD: + ar_add_alt = "|".join(re.escape(k) for k in sorted(_AR_FRAC_ADD.keys(), key=len, reverse=True)) + def _ar_add(m): + n = float(m.group(1)) + frac_word = m.group(2) + return _fmt_decimal(n + _AR_FRAC_ADD[frac_word]) + text = re.sub( + r"(\d+(?:\.\d+)?)\s*و\s*(" + ar_add_alt + r")\b", + _ar_add, text, + ) + + # ── ARABIC leading: ' ' ──────────────────────────── + if _AR_FRAC_LEADING: + ar_lead_alt = "|".join(re.escape(k) for k in sorted(_AR_FRAC_LEADING.keys(), key=len, reverse=True)) + def _ar_lead(m): + frac_word = m.group(1) + return _fmt_decimal(_AR_FRAC_LEADING[frac_word]) + " " + # Boundary: previous char is non-Arabic-letter, next char is + # an Arabic letter (so 'نصف' inside another word is preserved). + text = re.sub( + r"(? 
{got!r}") + print(f"\n{ok}/{ok+bad} passed") diff --git a/Voice/sequences.py b/Voice/sequences.py new file mode 100644 index 0000000..9b5b46a --- /dev/null +++ b/Voice/sequences.py @@ -0,0 +1,251 @@ +""" +sequences.py — user-defined motion macros (record / save / play). + +A sequence is a named list of canonical motion commands. Saved as JSON +under `Data/Sequences/.json`. Created via voice during a recording +session (the user dictates a series of motions, then names the macro); +played back later by the same name. Sequence content is the SAME format +the dispatcher already enqueues — strings like 'turn left 90 degrees', +'walk forward 5 steps', 'sit down' — so playback simply re-enqueues +each one through the existing motion worker. + +State model: + IDLE — no recording in progress. + RECORDING(name) — capturing dispatched commands into a buffer. + The optional `name` is set if the user pre-named + the sequence ('start recording as my-greet'); if + not, the user must name it on save. + PLAYING(name, queue) — currently re-issuing commands from `queue`. + (Tracked just for status; the work is done by + the motion worker, not this module.) + +Threading: + All state mutations are guarded by `_lock`. The recording buffer is + appended from the dispatcher thread; saved/loaded from any thread. 
+ +Public API (used by Voice/marcus_voice.py): + Sequences() — singleton + seq.start_recording(name=None) — begin capture + seq.cancel_recording() — drop without saving + seq.save_recording(name) — finalise + write JSON + seq.record_command(canonical) — append to buffer if recording + seq.play(name) -> list[str] | None — return commands to enqueue + seq.list() -> list[str] — saved names + seq.delete(name) -> bool — remove file + seq.recording_state() -> dict — for status / persona feedback +""" +from __future__ import annotations + +import json +import os +import re +import threading +import time +from typing import Optional, List + +# Resolve Data/Sequences/ off PROJECT_ROOT so this module works whether +# Marcus is run from the repo root or invoked via systemd. PROJECT_ROOT +# is set by Core/env_loader. +try: + from Core.env_loader import PROJECT_ROOT # type: ignore +except Exception: + PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + +_SEQUENCE_DIR = os.path.join(PROJECT_ROOT, "Data", "Sequences") +_NAME_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,30}$", re.I) + +# Skip these canonicals when capturing — control commands, not motions. +# Loaded from Config/language_tables.json::sequence_never_record.canonicals. +# Adding a new control canonical that should be excluded from recordings +# is a JSON edit, not a Python change. +from Voice._language_tables import sequence_never_record as _load_never_record +_NEVER_RECORD = _load_never_record() + + +def _safe_name(name: str) -> Optional[str]: + """Normalise + validate a sequence name. Returns None if invalid. 
+ Forces lower-case so 'My-Greet' and 'my-greet' refer to the same + file — voice transcripts often vary capitalisation.""" + if not name: + return None + n = name.strip().lower().replace(" ", "-") + if not _NAME_RE.match(n): + return None + return n + + +class Sequences: + def __init__(self): + self._lock = threading.Lock() + self._recording: bool = False + self._record_name: Optional[str] = None + self._record_buf: List[str] = [] + self._record_started_at: float = 0.0 + os.makedirs(_SEQUENCE_DIR, exist_ok=True) + + # ─── recording ───────────────────────────────────────── + + def start_recording(self, name: Optional[str] = None) -> dict: + """Begin capturing. If `name` provided + valid, sequence is + pre-named (save_recording can be called with no arg). Returns + a small dict suitable for logging / state-injection.""" + with self._lock: + if self._recording: + return {"ok": False, "reason": "already recording", + "name": self._record_name} + self._recording = True + self._record_name = _safe_name(name) if name else None + self._record_buf = [] + self._record_started_at = time.time() + return {"ok": True, "name": self._record_name} + + def cancel_recording(self) -> dict: + with self._lock: + if not self._recording: + return {"ok": False, "reason": "not recording"} + n = self._record_name + count = len(self._record_buf) + self._recording = False + self._record_name = None + self._record_buf = [] + self._record_started_at = 0.0 + return {"ok": True, "name": n, "discarded": count} + + def save_recording(self, name: Optional[str] = None) -> dict: + """Persist the buffered commands as .json. Resolves the + name in this order: + 1. explicit `name` arg + 2. name set by start_recording + 3. None → error. 
+ Empty buffers are NOT saved (would produce a dud macro).""" + with self._lock: + if not self._recording: + return {"ok": False, "reason": "not recording"} + target = _safe_name(name) or self._record_name + if not target: + return {"ok": False, "reason": "no name supplied"} + if not self._record_buf: + self._recording = False + self._record_name = None + return {"ok": False, "reason": "empty sequence (nothing recorded)", + "name": target} + payload = { + "name": target, + "created_at": self._record_started_at, + "saved_at": time.time(), + "commands": list(self._record_buf), + } + path = os.path.join(_SEQUENCE_DIR, target + ".json") + try: + with open(path, "w", encoding="utf-8") as f: + json.dump(payload, f, indent=2, ensure_ascii=False) + except Exception as e: + return {"ok": False, "reason": "write failed: %s" % e} + count = len(self._record_buf) + self._recording = False + self._record_name = None + self._record_buf = [] + self._record_started_at = 0.0 + return {"ok": True, "name": target, "count": count, "path": path} + + def record_command(self, canonical: str) -> bool: + """Append a canonical to the recording buffer (if recording). + Returns True if appended, False otherwise. Skips control commands + (see _NEVER_RECORD).""" + if not canonical: + return False + with self._lock: + if not self._recording: + return False + c = canonical.strip() + if not c or c.lower() in _NEVER_RECORD: + return False + self._record_buf.append(c) + return True + + # ─── playback ───────────────────────────────────────── + + def play(self, name: str) -> Optional[List[str]]: + """Load the sequence's commands. Caller is responsible for + enqueueing them through the motion worker. 
Returns None if the + sequence doesn't exist or is malformed.""" + target = _safe_name(name) + if not target: + return None + path = os.path.join(_SEQUENCE_DIR, target + ".json") + if not os.path.isfile(path): + return None + try: + with open(path, "r", encoding="utf-8") as f: + payload = json.load(f) + except Exception: + return None + cmds = payload.get("commands") + if not isinstance(cmds, list) or not all(isinstance(c, str) for c in cmds): + return None + return [c for c in cmds if c.strip()] + + def list(self) -> List[str]: + """Sorted list of saved sequence names.""" + try: + files = os.listdir(_SEQUENCE_DIR) + except FileNotFoundError: + return [] + out = sorted( + os.path.splitext(f)[0] + for f in files + if f.endswith(".json") and _NAME_RE.match(os.path.splitext(f)[0]) + ) + return out + + def delete(self, name: str) -> bool: + target = _safe_name(name) + if not target: + return False + path = os.path.join(_SEQUENCE_DIR, target + ".json") + if not os.path.isfile(path): + return False + try: + os.remove(path) + return True + except Exception: + return False + + # ─── status ─────────────────────────────────────────── + + def recording_state(self) -> dict: + with self._lock: + return { + "recording": self._recording, + "name": self._record_name, + "count": len(self._record_buf), + "elapsed_sec": (time.time() - self._record_started_at + if self._recording else 0.0), + } + + +# Process-wide singleton — the dispatcher and the worker share one. 
+_INSTANCE: Optional[Sequences] = None +_INSTANCE_LOCK = threading.Lock() + + +def get_sequences() -> Sequences: + global _INSTANCE + with _INSTANCE_LOCK: + if _INSTANCE is None: + _INSTANCE = Sequences() + return _INSTANCE + + +# Standalone smoke check +if __name__ == "__main__": + s = Sequences() + print(s.start_recording("test-greet")) + print(s.record_command("turn right 45 degrees")) + print(s.record_command("wave hello")) + print(s.record_command("turn left 45 degrees")) + print(s.save_recording()) + print("listed:", s.list()) + print("play:", s.play("test-greet")) + print(s.delete("test-greet")) + print("after delete:", s.list()) diff --git a/logs/command_history.json b/logs/command_history.json new file mode 100644 index 0000000..a808cef --- /dev/null +++ b/logs/command_history.json @@ -0,0 +1,15 @@ +{ + "commands": [ + "move backward", + "resume motion", + "come here", + "turn right 90 degrees", + "walk 1 steps", + "stop", + "walk backward 20 steps", + "stop", + "stop", + "come here" + ], + "saved_at": 1777353109.489595 +} \ No newline at end of file