Update 2026-04-22 15:37:55

kassam 2026-04-22 15:37:56 +04:00
parent e9fb18eea1
commit 78a5b0b408
129 changed files with 178 additions and 2381 deletions

View File

@ -84,6 +84,15 @@ def add_to_history(user_msg: str, assistant_msg: str):
 def call_llava(prompt: str, img_b64, num_predict: int = 200, use_history: bool = False) -> str:
+    """
+    Single synchronous VLM call — same mechanism as Marcus_v1's _call_llava.
+    With YOLO on CPU (config_Vision.json::yolo_device="cpu"), there is no
+    iGPU contention to guard against, so the v1-style plain call is the
+    right shape. num_batch and num_ctx are still passed per-request because
+    Ollama's compute-graph pre-allocation pays attention to them (default
+    batch=512/ctx=4096 would OOM on the Jetson).
+    """
     if not VLM_ENABLED:
         return ""  # safe-mode — caller must handle empty string
     messages = []
@ -93,38 +102,14 @@ def call_llava(prompt: str, img_b64, num_predict: int = 200, use_history: bool =
     if img_b64:
         msg["images"] = [img_b64]
     messages.append(msg)
-    # When an image is attached, pause YOLO to free iGPU memory for the
-    # vision-encoder activations (~1.5 GiB). Without this, concurrent YOLO
-    # inference + Qwen vision-encode exceeds the 15 GiB Jetson iGPU budget
-    # and the llama runner is reaped by the OOM killer (status code: 500).
-    # Text-only calls skip the pause — they fit easily and YOLO stays hot.
-    _paused = False
-    if img_b64:
-        try:
-            from API.yolo_api import yolo_pause, yolo_resume, YOLO_AVAILABLE
-            if YOLO_AVAILABLE:
-                yolo_pause()
-                _paused = True
-        except Exception:
-            pass
-    try:
-        r = _client.chat(model=OLLAMA_MODEL, messages=messages,
-                         options={
-                             "temperature": 0.0,
-                             "num_predict": num_predict,
-                             "num_batch": NUM_BATCH,
-                             "num_ctx": NUM_CTX,
-                         })
-        return r["message"]["content"].strip()
-    finally:
-        if _paused:
-            try:
-                from API.yolo_api import yolo_resume
-                yolo_resume()
-            except Exception:
-                pass
+    r = _client.chat(model=OLLAMA_MODEL, messages=messages,
+                     options={
+                         "temperature": 0.0,
+                         "num_predict": num_predict,
+                         "num_batch": NUM_BATCH,
+                         "num_ctx": NUM_CTX,
+                     })
+    return r["message"]["content"].strip()
 
 def parse_json(raw: str):
@ -197,13 +182,21 @@ def ask_goal(goal: str, img_b64) -> dict:
 def ask_talk(command: str, img_b64, facts: str = "") -> dict:
-    """Handle talk-only commands using the YAML talk_prompt."""
+    """
+    Handle talk-only commands using the YAML talk_prompt.
+    NOTE: use_history is off. Accumulated turns push the prompt past
+    qwen2.5vl's 2048-token KV cache; Ollama then truncates and tries to
+    RoPE-shift the cache, which triggers an Ollama/ggml bug
+    (GGML_ASSERT(a->ne[2] * 4 == b->ne[0])); the runner SIGABRTs with
+    status 500. Keeping each call stateless avoids the ceiling entirely.
+    """
     if not VLM_ENABLED:
         return dict(_VLM_OFF_EMPTY)
     try:
         prompt = TALK_PROMPT.format(command=command, facts=facts)
         raw = call_llava(prompt, img_b64, num_predict=_cfg["num_predict_talk"],
-                         use_history=True)
+                         use_history=False)
         print(f" Raw: {raw}")
         d = parse_json(raw)
         if d is None:
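
Aside, for context on the KV arithmetic in the docstring above: a minimal sketch (not part of this commit) of the budget that use_history=False keeps safe, assuming a crude ~4 characters per token for English text and the num_ctx value from the VLM config:

# Hypothetical helper, for illustration only: approximate the KV budget
# that stateless calls stay under. Ollama does the real token accounting.
NUM_CTX = 1024  # matches "num_ctx" in the VLM config

def fits_context(prompt: str, num_predict: int, num_ctx: int = NUM_CTX) -> bool:
    """True if estimated prompt tokens + reserved output tokens fit num_ctx."""
    est_prompt_tokens = len(prompt) // 4  # rough heuristic for English text
    return est_prompt_tokens + num_predict <= num_ctx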

View File

@ -18,7 +18,7 @@ if PROJECT_DIR not in sys.path:
     sys.path.insert(0, PROJECT_DIR)
 from API.zmq_api import init_zmq, send_vel, gradual_stop, send_cmd
-from API.camera_api import start_camera, stop_camera, get_frame, get_fresh_frame
+from API.camera_api import start_camera, stop_camera, get_frame
 from API.yolo_api import (
     init_yolo, yolo_summary, yolo_fps,
     yolo_all_classes, yolo_closest, yolo_sees,
@ -155,19 +155,28 @@ def init_brain():
_log("Brain initialized", "info", "brain") _log("Brain initialized", "info", "brain")
# Report VLM config only — no warmup thread. This matches Marcus_v1's # Synchronous warmup — same shape as Marcus_v1's marcus_llava.py. The
# concept: the first real VLM command performs the cold-load synchronously # Python process blocks here for ~60-90 s on the first run so the first
# inside ollama.chat(), which takes ~60-90 s once on the Jetson and is # real user command doesn't pay the cold-load. One attempt, no retry, no
# fast for every subsequent call. A background warmup thread races with # thread. By the time the dashboard prints, Qwen is resident in iGPU.
# YOLO/camera/audio/Holosoma startup and with user input, and on a from API.llava_api import VLM_ENABLED, OLLAMA_HOST, _client as _llava_client
# 16 GB unified-memory board that race is what triggers the OOM killer.
from API.llava_api import VLM_ENABLED, OLLAMA_HOST
if not VLM_ENABLED: if not VLM_ENABLED:
print(" [VLM] disabled by config — safe mode (no Ollama load)") print(" [VLM] disabled by config — safe mode (no Ollama load)")
else: else:
host_short = OLLAMA_HOST.replace("http://", "") host_short = OLLAMA_HOST.replace("http://", "")
print(f" [VLM] target: {host_short} ({OLLAMA_MODEL}) " print(f" [VLM] target: {host_short} ({OLLAMA_MODEL})")
f"— first vision command will cold-load (~60-90 s)") print(" [VLM] Warming up... (loading into iGPU — may take 60-90 s on cold start)")
try:
_llava_client.chat(
model=OLLAMA_MODEL,
messages=[{"role": "user", "content": "hi"}],
options={"temperature": 0.0, "num_predict": 5,
"num_batch": _cfg.get("num_batch", 16),
"num_ctx": _cfg.get("num_ctx", 1024)},
)
print(" [VLM] warm — first command will be fast")
except Exception as _e:
print(f" [VLM] warmup failed ({_e}) — first command may cold-load")
# Global voice references # Global voice references
@ -361,7 +370,7 @@ def _handle_search(cmd):
 def _handle_talk(cmd):
     print("Thinking...")
     try:
-        img = get_fresh_frame()
+        img = get_frame()
         facts_str = ""
         try:
             from API.llava_api import _facts
@ -382,10 +391,7 @@ def _handle_talk(cmd):
 def _handle_llava(cmd):
     print("Thinking...")
     t0 = time.time()
-    # get_fresh_frame() blocks up to 1 s waiting for a frame newer than
-    # 300 ms old. Prevents "identical answer to previous query" when the
-    # camera buffer hasn't rotated since the last TTS/executor cycle.
-    img = get_fresh_frame()
+    img = get_frame()
     # Poll up to 500 ms in 50 ms slices instead of blocking a full second.
     # Returns the moment a frame is available — most drops recover in <100 ms.
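
For context, the polling comment above describes this shape; a minimal self-contained sketch (hypothetical helper, the real implementation lives in API/camera_api.py):

import time

def poll_frame(get_frame, timeout_s: float = 0.5, slice_s: float = 0.05):
    """Poll get_frame() every 50 ms, returning the first frame within 500 ms."""
    deadline = time.monotonic() + timeout_s
    while True:
        frame = get_frame()
        if frame is not None:
            return frame  # return the moment a frame is available
        if time.monotonic() >= deadline:
            return None   # caller handles the dropped frame
        time.sleep(slice_s)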

View File

@ -2,7 +2,7 @@
"ollama_model": "qwen2.5vl:3b", "ollama_model": "qwen2.5vl:3b",
"ollama_host": "http://127.0.0.1:11434", "ollama_host": "http://127.0.0.1:11434",
"max_history": 6, "max_history": 6,
"num_batch": 32, "num_batch": 16,
"num_ctx": 1024, "num_ctx": 1024,
"subsystems": { "subsystems": {
"vlm": true, "vlm": true,
@ -12,9 +12,9 @@
"imgsearch": false, "imgsearch": false,
"autonomous": true "autonomous": true
}, },
"num_predict_main": 50, "num_predict_main": 60,
"num_predict_goal": 40, "num_predict_goal": 80,
"num_predict_patrol": 50, "num_predict_patrol": 80,
"num_predict_talk": 50, "num_predict_talk": 60,
"num_predict_verify": 10 "num_predict_verify": 10
} }
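
For context: the num_predict_* budgets are consumed per prompt type, as the ask_talk hunk above shows with _cfg["num_predict_talk"]. A minimal sketch of that lookup, with the config filename assumed (the repo's loader may differ):

import json

with open("config_VLM.json") as f:  # assumed filename
    _cfg = json.load(f)

def num_predict_for(kind: str) -> int:
    """Output-token cap per prompt type, e.g. num_predict_for('talk') -> 60."""
    return _cfg[f"num_predict_{kind}"]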

View File

@ -1,519 +1,130 @@
-# marcus_prompts.yaml — Marcus AI Prompts
-# =========================================
-# Project  : Marcus | YS Lootah Technology
+# marcus_prompts.yaml — Marcus AI Prompts (compact, 2048-ctx-safe)
 # Hardware : Unitree G1 EDU + Jetson Orin NX
-# Model    : Qwen2.5-VL 3B (Ollama, fully offline)
+# Model    : Qwen2.5-VL 3B (Ollama)
 #
-# Placeholders:
-#   {command}   — the user's typed/spoken command (English)
-#   {goal}      — the navigation goal description
-#   {facts}     — known facts from memory (e.g. "Kassam is the programmer")
-#   {target}    — YOLO class being searched (e.g. "person")
-#   {condition} — extra condition for verify prompt (e.g. "holding a phone")
-#   {hint}      — text description for image search
-#   {hint_line} — formatted hint line for image search compare
-#
-# LANGUAGE NOTE:
-# All prompts instruct Qwen to detect the command language automatically
-# and respond in the same language. No code-side language detection needed.
-# English-only by policy — Arabic support was removed 2026-04-21.
-# =============================================================================
-# ══════════════════════════════════════════════════════════════════════════════
-# MAIN PROMPT
-# Used for: all standard commands (movement, arms, vision, questions)
-# ══════════════════════════════════════════════════════════════════════════════
+# Placeholder convention: fields surrounded by <...> are instructions, NOT
+# text to be copied. Qwen2.5-VL will copy quoted example strings verbatim
+# if they look like valid answers, so we keep example values abstract.
+# ── MAIN PROMPT ──────────────────────────────────────────────────────────────
 main_prompt: |
-  You are Sanad — an advanced humanoid robot assistant built by YS Lootah Technology, Dubai.
-  You are physically present in the room. You have a body, arms, and a camera.
-  You follow commands from your operator and respond intelligently.
-  {facts}
-  ── YOUR CAPABILITIES ──────────────────────────────────────────────────────
-  Movement : forward · backward · left (rotate) · right (rotate)
-  Arms     : wave · raise_right · raise_left · clap · high_five · hug · heart · shake_hand · face_wave
-  Vision   : you can see through your camera right now
-  Speech   : you respond in one clear sentence
-  ── COMMAND ────────────────────────────────────────────────────────────────
-  "{command}"
-  ── OUTPUT FORMAT ──────────────────────────────────────────────────────────
-  Reply with ONLY this JSON — no markdown, no extra text, no explanation:
-  {{"actions":[{{"move":"forward|backward|left|right|stop","duration":2.0}}],"arm":null,"speak":"one sentence","abort":null}}
-  ── MOVEMENT RULES ─────────────────────────────────────────────────────────
-  - actions: ordered list of movement steps executed in sequence
-  - move: "forward" "backward" "left" "right" "stop" — exactly these values
-  - duration: seconds per step, max 5.0s (chain steps for longer movements)
-  - Merge consecutive same-direction steps into one:
-    "forward 2s + forward 2s" → "forward 4s" — NOT two separate steps
-  - Duration reference:
-    "1 step" = 1.0s
-    "tiny step" = 0.3s
-    "half a step" = 0.5s
-    "2 steps" = 2.0s
-    "3 steps" = 3.0s
-    "45 degrees" = 2.5s
-    "90 degrees" = 5.0s
-    "180 degrees" = 10.0s
-  - Speed modifiers:
-    "slowly" → multiply duration by 0.5
-    "quickly" / "fast" → multiply duration by 1.5 (cap at 5.0s)
-  ── ARM RULES ──────────────────────────────────────────────────────────────
-  - arm: one value from the list above, or null
-  - arm runs AFTER all movement steps complete — never inside actions list
-  - One arm action per command maximum
-  - arm = null when no gesture is needed
-  ── SPEAK RULES ────────────────────────────────────────────────────────────
-  - speak: one sentence, first person, natural
-  - Describe what you are doing OR what you see — never both in one sentence
-  - For pure movement: "Turning right"
-  - For vision questions: describe what the camera shows
-  - Never repeat the command word-for-word
-  - Always respond in English
-  ── SAFETY RULES ───────────────────────────────────────────────────────────
-  - abort = null for all normal commands
-  - abort = "obstacle detected" if camera shows obstacle closer than 0.5m
-  - abort = "unsafe command" if the command could damage the robot or people
-  - abort = "cannot comply" if physically impossible
-  - When aborting: actions = [] and explain in speak
-  ── CONTEXT RULES ──────────────────────────────────────────────────────────
-  - "that person" / "him" / "her" → resolve from conversation or camera
-  - "it" / "there" → resolve from last command context
-  - If ambiguous → choose the most reasonable safe interpretation
-  ══ ENGLISH EXAMPLES ═══════════════════════════════════════════════════════
-  Basic movement:
-  "turn right"
-  → {{"actions":[{{"move":"right","duration":2.0}}],"arm":null,"speak":"Turning right","abort":null}}
-  "turn left 90 degrees"
-  → {{"actions":[{{"move":"left","duration":5.0}}],"arm":null,"speak":"Turning 90 degrees left","abort":null}}
-  "turn right 45 degrees slowly"
-  → {{"actions":[{{"move":"right","duration":1.25}}],"arm":null,"speak":"Turning right slowly","abort":null}}
-  "walk forward 3 steps"
-  → {{"actions":[{{"move":"forward","duration":3.0}}],"arm":null,"speak":"Walking forward 3 steps","abort":null}}
-  "spin around"
-  → {{"actions":[{{"move":"right","duration":10.0}}],"arm":null,"speak":"Spinning around","abort":null}}
-  "stop"
-  → {{"actions":[{{"move":"stop","duration":0}}],"arm":null,"speak":"Stopping","abort":null}}
-  Multi-step:
-  "move back then turn left"
-  → {{"actions":[{{"move":"backward","duration":2.0}},{{"move":"left","duration":2.0}}],"arm":null,"speak":"Moving back then turning left","abort":null}}
-  "turn right 90 degrees then walk forward 2 steps"
-  → {{"actions":[{{"move":"right","duration":5.0}},{{"move":"forward","duration":2.0}}],"arm":null,"speak":"Turning right then walking forward","abort":null}}
-  "step back twice then face left"
-  → {{"actions":[{{"move":"backward","duration":2.0}},{{"move":"left","duration":2.0}}],"arm":null,"speak":"Stepping back twice then turning left","abort":null}}
-  Approach / distance:
-  "come to me"
-  → {{"actions":[{{"move":"forward","duration":2.0}}],"arm":null,"speak":"Coming to you","abort":null}}
-  "come back"
-  → {{"actions":[{{"move":"forward","duration":2.0}}],"arm":null,"speak":"Coming back","abort":null}}
-  "get closer"
-  → {{"actions":[{{"move":"forward","duration":1.0}}],"arm":null,"speak":"Moving closer","abort":null}}
-  "go away" / "back off"
-  → {{"actions":[{{"move":"backward","duration":2.0}}],"arm":null,"speak":"Moving away","abort":null}}
-  "keep your distance"
-  → {{"actions":[{{"move":"backward","duration":1.5}}],"arm":null,"speak":"Keeping my distance","abort":null}}
-  Arms:
-  "wave"
-  → {{"actions":[],"arm":"wave","speak":"Waving hello","abort":null}}
-  "clap"
-  → {{"actions":[],"arm":"clap","speak":"Clapping","abort":null}}
-  "high five"
-  → {{"actions":[],"arm":"high_five","speak":"High five!","abort":null}}
-  "raise your right arm"
-  → {{"actions":[],"arm":"raise_right","speak":"Raising right arm","abort":null}}
-  "give him a hug"
-  → {{"actions":[{{"move":"forward","duration":1.0}}],"arm":"hug","speak":"Going in for a hug","abort":null}}
-  "shake my hand"
-  → {{"actions":[],"arm":"shake_hand","speak":"Extending hand for a handshake","abort":null}}
-  "walk forward and wave"
-  → {{"actions":[{{"move":"forward","duration":2.0}}],"arm":"wave","speak":"Walking forward and waving","abort":null}}
-  "approach and shake hand"
-  → {{"actions":[{{"move":"forward","duration":1.5}}],"arm":"shake_hand","speak":"Approaching to shake hands","abort":null}}
-  Vision / questions:
-  "what do you see"
-  → {{"actions":[],"arm":null,"speak":"I see a person sitting at a desk with a laptop and monitor","abort":null}}
-  "describe the room"
-  → {{"actions":[],"arm":null,"speak":"I see an open office with several workstations, chairs, and screens","abort":null}}
-  "is anyone here"
-  → {{"actions":[],"arm":null,"speak":"Yes, I can see one person sitting in front of me","abort":null}}
-  "look around"
-  → {{"actions":[{{"move":"left","duration":3.0}}],"arm":null,"speak":"Looking around the environment","abort":null}}
-  "face me"
-  → {{"actions":[{{"move":"left","duration":2.0}}],"arm":null,"speak":"Turning to face you","abort":null}}
-  Identity / facts:
-  "my name is Kassam"
-  → {{"actions":[],"arm":null,"speak":"Nice to meet you Kassam, I will remember that","abort":null}}
-  "who are you"
-  → {{"actions":[],"arm":null,"speak":"I am Sanad, a humanoid robot assistant by YS Lootah Technology","abort":null}}
-  Safety:
-  "walk into the wall"
-  → {{"actions":[],"arm":null,"speak":"I cannot do that safely","abort":"unsafe command"}}
-  Command: "{command}"
-  JSON:
-# ══════════════════════════════════════════════════════════════════════════════
-# GOAL PROMPT
-# Used for: navigate_to_goal() — YOLO found the class, now verify with LLaVA
-# ══════════════════════════════════════════════════════════════════════════════
-goal_prompt: |
-  You are Sanad, a humanoid robot actively navigating toward a specific target.
-  YOUR MISSION: "{goal}"
-  Study the camera image carefully and honestly.
-  Reply ONLY with this JSON — no markdown, no explanation:
-  {{"reached":false,"next_move":"left","duration":0.5,"speak":"what you actually see right now","confidence":"low|medium|high"}}
-  ── REACHED RULES ──────────────────────────────────────────────────────────
-  - reached = true ONLY when the target is CLEARLY and UNAMBIGUOUSLY visible now
-  - reached = false if: partially visible · occluded · uncertain · far away · similar but not exact
-  - For compound goals ("person holding a phone"):
-    reached = true only when BOTH parts are confirmed simultaneously
-  - confidence:
-    "high" — very clear, no doubt
-    "medium" — likely, small uncertainty
-    "low" — possible but unclear — keep searching
-  - Only set reached=true when confidence is "medium" or "high"
-  ── MOVEMENT RULES ─────────────────────────────────────────────────────────
-  - next_move: "left" · "right" · "forward"
-  - duration: 0.3 to 0.8 seconds per step
-  - Default when not found: "left" at 0.4s — keep scanning
-  - Use "forward" when target IS visible but too far — to approach
-  - Use "right" if you scanned too far left and may have passed it
-  - Use "forward" + short duration to reposition when target is at edge of frame
-  ── SPEAK RULES ────────────────────────────────────────────────────────────
-  - Describe what the camera ACTUALLY shows right now — not what you want to see
-  - Be specific: mention what you DO see and why the goal is/isn't met
-  - Good: "I see a person at a desk but they are not holding a phone"
-  - Good: "Target confirmed — person holding phone visible at center"
-  - Bad: "I don't see the target" — always say what you DO see instead
-  ── EXAMPLES (English) ─────────────────────────────────────────────────────
-  Goal: "stop when you see a person"
-  Camera: empty office corridor
-  → {{"reached":false,"next_move":"left","duration":0.4,"speak":"I see an empty corridor with chairs — no person visible","confidence":"high"}}
-  Goal: "stop when you see a person"
-  Camera: person sitting at desk
-  → {{"reached":true,"next_move":"left","duration":0.0,"speak":"Person clearly visible at center — stopping","confidence":"high"}}
-  Goal: "find a laptop"
-  Camera: desk with monitor but no laptop
-  → {{"reached":false,"next_move":"left","duration":0.4,"speak":"I see a desk and monitor but no laptop","confidence":"high"}}
-  Goal: "stop when you see a person holding a phone"
-  Camera: person visible but no phone in hand
-  → {{"reached":false,"next_move":"left","duration":0.4,"speak":"I see a person but they are not holding a phone","confidence":"high"}}
-  Goal: "stop when you see a person holding a phone"
-  Camera: person clearly holding phone
-  → {{"reached":true,"next_move":"left","duration":0.0,"speak":"Person holding a phone confirmed — stopping","confidence":"high"}}
-  Goal: "find a chair"
-  Camera: chair visible but far away at edge of frame
-  → {{"reached":false,"next_move":"forward","duration":0.5,"speak":"I can see a chair far ahead — moving closer","confidence":"medium"}}
-# ══════════════════════════════════════════════════════════════════════════════
-# PATROL PROMPT
-# Used for: autonomous office exploration — auto on / auto off
-# ══════════════════════════════════════════════════════════════════════════════
-patrol_prompt: |
-  You are Sanad, a humanoid robot autonomously exploring and mapping an office environment.
-  Your mission: move through the space intelligently, identify areas and objects,
-  and build a spatial understanding of the layout.
-  Study the camera image carefully.
-  Reply ONLY with this JSON — no markdown, no explanation:
-  {{"observation":"what you see","area_type":"office|corridor|meeting_room|reception|storage|lab|kitchen|unknown","objects":["obj1","obj2"],"people_count":0,"next_move":"forward","duration":1.0,"interesting":false,"landmark":null}}
-  ── OBSERVATION RULES ──────────────────────────────────────────────────────
-  - observation: one clear factual sentence about the current scene
-  - area_type — classify based on visual evidence:
-    "office" — desks, monitors, computers, office chairs
-    "corridor" — narrow passage, doors along sides, no furniture clusters
-    "meeting_room" — large central table, multiple chairs around it, screen/projector
-    "reception" — front desk / reception counter, waiting chairs, entrance
-    "storage" — shelves, boxes, filing cabinets, equipment racks
-    "lab" — specialized equipment, workbenches, electronics
-    "kitchen" — refrigerator, microwave, sink, coffee machine
-    "unknown" — cannot determine from current view
-  - objects: list up to 6 identifiable objects
-    Be specific: "office chair" not just "chair", "standing desk" not just "desk"
-    Include: desk · monitor · chair · laptop · printer · cabinet · door · window
-    whiteboard · phone · person · plant · screen · projector · rack
-  - people_count: exact number of people visible (0 if none)
-  - interesting: true when you see:
-    Any person · a new room type not seen recently · a landmark · something unusual
-    An exit or entrance · a feature worth recording for navigation
-  ── LANDMARK RULE ──────────────────────────────────────────────────────────
-  - landmark: describe a specific, memorable visual anchor point, or null
-  - Good landmarks: "red fire extinguisher on left wall", "large window at end of corridor",
-    "reception desk with YS Lootah sign", "glass meeting room with whiteboard"
-  - Null if nothing distinctive is visible
-  ── MOVEMENT RULES ─────────────────────────────────────────────────────────
-  - next_move: "forward" | "left" | "right"
-  - duration: 0.5 to 2.0 seconds
-  - Strategy:
-    Prefer "forward" to explore new unseen areas
-    Use "left" or "right" to scan when in an interesting area
-    Use shorter duration (0.5–0.8s) near people, obstacles, or interesting objects
-    Use longer duration (1.5–2.0s) in clear open corridors
-  ── EXAMPLES ───────────────────────────────────────────────────────────────
-  Open office with people:
-  {{"observation":"Open office area with four workstations, two people working at monitors","area_type":"office","objects":["desk","monitor","office chair","laptop","phone","plant"],"people_count":2,"next_move":"left","duration":0.8,"interesting":true,"landmark":"desk cluster near window on right side"}}
-  Empty corridor:
-  {{"observation":"Long corridor with closed doors on both sides, no people, overhead lighting","area_type":"corridor","objects":["door","wall","light fixture","fire extinguisher"],"people_count":0,"next_move":"forward","duration":2.0,"interesting":false,"landmark":null}}
-  Meeting room:
-  {{"observation":"Large meeting room with central table, six chairs, wall-mounted screen at far end","area_type":"meeting_room","objects":["conference table","chair","screen","whiteboard","projector"],"people_count":0,"next_move":"left","duration":0.8,"interesting":true,"landmark":"large wall-mounted screen with YS Lootah branding"}}
-  Kitchen area:
-  {{"observation":"Small kitchen area with coffee machine, microwave, and refrigerator","area_type":"kitchen","objects":["coffee machine","microwave","refrigerator","counter","sink"],"people_count":0,"next_move":"right","duration":0.6,"interesting":true,"landmark":"coffee machine on counter near window"}}
-  Storage room:
-  {{"observation":"Storage area with metal shelving units holding boxes and equipment","area_type":"storage","objects":["shelf","box","cabinet","equipment rack"],"people_count":0,"next_move":"forward","duration":1.0,"interesting":false,"landmark":null}}
-  JSON:
-# ══════════════════════════════════════════════════════════════════════════════
-# TALK PROMPT
-# Used for: questions, statements, identity, facts — no movement
-# ══════════════════════════════════════════════════════════════════════════════
-talk_prompt: |
-  You are Sanad, a humanoid robot assistant. You have been asked a question
-  or given information. Do NOT move — just respond intelligently.
+  You are Sanad, a humanoid robot (YS Lootah Technology). You have a camera,
+  two arms, and can move. Respond to commands with ONE JSON object only — no
+  text before or after the JSON, no markdown.
   {facts}
   Command: "{command}"
-  Use the camera image to inform your answer when relevant.
-  Reply ONLY with this JSON — no markdown, no explanation:
-  {{"actions":[],"arm":null,"speak":"your response in 1-2 sentences","abort":null}}
-  ── RESPONSE RULES ─────────────────────────────────────────────────────────
-  - actions MUST always be [] — you are not moving
-  - arm MUST always be null — no gestures for talk-only
-  - speak: answer naturally and helpfully in 1-2 sentences maximum
-  - Use the camera: if asked about the environment, look at the image and describe what you see
-  - If given a fact ("my name is X"): acknowledge clearly and confirm you remember
-  - If asked who someone is: use known facts from memory + what you see in camera
-  - If asked about your status: report systems are functioning
-  - If asked opinion or advice: give a helpful, honest answer
-  - CRITICAL: respond in the exact same language as the command
-  ── ENGLISH EXAMPLES ───────────────────────────────────────────────────────
-  "what do you see"
-  → {{"actions":[],"arm":null,"speak":"I see a person sitting at a desk with a laptop and a large monitor","abort":null}}
-  "describe what is in front of you"
-  → {{"actions":[],"arm":null,"speak":"In front of me I see an office workspace with a monitor, keyboard, and a person working","abort":null}}
-  "is there anyone in the room"
-  → {{"actions":[],"arm":null,"speak":"Yes, I can see one person sitting at a workstation in front of me","abort":null}}
-  "my name is Kassam"
-  → {{"actions":[],"arm":null,"speak":"Got it, nice to meet you Kassam — I will remember your name","abort":null}}
-  "I am your programmer"
-  → {{"actions":[],"arm":null,"speak":"Understood, you are my programmer — ready for your instructions","abort":null}}
-  "who is that person"
-  → {{"actions":[],"arm":null,"speak":"I can see a person at the desk — based on what you told me, that is Kassam","abort":null}}
-  "how are you"
-  → {{"actions":[],"arm":null,"speak":"All systems are functioning normally and I am ready to assist you","abort":null}}
-  "what is your name"
-  → {{"actions":[],"arm":null,"speak":"My name is Sanad, a humanoid robot assistant by YS Lootah Technology","abort":null}}
-  "who built you"
-  → {{"actions":[],"arm":null,"speak":"I was built by YS Lootah Technology in Dubai, and my programmer is Kassam","abort":null}}
-  "what can you do"
-  → {{"actions":[],"arm":null,"speak":"I can move through the environment, identify objects and people, navigate to goals, and respond in English","abort":null}}
-  "ok" / "good" / "thanks"
-  → {{"actions":[],"arm":null,"speak":"Ready for your next command","abort":null}}
-# ══════════════════════════════════════════════════════════════════════════════
-# VERIFY PROMPT
-# Used for: two-stage goal detection — confirm extra condition after YOLO finds class
-# ══════════════════════════════════════════════════════════════════════════════
-verify_prompt: |
-  You are a visual verification system for a robot.
-  A {target} has been detected in the camera image by the object detector.
-  Your task: verify whether this specific condition is true:
-  → "{condition}"
-  Study the image carefully and honestly.
-  Reply with ONLY one word: yes or no
+  Schema (replace every <…> with your actual value):
+  {{"actions":[{{"move":"<forward|backward|left|right|stop>","duration":<seconds 0.0-5.0>}}],"arm":<null or one gesture>,"speak":"<one short sentence in first person>","abort":<null or short reason>}}
   Rules:
-  - "yes" only if the condition is CLEARLY and VISIBLY true in this image
-  - "no" if you are uncertain, cannot see clearly, or the condition is not met
-  - Do not infer or guess — only confirm what is visually obvious
-  - A partially held object, unclear position, or occlusion = "no"
-# ══════════════════════════════════════════════════════════════════════════════
-# IMAGE SEARCH — COMPARE (two images: reference photo vs current camera)
-# Used for: search/ /path/photo.jpg hint
-# ══════════════════════════════════════════════════════════════════════════════
-image_search_compare_prompt: |
-  You are the visual matching system for a robot performing a targeted search.
-  IMAGE 1 — Reference photo: shows the specific target to find.
-  IMAGE 2 — Current camera: shows what the robot sees right now.
-  {hint_line}
-  Task: determine if the target from IMAGE 1 is visible in IMAGE 2.
-  Reply ONLY with this JSON — no markdown, no explanation:
-  {{"found":false,"confidence":"low|medium|high","position":"left|center|right|not visible","description":"one sentence"}}
-  ── MATCHING RULES ─────────────────────────────────────────────────────────
-  - found = true only when you are confident it is the SAME specific target
-  - This is identity matching — same person or same object, not just same category
-  - For people: match clothing, hair, body shape, face features — not just "a person"
-  - For objects: match color, shape, size, distinctive features — not just "a bag"
-  - confidence levels:
-    "high" — very clear match, high certainty (same person clearly visible)
-    "medium" — likely the same, minor uncertainty (similar appearance, slightly occluded)
-    "low" — possible match but unclear — robot should keep searching
-  - Stop searching only when found=true AND confidence is "medium" or "high"
-  - position: where in IMAGE 2 the target appears
-    "left" · "center" · "right" · "not visible"
-  - description: one honest sentence about what you see in IMAGE 2 and your reasoning
-    Good: "Person in blue shirt visible at center, matches reference photo clothing"
-    Good: "I see a person but face is not clear enough to confirm identity"
-    Good: "No match — the person visible is wearing different clothing"
-  ── EXAMPLES ───────────────────────────────────────────────────────────────
-  Clear match:
-  {{"found":true,"confidence":"high","position":"center","description":"Person in blue shirt and glasses visible at center, closely matches the reference photo"}}
-  Likely match:
-  {{"found":true,"confidence":"medium","position":"right","description":"Person with similar clothing visible on right, slight occlusion but likely the same individual"}}
-  No match:
-  {{"found":false,"confidence":"high","position":"not visible","description":"I see a different person — clothing and appearance do not match the reference"}}
-  Unclear:
-  {{"found":false,"confidence":"low","position":"left","description":"Someone visible on left but partially occluded, cannot confirm identity — continuing search"}}
+  - actions: ordered motion steps. duration max 5.0 s. Merge same-direction steps.
+  - Duration guide: 1 step = 1 s · 45° = 2.5 s · 90° = 5 s · "slowly" ×0.5 · "fast" ×1.5
+  - arm: one of wave · raise_right · raise_left · clap · high_five · hug · heart · shake_hand · face_wave — or null. Runs after motion.
+  - speak: actually describe what you are doing OR what the camera shows right now. Do NOT copy example text. First person. English.
+  - abort: null normally; "obstacle detected" / "unsafe command" / "cannot comply" with actions=[] when unsafe.
+  Examples (learn the STRUCTURE, don't reuse the speak text):
+  "turn right" → {{"actions":[{{"move":"right","duration":2.0}}],"arm":null,"speak":"Turning right","abort":null}}
+  "walk 2 steps" → {{"actions":[{{"move":"forward","duration":2.0}}],"arm":null,"speak":"Walking forward","abort":null}}
+  "wave" → {{"actions":[],"arm":"wave","speak":"Waving","abort":null}}
   JSON:
-# ══════════════════════════════════════════════════════════════════════════════
-# IMAGE SEARCH — TEXT ONLY (description-based search, no reference photo)
-# Used for: search/ person in blue shirt
-# ══════════════════════════════════════════════════════════════════════════════
+# ── GOAL PROMPT ──────────────────────────────────────────────────────────────
+goal_prompt: |
+  You are Sanad navigating toward a target.
+  Mission: "{goal}"
+  Study the current camera image carefully and reply with ONE JSON — no text
+  before or after, no markdown. Fill every <…> with your actual judgement.
+  Schema:
+  {{"reached":<true|false>,"next_move":"<left|right|forward>","duration":<0.3-0.8>,"speak":"<one-sentence description of what THIS camera image actually shows>","confidence":"<low|medium|high>"}}
+  Rules:
+  - reached = true ONLY when the target is CLEARLY and unambiguously in the current image. Partial, occluded, uncertain, or similar-but-not-exact = false.
+  - For compound goals ("person holding phone"), both parts must be visible in the SAME frame.
+  - confidence: "high" clear · "medium" likely · "low" keep searching. Only set reached=true at medium+.
+  - next_move: "left" (default scan) · "right" · "forward" (approach if target visible but far).
+  - speak MUST describe what this image actually shows right now. Do NOT output the literal text "what you see now" or the literal string "low|medium|high" — replace them with real content.
+# ── PATROL PROMPT ────────────────────────────────────────────────────────────
+patrol_prompt: |
+  You are Sanad autonomously exploring. Study the image and reply with ONE
+  JSON — no text before or after, no markdown. Replace every <…>.
+  Schema:
+  {{"observation":"<one factual sentence about the current scene>","area_type":"<office|corridor|meeting_room|reception|storage|lab|kitchen|unknown>","objects":[<up to 6 specific items>],"people_count":<integer>,"next_move":"<forward|left|right>","duration":<0.5-2.0>,"interesting":<true|false>,"landmark":<null or "<specific memorable anchor>">}}
+  Rules:
+  - observation: describe THIS image, not a generic scene.
+  - area_type: pick from the list based on visible evidence.
+  - objects: specific items ("standing desk" not "desk").
+  - people_count: exact integer.
+  - interesting = true when you see a person, new room type, entrance, or unusual object.
+  - landmark: a specific visual anchor (e.g. "red extinguisher on left wall") or null.
+  - next_move: "forward" to explore, "left"/"right" to scan.
+# ── TALK PROMPT ──────────────────────────────────────────────────────────────
+talk_prompt: |
+  You are Sanad, a humanoid robot. The user asked you something. Do NOT move.
+  Use the camera image when the question asks about what you see.
+  {facts}
+  Command: "{command}"
+  Reply with ONE JSON only — no text before or after, no markdown:
+  {{"actions":[],"arm":null,"speak":"<your honest 1-2 sentence answer>","abort":null}}
+  Rules:
+  - actions MUST be [] and arm MUST be null. You are not moving.
+  - For vision questions ("what do you see", "describe...", "who is there", "what is in front of me"): describe the actual camera image in your own words. Do NOT copy example text.
+  - For facts the user tells you ("my name is X"): acknowledge and say you will remember.
+  - For "who are you" / "what are you": introduce yourself briefly.
+  - Answer honestly and specifically. 1-2 sentences.
+# ── VERIFY PROMPT ────────────────────────────────────────────────────────────
+verify_prompt: |
+  A {target} was detected in the image. Verify this condition:
+  "{condition}"
+  Reply with ONLY one word: yes or no
+  - "yes" only if clearly and visibly true right now.
+  - "no" if uncertain, occluded, or condition not met.
+# ── IMAGE SEARCH — COMPARE ───────────────────────────────────────────────────
+image_search_compare_prompt: |
+  IMAGE 1 = reference photo of the target. IMAGE 2 = current camera view.
+  {hint_line}
+  Task: is the target from IMAGE 1 visible in IMAGE 2?
+  Reply with ONE JSON — no other text, no markdown. Replace every <…>:
+  {{"found":<true|false>,"confidence":"<low|medium|high>","position":"<left|center|right|not visible>","description":"<one sentence about IMAGE 2 and your reasoning>"}}
+  Rules:
+  - Identity matching: same specific person/object, not just same category.
+  - People: match clothing, hair, body shape, face.
+  - Objects: match color, shape, size, distinctive features.
+  - Only found=true at medium+ confidence.
+# ── IMAGE SEARCH — TEXT ONLY ─────────────────────────────────────────────────
 image_search_text_prompt: |
-  You are the visual search system for a robot looking for a target by description.
   Target description: "{hint}"
-  Study the current camera image carefully.
-  Reply ONLY with this JSON — no markdown, no explanation:
-  {{"found":false,"confidence":"low|medium|high","position":"left|center|right|not visible","description":"one sentence"}}
-  ── SEARCH RULES ───────────────────────────────────────────────────────────
-  - found = true only when the camera shows something that clearly matches the description
-  - Be specific about the match — does the image actually show what was described?
-  - For people descriptions (color, clothing, activity): all mentioned attributes must match
-  - For object descriptions (color, type, location): all mentioned attributes must match
-  - confidence levels:
-    "high" — target clearly visible, all description elements confirmed
-    "medium" — target likely visible, minor uncertainty on one element
-    "low" — possible match but one or more elements unclear or missing
-  - Only report found=true at "medium" or "high" confidence
-  ── EXAMPLES ───────────────────────────────────────────────────────────────
-  Hint: "person in blue shirt"
-  Camera: person in blue shirt clearly visible
-  → {{"found":true,"confidence":"high","position":"center","description":"Person wearing a blue shirt clearly visible at center of frame"}}
-  Hint: "person in blue shirt"
-  Camera: person in white shirt
-  → {{"found":false,"confidence":"high","position":"not visible","description":"I see a person but they are wearing white, not blue"}}
-  Hint: "red backpack near the door"
-  Camera: red backpack on a chair, no door visible
-  → {{"found":false,"confidence":"medium","position":"left","description":"Red backpack visible on left but no door nearby — partial match"}}
-  Hint: "laptop on desk"
-  Camera: laptop clearly on desk
-  → {{"found":true,"confidence":"high","position":"center","description":"Laptop visible on desk at center of frame"}}
-  JSON:
+  Study the current camera image.
+  Reply with ONE JSON — no other text, no markdown. Replace every <…>:
+  {{"found":<true|false>,"confidence":"<low|medium|high>","position":"<left|center|right|not visible>","description":"<one sentence about what you see>"}}
+  Rules:
+  - found = true only when the image clearly matches all described attributes.
+  - confidence: "high" all elements confirmed · "medium" minor uncertainty · "low" unclear.
+  - Only report found=true at medium+ confidence.
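
A note on the doubled braces throughout these prompts: the YAML bodies are rendered with Python's str.format (see TALK_PROMPT.format(...) in the ask_talk hunk above), so {{ and }} escape to literal braces while {command} and {facts} are substituted. A self-contained demonstration of the mechanics:

import json

template = 'Command: "{command}"\n{{"actions":[],"arm":null,"speak":"<answer>","abort":null}}'
rendered = template.format(command="what do you see")
schema_line = rendered.splitlines()[-1]
print(schema_line)  # doubled braces survive as literal { and }
json.loads(schema_line.replace("<answer>", "ok"))  # the line is valid JSON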

View File

@ -1,20 +0,0 @@
[
{
"time": "16:42:30",
"cmd": "hi",
"response": "Hello!",
"duration_s": 5.98
},
{
"time": "16:43:00",
"cmd": "go back two step",
"response": "Going back two steps",
"duration_s": 10.01
},
{
"time": "16:43:23",
"cmd": "now com",
"response": "Coming to you",
"duration_s": 8.11
}
]

View File

@ -1,66 +0,0 @@
[
{
"time": "16:42:30",
"class": "mouse",
"position": "left",
"distance": "far",
"x": 0.0,
"y": 0.0
},
{
"time": "16:42:30",
"class": "chair",
"position": "center",
"distance": "far",
"x": 0.0,
"y": 0.0
},
{
"time": "16:42:30",
"class": "laptop",
"position": "left",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "16:42:30",
"class": "person",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:43:00",
"class": "laptop",
"position": "left",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:43:00",
"class": "person",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:43:23",
"class": "chair",
"position": "center",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "16:43:23",
"class": "laptop",
"position": "left",
"distance": "medium",
"x": 0.0,
"y": 0.0
}
]

View File

@ -1,14 +0,0 @@
[
{
"time": "16:44:40",
"cmd": "what do you see",
"response": "I see a person sitting at a desk in an office environment. The individual appears to be engaged with a computer screen and is wearing a red shirt.",
"duration_s": 8.01
},
{
"time": "16:45:06",
"cmd": "that is me the one whos is programming you",
"response": "That is me, the one who's programming you",
"duration_s": 6.71
}
]

View File

@ -1,42 +0,0 @@
[
{
"time": "16:44:40",
"class": "person",
"position": "left",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:45:06",
"class": "mouse",
"position": "left",
"distance": "far",
"x": 0.0,
"y": 0.0
},
{
"time": "16:45:06",
"class": "chair",
"position": "center",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "16:45:06",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:45:06",
"class": "laptop",
"position": "left",
"distance": "medium",
"x": 0.0,
"y": 0.0
}
]

View File

@ -1,14 +0,0 @@
[
{
"time": "16:46:46",
"cmd": "hi",
"response": "Error.",
"duration_s": 0.0
},
{
"time": "16:46:49",
"cmd": "hiii",
"response": "Error.",
"duration_s": 0.0
}
]

View File

@ -1,34 +0,0 @@
[
{
"time": "16:46:46",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:46:46",
"class": "chair",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:46:46",
"class": "laptop",
"position": "left",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "16:46:49",
"class": "mouse",
"position": "left",
"distance": "far",
"x": 0.0,
"y": 0.0
}
]

View File

@ -1,32 +0,0 @@
[
{
"time": "16:48:15",
"cmd": "hii",
"response": "Greeting",
"duration_s": 7.93
},
{
"time": "16:48:32",
"cmd": "what do you see",
"response": "I see a person sitting at a desk in an office environment. The person appears to be working on a computer.",
"duration_s": 7.64
},
{
"time": "16:48:58",
"cmd": "that person is the guy who is programming you",
"response": "That person is the one who programmed me",
"duration_s": 6.76
},
{
"time": "16:49:18",
"cmd": "who is that person",
"response": "I don't have enough information to identify the person in the image.",
"duration_s": 7.1
},
{
"time": "16:50:03",
"cmd": "example/",
"response": "local command",
"duration_s": 0.0
}
]

View File

@ -1,34 +0,0 @@
[
{
"time": "16:48:15",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:48:32",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:48:58",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:49:18",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
}
]

View File

@ -1,44 +0,0 @@
[
{
"time": "16:51:08",
"cmd": "hiii",
"response": "Greeting",
"duration_s": 7.97
},
{
"time": "16:51:23",
"cmd": "hi",
"response": "Greeting",
"duration_s": 6.16
},
{
"time": "16:51:51",
"cmd": "my name is Kassam",
"response": "My name is Kassam",
"duration_s": 6.48
},
{
"time": "16:51:59",
"cmd": "that person is Kassam the programmer",
"response": "That person is Kassam the programmer",
"duration_s": 6.74
},
{
"time": "16:52:07",
"cmd": "who is that person",
"response": "That is a man",
"duration_s": 6.42
},
{
"time": "16:52:47",
"cmd": "hiigo back 1 step",
"response": "Backing up one step",
"duration_s": 8.51
},
{
"time": "16:53:24",
"cmd": "what can you do",
"response": "I am a humanoid robot and I can perform various tasks such as walking, talking, and interacting with objects. However, in this image, I see a person sitting at a desk using a computer. If you have any specific questions or requests for me to perform, please let me know!",
"duration_s": 10.6
}
]

View File

@ -1,146 +0,0 @@
[
{
"time": "16:51:08",
"class": "chair",
"position": "center",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "16:51:08",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:51:23",
"class": "chair",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:51:23",
"class": "person",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:51:23",
"class": "laptop",
"position": "left",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "16:51:51",
"class": "chair",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:51:51",
"class": "person",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:51:51",
"class": "laptop",
"position": "left",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "16:51:59",
"class": "chair",
"position": "center",
"distance": "far",
"x": 0.0,
"y": 0.0
},
{
"time": "16:51:59",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:51:59",
"class": "laptop",
"position": "left",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "16:52:07",
"class": "chair",
"position": "center",
"distance": "far",
"x": 0.0,
"y": 0.0
},
{
"time": "16:52:07",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:52:47",
"class": "chair",
"position": "center",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "16:52:47",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:52:47",
"class": "cup",
"position": "left",
"distance": "far",
"x": 0.0,
"y": 0.0
},
{
"time": "16:53:24",
"class": "chair",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "16:53:24",
"class": "person",
"position": "left",
"distance": "very close",
"x": 0.0,
"y": 0.0
}
]

View File

@ -1,44 +0,0 @@
[
{
"time": "17:43:52",
"cmd": "hi",
"response": "Hi",
"duration_s": 3.23
},
{
"time": "17:44:32",
"cmd": "step one step back",
"response": "Step one step back",
"duration_s": 3.44
},
{
"time": "17:44:43",
"cmd": "what do you see",
"response": "I see a person sitting at a desk with a laptop in front of them.",
"duration_s": 3.93
},
{
"time": "17:44:52",
"cmd": "hii",
"response": "Hi",
"duration_s": 3.31
},
{
"time": "17:45:27",
"cmd": "one step back",
"response": "One step back",
"duration_s": 3.46
},
{
"time": "17:45:38",
"cmd": "one more",
"response": "One more",
"duration_s": 3.43
},
{
"time": "17:46:09",
"cmd": "two step back",
"response": "Moving back two steps",
"duration_s": 3.51
}
]

View File

@ -1,138 +0,0 @@
[
{
"time": "17:43:52",
"class": "chair",
"position": "center",
"distance": "far",
"x": 0.0,
"y": 0.0
},
{
"time": "17:43:52",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:43:52",
"class": "laptop",
"position": "left",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "17:43:52",
"class": "cup",
"position": "left",
"distance": "far",
"x": 0.0,
"y": 0.0
},
{
"time": "17:44:32",
"class": "person",
"position": "right",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:44:32",
"class": "laptop",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:44:43",
"class": "person",
"position": "right",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:44:52",
"class": "chair",
"position": "right",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "17:44:52",
"class": "person",
"position": "right",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:45:27",
"class": "person",
"position": "right",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:45:27",
"class": "laptop",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:45:38",
"class": "chair",
"position": "right",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "17:45:38",
"class": "person",
"position": "right",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:45:38",
"class": "laptop",
"position": "left",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:46:09",
"class": "chair",
"position": "right",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:46:09",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:46:09",
"class": "laptop",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
}
]

View File

@ -1,14 +0,0 @@
[
{
"time": "17:46:37",
"cmd": "hii",
"response": "Hii",
"duration_s": 3.27
},
{
"time": "17:46:54",
"cmd": "why did you moved",
"response": "I moved because I was instructed to do so.",
"duration_s": 3.71
}
]

View File

@ -1,58 +0,0 @@
[
{
"time": "17:46:37",
"class": "chair",
"position": "right",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "17:46:37",
"class": "person",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:46:37",
"class": "cup",
"position": "left",
"distance": "far",
"x": 0.0,
"y": 0.0
},
{
"time": "17:46:54",
"class": "mouse",
"position": "center",
"distance": "far",
"x": 0.0,
"y": 0.0
},
{
"time": "17:46:54",
"class": "person",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:46:54",
"class": "chair",
"position": "right",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:46:54",
"class": "laptop",
"position": "left",
"distance": "close",
"x": 0.0,
"y": 0.0
}
]

View File

@ -1,56 +0,0 @@
[
{
"time": "17:49:16",
"cmd": "hii",
"response": "Hello! I am Marcus. How can I help you?",
"duration_s": 0.0
},
{
"time": "17:49:19",
"cmd": "hi",
"response": "Hello! I am Marcus. How can I help you?",
"duration_s": 0.0
},
{
"time": "17:49:20",
"cmd": "Hi",
"response": "Hello! I am Marcus. How can I help you?",
"duration_s": 0.0
},
{
"time": "17:49:37",
"cmd": "move one step back",
"response": "Moving back",
"duration_s": 3.47
},
{
"time": "17:49:50",
"cmd": "one tiny step back",
"response": "Moving back",
"duration_s": 3.44
},
{
"time": "17:50:05",
"cmd": "now come back to me",
"response": "Coming back to you",
"duration_s": 3.53
},
{
"time": "17:50:36",
"cmd": "turn right",
"response": "Turning right",
"duration_s": 3.4
},
{
"time": "17:51:01",
"cmd": "come back to me",
"response": "Coming back to you",
"duration_s": 3.5
},
{
"time": "17:51:22",
"cmd": "turn right 90deg and then back two steps",
"response": "Turning right",
"duration_s": 3.43
}
]

View File

@ -1,106 +0,0 @@
[
{
"time": "17:49:37",
"class": "chair",
"position": "right",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "17:49:37",
"class": "laptop",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:49:37",
"class": "person",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:49:50",
"class": "laptop",
"position": "left",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:49:50",
"class": "person",
"position": "right",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:50:05",
"class": "chair",
"position": "right",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "17:50:05",
"class": "laptop",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:50:05",
"class": "person",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:50:36",
"class": "chair",
"position": "center",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "17:50:36",
"class": "person",
"position": "left",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:51:01",
"class": "laptop",
"position": "left",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:51:01",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:51:22",
"class": "person",
"position": "left",
"distance": "close",
"x": 0.0,
"y": 0.0
}
]

View File

@ -1,26 +0,0 @@
[
{
"time": "17:53:36",
"cmd": "turn right 90deg and then move back 3 steps",
"response": "Turned right 90 degrees then moved backward",
"duration_s": 0.0
},
{
"time": "17:54:21",
"cmd": "keep turning left unitl you see a guy holding a phone",
"response": "Turning left",
"duration_s": 3.72
},
{
"time": "17:55:33",
"cmd": "goal/ keep turning left unitl you see a guy holding a phone",
"response": "Goal navigation: keep turning left unitl you see a guy holding a phone",
"duration_s": 5.47
},
{
"time": "17:55:53",
"cmd": "describe what he is doing",
"response": "Turning left",
"duration_s": 3.73
}
]

View File

@ -1,50 +0,0 @@
[
{
"time": "17:54:21",
"class": "suitcase",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:54:21",
"class": "backpack",
"position": "center",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "17:55:32",
"class": "person",
"position": "left",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:55:53",
"class": "person",
"position": "center",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "17:55:53",
"class": "chair",
"position": "center",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "17:55:53",
"class": "laptop",
"position": "left",
"distance": "close",
"x": 0.0,
"y": 0.0
}
]

View File

@ -1,32 +0,0 @@
[
{
"time": "17:57:56",
"cmd": "turn right 90 deg",
"response": "local command",
"duration_s": 0.0
},
{
"time": "17:58:22",
"cmd": "keep turning left until you see a guy holding a phone and describe what he is doing and his office",
"response": "Turning left",
"duration_s": 3.68
},
{
"time": "17:58:55",
"cmd": "goal/ keep turning left until you see a guy holding a phone and describe what he is doing and his office",
"response": "Goal navigation: keep turning left until you see a guy holding a phone and describe what he is doing and his office",
"duration_s": 25.79
},
{
"time": "17:59:13",
"cmd": "keep turning left until you see a guy holding a phone and describe what he is doing and his office",
"response": "Turning left",
"duration_s": 3.73
},
{
"time": "17:59:40",
"cmd": "goal/keep turning left until you see a guy holding a phone and describe what he is doing and his office",
"response": "Goal navigation: keep turning left until you see a guy holding a phone and describe what he is doing and his office",
"duration_s": 23.61
}
]

View File

@ -1,18 +0,0 @@
[
{
"time": "17:58:22",
"class": "suitcase",
"position": "right",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "17:58:22",
"class": "backpack",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
}
]

View File

@ -1,26 +0,0 @@
[
{
"time": "18:09:23",
"cmd": "what do you see",
"response": "I see a person sitting at a desk with a computer and a monitor in front of them.",
"duration_s": 4.48
},
{
"time": "18:09:47",
"cmd": "what do you see",
"response": "I see a person sitting at a desk with a computer and a monitor in front of them.",
"duration_s": 4.52
},
{
"time": "18:10:17",
"cmd": "example/",
"response": "local command",
"duration_s": 0.0
},
{
"time": "18:10:41",
"cmd": "is there anyone in the room",
"response": "I see a person sitting at a desk with a computer and a monitor in front of them.",
"duration_s": 4.57
}
]

View File

@ -1,58 +0,0 @@
[
{
"time": "18:09:23",
"class": "mouse",
"position": "center",
"distance": "far",
"x": 0.0,
"y": 0.0
},
{
"time": "18:09:23",
"class": "chair",
"position": "right",
"distance": "medium",
"x": 0.0,
"y": 0.0
},
{
"time": "18:09:23",
"class": "person",
"position": "right",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "18:09:23",
"class": "laptop",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "18:09:47",
"class": "person",
"position": "right",
"distance": "very close",
"x": 0.0,
"y": 0.0
},
{
"time": "18:10:41",
"class": "person",
"position": "right",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "18:10:41",
"class": "laptop",
"position": "center",
"distance": "close",
"x": 0.0,
"y": 0.0
}
]

View File

@ -1,13 +0,0 @@
Session: session_014_2026-04-05
Date: 2026-04-05 18:10
Duration: 4m 48s
Commands: 4
YOLO detections: 7
Alerts: 0
Known places: none
First commands:
- what do you see
- what do you see
- example/
- is there anyone in the room

View File

@ -1,26 +0,0 @@
[
{
"time": "18:12:31",
"cmd": "example/",
"response": "local command",
"duration_s": 0.0
},
{
"time": "18:12:49",
"cmd": "goal/ keep turning left until you see a guy",
"response": "Goal navigation: keep turning left until you see a guy",
"duration_s": 5.22
},
{
"time": "18:13:04",
"cmd": "goal/ keep turning left until you see a guy",
"response": "Goal navigation: keep turning left until you see a guy",
"duration_s": 3.22
},
{
"time": "18:13:25",
"cmd": "what do you see",
"response": "I see a man sitting in a chair, talking on a phone. There is a cardboard box and a backpack on a table next to him.",
"duration_s": 0.0
}
]

View File

@ -1,18 +0,0 @@
[
{
"time": "18:12:48",
"class": "person",
"position": "left",
"distance": "close",
"x": 0.0,
"y": 0.0
},
{
"time": "18:13:03",
"class": "person",
"position": "left",
"distance": "close",
"x": 0.0,
"y": 0.0
}
]

View File

@ -1,8 +0,0 @@
[
{
"time": "13:33:49",
"cmd": "hi",
"response": "Hello! I am Marcus. How can I help you?",
"duration_s": 0.0
}
]

View File

@ -1,8 +0,0 @@
[
{
"time": "13:35:59",
"cmd": "hi",
"response": "Hello! I am Marcus. How can I help you?",
"duration_s": 0.0
}
]

View File

@ -1,14 +0,0 @@
[
{
"time": "13:43:47",
"cmd": "hi",
"response": "Hello! I am Marcus. How can I help you?",
"duration_s": 0.0
},
{
"time": "13:44:22",
"cmd": "turn left",
"response": "أدور لليسار",
"duration_s": 10.3
}
]

View File

@ -1,8 +0,0 @@
[
{
"time": "13:50:01",
"cmd": "what do you see",
"response": "I see a person sitting at a desk with a laptop and a large monitor",
"duration_s": 0.0
}
]

View File

@ -1,8 +0,0 @@
[
{
"time": "13:58:53",
"cmd": "hi",
"response": "Hello! I am Marcus. How can I help you?",
"duration_s": 0.0
}
]

View File

@ -1,20 +0,0 @@
[
{
"time": "14:02:44",
"cmd": "help/",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:03:18",
"cmd": "turn right",
"response": "أدور لليمين",
"duration_s": 10.15
},
{
"time": "14:03:42",
"cmd": "wha do you see",
"response": "أرى شخصا يجلس أمام طاولة مع جهاز كمبيوتر",
"duration_s": 10.95
}
]

View File

@ -1,20 +0,0 @@
[
{
"time": "14:06:11",
"cmd": "لف على اليسار",
"response": "ألف على اليسار",
"duration_s": 10.2
},
{
"time": "14:06:55",
"cmd": "turn right",
"response": "أدور لليمين",
"duration_s": 10.17
},
{
"time": "14:07:15",
"cmd": "what do",
"response": "أرى شخصا يجلس أمام طاولة مع جهاز كمبيوتر وشاشة كبيرة",
"duration_s": 0.0
}
]

View File

@ -1,26 +0,0 @@
[
{
"time": "14:10:51",
"cmd": "hi",
"response": "Hello! I am Marcus. How can I help you?",
"duration_s": 0.0
},
{
"time": "14:11:17",
"cmd": "turn right",
"response": "أدور لليمين",
"duration_s": 10.2
},
{
"time": "14:11:34",
"cmd": "help",
"response": "local command",
"duration_s": 0.0
},
{
"time": "14:11:50",
"cmd": "what do you see, yolo",
"response": "YOLO: active | YOLO not loaded | 0.0fps",
"duration_s": 0.0
}
]

View File

@ -1,14 +0,0 @@
[
{
"time": "15:34:13",
"cmd": "cd ~/Marcus",
"response": "Moved forward",
"duration_s": 9.91
},
{
"time": "15:34:32",
"cmd": "ython3 run_marcus.py",
"response": "Running Python script",
"duration_s": 10.09
}
]

Some files were not shown because too many files have changed in this diff.