Update 2026-04-22 12:17:30
This commit is contained in:
parent
dcf5f9f39b
commit
d257808e48
@ -1,5 +1,24 @@
|
|||||||
"""
|
"""
|
||||||
llava_api.py — LLaVA / Qwen VL query interface
|
llava_api.py — Qwen-VL query interface (via Ollama)
|
||||||
|
|
||||||
|
Three deployment modes, chosen via config_Brain.json:
|
||||||
|
|
||||||
|
1. subsystems.vlm = false
|
||||||
|
→ every ask*() returns a safe fallback dict. Marcus runs in
|
||||||
|
regex-only "safe mode": no LLM load on the Jetson, no GPU/CPU
|
||||||
|
contention with Holosoma, so the robot won't fall due to resource thrashing.
|
||||||
|
Vision questions just answer "Scene understanding is disabled
|
||||||
|
— Sanad is in safe mode." Everything else (movement, places,
|
||||||
|
patrol, autonomous) still works.
|
||||||
|
|
||||||
|
2. ollama_host = "http://127.0.0.1:11434" + subsystems.vlm = true
|
||||||
|
→ Ollama runs on the Jetson. Old behavior — competes with
|
||||||
|
Holosoma for memory. Unsafe during walking with a 3B VL model.
|
||||||
|
|
||||||
|
3. ollama_host = "http://192.168.123.222:11434" + subsystems.vlm = true
|
||||||
|
→ Ollama runs on the workstation. Jetson stays light, Holosoma
|
||||||
|
keeps its 50 Hz real-time deadline, and the brain still gets
|
||||||
|
full Qwen-VL. Best mode for demos / walking with conversation.
|
||||||
"""
|
"""
|
||||||
import json
|
import json
|
||||||
import ollama
|
import ollama
|
||||||
@ -9,12 +28,13 @@ from Core.config_loader import load_config
|
|||||||
|
|
||||||
_cfg = load_config("Brain")
|
_cfg = load_config("Brain")
|
||||||
|
|
||||||
# Load prompts from YAML (the authoritative source — bilingual, complete)
|
|
||||||
_yaml_path = Path(__file__).resolve().parent.parent / "Config" / "marcus_prompts.yaml"
|
_yaml_path = Path(__file__).resolve().parent.parent / "Config" / "marcus_prompts.yaml"
|
||||||
with open(_yaml_path, encoding="utf-8") as _f:
|
with open(_yaml_path, encoding="utf-8") as _f:
|
||||||
_prompts = yaml.safe_load(_f)
|
_prompts = yaml.safe_load(_f)
|
||||||
|
|
||||||
OLLAMA_MODEL = _cfg["ollama_model"]
|
OLLAMA_MODEL = _cfg["ollama_model"]
|
||||||
|
OLLAMA_HOST = _cfg.get("ollama_host", "http://127.0.0.1:11434")
|
||||||
|
VLM_ENABLED = bool(_cfg.get("subsystems", {}).get("vlm", True))
|
||||||
MAX_HISTORY = _cfg["max_history"]
|
MAX_HISTORY = _cfg["max_history"]
|
||||||
# Cap batch and context on every request. Without this, llama.cpp on Jetson
|
# Cap batch and context on every request. Without this, llama.cpp on Jetson
|
||||||
# Orin NX allocates a ~7.5 GiB compute graph (defaults: batch 512, ctx 4096)
|
# Orin NX allocates a ~7.5 GiB compute graph (defaults: batch 512, ctx 4096)
|
||||||
@ -28,6 +48,14 @@ PATROL_PROMPT = _prompts["patrol_prompt"]
|
|||||||
TALK_PROMPT = _prompts["talk_prompt"]
|
TALK_PROMPT = _prompts["talk_prompt"]
|
||||||
VERIFY_PROMPT = _prompts["verify_prompt"]
|
VERIFY_PROMPT = _prompts["verify_prompt"]
|
||||||
|
|
||||||
|
# Explicit Ollama client — lets us route to a remote host (e.g., workstation)
|
||||||
|
# without relying on the OLLAMA_HOST env var being set in the launch shell.
|
||||||
|
_client = ollama.Client(host=OLLAMA_HOST)
|
||||||
|
|
||||||
|
# Safe-mode replies used when subsystems.vlm == false
|
||||||
|
_VLM_OFF_TALK = "Scene understanding is disabled — Sanad is in safe mode."
|
||||||
|
_VLM_OFF_EMPTY = {"actions": [], "arm": None, "speak": _VLM_OFF_TALK, "abort": None}
|
||||||
|
|
||||||
# Conversation state
|
# Conversation state
|
||||||
_conversation_history = []
|
_conversation_history = []
|
||||||
_facts = []
|
_facts = []
|
||||||
@ -48,6 +76,8 @@ def add_to_history(user_msg: str, assistant_msg: str):
|
|||||||
|
|
||||||
|
|
||||||
def call_llava(prompt: str, img_b64, num_predict: int = 200, use_history: bool = False) -> str:
|
def call_llava(prompt: str, img_b64, num_predict: int = 200, use_history: bool = False) -> str:
|
||||||
|
if not VLM_ENABLED:
|
||||||
|
return "" # safe-mode — caller must handle empty string
|
||||||
messages = []
|
messages = []
|
||||||
if use_history and _conversation_history:
|
if use_history and _conversation_history:
|
||||||
messages.extend(_conversation_history)
|
messages.extend(_conversation_history)
|
||||||
@ -55,13 +85,13 @@ def call_llava(prompt: str, img_b64, num_predict: int = 200, use_history: bool =
|
|||||||
if img_b64:
|
if img_b64:
|
||||||
msg["images"] = [img_b64]
|
msg["images"] = [img_b64]
|
||||||
messages.append(msg)
|
messages.append(msg)
|
||||||
r = ollama.chat(model=OLLAMA_MODEL, messages=messages,
|
r = _client.chat(model=OLLAMA_MODEL, messages=messages,
|
||||||
options={
|
options={
|
||||||
"temperature": 0.0,
|
"temperature": 0.0,
|
||||||
"num_predict": num_predict,
|
"num_predict": num_predict,
|
||||||
"num_batch": NUM_BATCH,
|
"num_batch": NUM_BATCH,
|
||||||
"num_ctx": NUM_CTX,
|
"num_ctx": NUM_CTX,
|
||||||
})
|
})
|
||||||
return r["message"]["content"].strip()
|
return r["message"]["content"].strip()
|
||||||
|
|
||||||
|
|
||||||
@ -79,7 +109,9 @@ def parse_json(raw: str):
|
|||||||
|
|
||||||
|
|
||||||
def ask(command: str, img_b64) -> dict:
|
def ask(command: str, img_b64) -> dict:
|
||||||
"""Send command + camera frame to LLaVA with conversation history."""
|
"""Send command + camera frame to the VLM with conversation history."""
|
||||||
|
if not VLM_ENABLED:
|
||||||
|
return dict(_VLM_OFF_EMPTY)
|
||||||
try:
|
try:
|
||||||
facts_str = ("\nKnown facts: " + "; ".join(_facts) + ".") if _facts else ""
|
facts_str = ("\nKnown facts: " + "; ".join(_facts) + ".") if _facts else ""
|
||||||
raw = call_llava(MAIN_PROMPT.format(command=command, facts=facts_str), img_b64,
|
raw = call_llava(MAIN_PROMPT.format(command=command, facts=facts_str), img_b64,
|
||||||
@ -92,16 +124,19 @@ def ask(command: str, img_b64) -> dict:
|
|||||||
return {"actions": [], "arm": None, "speak": raw, "abort": None}
|
return {"actions": [], "arm": None, "speak": raw, "abort": None}
|
||||||
return d
|
return d
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
print(f" LLaVA error: {ex}")
|
print(f" VLM error: {ex}")
|
||||||
return {"actions": [], "arm": None, "speak": "Error.", "abort": None}
|
return {"actions": [], "arm": None, "speak": "VLM error.", "abort": None}
|
||||||
|
|
||||||
|
|
||||||
def ask_goal(goal: str, img_b64) -> dict:
|
def ask_goal(goal: str, img_b64) -> dict:
|
||||||
"""Ask LLaVA if goal is reached."""
|
"""Ask the VLM if the goal is reached."""
|
||||||
|
if not VLM_ENABLED:
|
||||||
|
return {"reached": False, "next_move": "left", "duration": 0.5,
|
||||||
|
"speak": "VLM disabled — relying on YOLO fast-match only."}
|
||||||
try:
|
try:
|
||||||
raw = call_llava(GOAL_PROMPT.format(goal=goal), img_b64,
|
raw = call_llava(GOAL_PROMPT.format(goal=goal), img_b64,
|
||||||
num_predict=_cfg["num_predict_goal"])
|
num_predict=_cfg["num_predict_goal"])
|
||||||
print(f" LLaVA: {raw}")
|
print(f" VLM: {raw}")
|
||||||
d = parse_json(raw)
|
d = parse_json(raw)
|
||||||
if d is None:
|
if d is None:
|
||||||
text = raw.lower()
|
text = raw.lower()
|
||||||
@ -119,6 +154,8 @@ def ask_goal(goal: str, img_b64) -> dict:
|
|||||||
|
|
||||||
def ask_talk(command: str, img_b64, facts: str = "") -> dict:
|
def ask_talk(command: str, img_b64, facts: str = "") -> dict:
|
||||||
"""Handle talk-only commands using the YAML talk_prompt."""
|
"""Handle talk-only commands using the YAML talk_prompt."""
|
||||||
|
if not VLM_ENABLED:
|
||||||
|
return dict(_VLM_OFF_EMPTY)
|
||||||
try:
|
try:
|
||||||
prompt = TALK_PROMPT.format(command=command, facts=facts)
|
prompt = TALK_PROMPT.format(command=command, facts=facts)
|
||||||
raw = call_llava(prompt, img_b64, num_predict=_cfg["num_predict_talk"],
|
raw = call_llava(prompt, img_b64, num_predict=_cfg["num_predict_talk"],
|
||||||
@ -137,6 +174,9 @@ def ask_talk(command: str, img_b64, facts: str = "") -> dict:
|
|||||||
|
|
||||||
def ask_verify(target: str, condition: str, img_b64) -> str:
|
def ask_verify(target: str, condition: str, img_b64) -> str:
|
||||||
"""Verify a condition on a detected target. Returns 'yes' or 'no'."""
|
"""Verify a condition on a detected target. Returns 'yes' or 'no'."""
|
||||||
|
if not VLM_ENABLED:
|
||||||
|
# Without VLM we can't verify compound conditions; trust the YOLO match.
|
||||||
|
return "yes"
|
||||||
try:
|
try:
|
||||||
prompt = VERIFY_PROMPT.format(target=target, condition=condition)
|
prompt = VERIFY_PROMPT.format(target=target, condition=condition)
|
||||||
raw = call_llava(prompt, img_b64, num_predict=_cfg["num_predict_verify"])
|
raw = call_llava(prompt, img_b64, num_predict=_cfg["num_predict_verify"])
|
||||||
@ -148,7 +188,10 @@ def ask_verify(target: str, condition: str, img_b64) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def ask_patrol(img_b64) -> dict:
|
def ask_patrol(img_b64) -> dict:
|
||||||
"""Ask LLaVA to assess scene during patrol."""
|
"""Ask the VLM to assess the scene during patrol."""
|
||||||
|
if not VLM_ENABLED:
|
||||||
|
return {"observation": "VLM off — patrolling without scene analysis.",
|
||||||
|
"alert": None, "next_move": "forward", "duration": 1.0}
|
||||||
try:
|
try:
|
||||||
raw = call_llava(PATROL_PROMPT, img_b64, num_predict=_cfg["num_predict_patrol"])
|
raw = call_llava(PATROL_PROMPT, img_b64, num_predict=_cfg["num_predict_patrol"])
|
||||||
d = parse_json(raw)
|
d = parse_json(raw)
|
||||||
|
|||||||
@ -147,11 +147,19 @@ def init_brain():
|
|||||||
|
|
||||||
_log("Brain initialized", "info", "brain")
|
_log("Brain initialized", "info", "brain")
|
||||||
|
|
||||||
# Warmup runs in a daemon thread so the dashboard + Command: prompt
|
# Skip warmup when VLM is off — there's no model to warm, and the
|
||||||
# appear immediately. The first real user command will either hit a
|
# dashboard should mention that Marcus is in safe mode.
|
||||||
# warm model (fast) or pay the cold-load itself (same as before).
|
from API.llava_api import VLM_ENABLED, OLLAMA_HOST
|
||||||
import threading as _t
|
if not VLM_ENABLED:
|
||||||
_t.Thread(target=_warmup_llava, daemon=True, name="llava-warmup").start()
|
print(" [VLM] disabled by config — safe mode (no Ollama load)")
|
||||||
|
else:
|
||||||
|
host_short = OLLAMA_HOST.replace("http://", "")
|
||||||
|
print(f" [VLM] target: {host_short} ({OLLAMA_MODEL})")
|
||||||
|
# Warmup runs in a daemon thread so the dashboard + Command: prompt
|
||||||
|
# appear immediately. The first real user command will either hit a
|
||||||
|
# warm model (fast) or pay the cold-load itself (same as before).
|
||||||
|
import threading as _t
|
||||||
|
_t.Thread(target=_warmup_llava, daemon=True, name="llava-warmup").start()
|
||||||
|
|
||||||
|
|
||||||
# Global voice references
|
# Global voice references
|
||||||
@ -540,7 +548,9 @@ def run_terminal():
|
|||||||
print("\n\n" + "╔" + "═" * (W-2) + "╗")
|
print("\n\n" + "╔" + "═" * (W-2) + "╗")
|
||||||
print("║" + _pad(" SANAD — AI BRAIN READY", W-2) + "║")
|
print("║" + _pad(" SANAD — AI BRAIN READY", W-2) + "║")
|
||||||
print("╠" + "═" * (W-2) + "╣")
|
print("╠" + "═" * (W-2) + "╣")
|
||||||
|
from API.llava_api import VLM_ENABLED
|
||||||
left = [("model", status["model"]),
|
left = [("model", status["model"]),
|
||||||
|
("vlm", _fmt(VLM_ENABLED)),
|
||||||
("voice", _fmt(status["voice"])),
|
("voice", _fmt(status["voice"])),
|
||||||
("camera", status["camera"])]
|
("camera", status["camera"])]
|
||||||
right = [("yolo", _fmt(status["yolo"])),
|
right = [("yolo", _fmt(status["yolo"])),
|
||||||
|
|||||||
@ -1,12 +1,14 @@
|
|||||||
{
|
{
|
||||||
"ollama_model": "qwen2.5vl:3b",
|
"ollama_model": "qwen2.5vl:3b",
|
||||||
|
"ollama_host": "http://127.0.0.1:11434",
|
||||||
"max_history": 6,
|
"max_history": 6,
|
||||||
"num_batch": 128,
|
"num_batch": 128,
|
||||||
"num_ctx": 2048,
|
"num_ctx": 2048,
|
||||||
"subsystems": {
|
"subsystems": {
|
||||||
"lidar": true,
|
"vlm": true,
|
||||||
"voice": true,
|
"lidar": true,
|
||||||
"imgsearch": false,
|
"voice": true,
|
||||||
|
"imgsearch": false,
|
||||||
"autonomous": true
|
"autonomous": true
|
||||||
},
|
},
|
||||||
"num_predict_main": 120,
|
"num_predict_main": 120,
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
{
|
{
|
||||||
"tts": {
|
"tts": {
|
||||||
"backend": "builtin_ttsmaker",
|
"backend": "builtin_ttsmaker",
|
||||||
"builtin_speaker_id": 0,
|
"builtin_speaker_id": 2,
|
||||||
"target_sample_rate": 16000
|
"target_sample_rate": 16000
|
||||||
},
|
},
|
||||||
"stt": {
|
"stt": {
|
||||||
|
|||||||
1
Data/Brain/Sessions/session_029_2026-04-22/alerts.json
Normal file
1
Data/Brain/Sessions/session_029_2026-04-22/alerts.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
[]
|
||||||
8
Data/Brain/Sessions/session_029_2026-04-22/commands.json
Normal file
8
Data/Brain/Sessions/session_029_2026-04-22/commands.json
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"time": "10:54:15",
|
||||||
|
"cmd": "hi",
|
||||||
|
"response": "Hello! I am Sanad. How can I help you?",
|
||||||
|
"duration_s": 0.0
|
||||||
|
}
|
||||||
|
]
|
||||||
@ -0,0 +1 @@
|
|||||||
|
[]
|
||||||
1
Data/Brain/Sessions/session_029_2026-04-22/places.json
Normal file
1
Data/Brain/Sessions/session_029_2026-04-22/places.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
{}
|
||||||
1
Data/Brain/Sessions/session_030_2026-04-22/alerts.json
Normal file
1
Data/Brain/Sessions/session_030_2026-04-22/alerts.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
[]
|
||||||
1
Data/Brain/Sessions/session_030_2026-04-22/commands.json
Normal file
1
Data/Brain/Sessions/session_030_2026-04-22/commands.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
[]
|
||||||
@ -0,0 +1 @@
|
|||||||
|
[]
|
||||||
1
Data/Brain/Sessions/session_030_2026-04-22/places.json
Normal file
1
Data/Brain/Sessions/session_030_2026-04-22/places.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
{}
|
||||||
9
Data/Brain/Sessions/session_030_2026-04-22/summary.txt
Normal file
9
Data/Brain/Sessions/session_030_2026-04-22/summary.txt
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
Session: session_030_2026-04-22
|
||||||
|
Date: 2026-04-22 10:56
|
||||||
|
Duration: 0m 37s
|
||||||
|
Commands: 0
|
||||||
|
YOLO detections: 0
|
||||||
|
Alerts: 0
|
||||||
|
Known places: none
|
||||||
|
|
||||||
|
First commands:
|
||||||
@ -349,12 +349,14 @@ class ImageSearch:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
if has_ref:
|
if has_ref:
|
||||||
# Pass BOTH images: [reference, current_frame]
|
# Pass BOTH images: [reference, current_frame]. Route through
|
||||||
# num_batch/num_ctx mirror llava_api.py — without these
|
# the shared Ollama client (so VLM-off and remote-host config
|
||||||
# caps the compute graph OOMs the runner on Jetson.
|
# are honored) and mirror the compute-graph caps.
|
||||||
import ollama as _ollama
|
from API.llava_api import NUM_BATCH, NUM_CTX, VLM_ENABLED, _client as _llava_client
|
||||||
from API.llava_api import NUM_BATCH, NUM_CTX
|
if not VLM_ENABLED:
|
||||||
r = _ollama.chat(
|
print(f" [{step}/{max_steps}] VLM disabled — skipping image-match")
|
||||||
|
continue
|
||||||
|
r = _llava_client.chat(
|
||||||
model=self._model,
|
model=self._model,
|
||||||
messages=[{
|
messages=[{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user