diff --git a/README.md b/README.md index cbf7bec..7c542d5 100644 --- a/README.md +++ b/README.md @@ -7,21 +7,44 @@ JSONL macros; everything is orchestrated by a FastAPI dashboard. ``` ┌────────────────────────────────────────────────────────────────────┐ │ Dashboard (FastAPI) ── http://:8000 │ +│ ├─ Operations Quick-fire arm actions │ │ ├─ Voice & Audio Live Gemini, Typed Replay, Wake Phrases │ │ ├─ Motion & Replay SDK actions, JSONL replays, teaching mode │ -│ ├─ Camera & Vision (deprecated, UI kept for compat) │ +│ ├─ Recognition Camera vision + face gallery (Gemini-side) │ │ ├─ Recordings Skills registry, saved Gemini turns │ │ └─ Settings & Logs System info, tail live log │ └────────────────────────────────────────────────────────────────────┘ │ ├─ voice/sanad_voice.py (subprocess — Gemini Live audio loop) + ├─ gemini/script.py (Gemini Live brain — audio + video + state) ├─ gemini/client.py (short-session client for Typed Replay) - ├─ gemini/subprocess.py (spawns+supervises sanad_voice.py) + ├─ gemini/subprocess.py (spawns+supervises sanad_voice.py; + │ pushes camera frames + motion state + │ to the child over its stdin) + ├─ vision/camera.py (RealSense/USB capture daemon) + ├─ vision/face_gallery.py (data/faces/ CRUD for the primer turn) ├─ motion/arm_controller.py (G1 arm DDS publisher) ├─ voice/audio_io.py (mic + speaker abstraction — 3 profiles) └─ core/brain.py (skill dispatcher, event bus) ``` +### Camera + face recognition data flow + +``` +CameraDaemon (parent, in-memory JPEG+b64 cache) + ├─→ dashboard /api/recognition/frame.jpg ── snapshot_jpeg() + └─→ GeminiSubprocess._frame_forwarder ── get_frame_b64() + │ "frame:\n" over stdin +ArmController ─emit→ event bus ─→ main.py ─→ live_sub.send_state() + │ "state:\n" over stdin + ▼ + gemini/script.py _stdin_watcher thread + ├─ frame: → _LATEST_FRAME → _send_frame_loop → + │ session.send_realtime_input(video=Blob) + └─ state: → _STATE_PENDING → _send_state_loop → + session.send_realtime_input(text=…) +``` + ## Quick start (on the robot) @@ -42,13 +65,14 @@ Then open `http://:8000` in a browser. | `config.py` | Runtime constants derived from `config/*_config.json`. | | `config/` | Per-subsystem JSON config: `core`, `voice`, `gemini`, `motion`, `dashboard`, `local`. | | `core/` | Brain, skill registry, event bus, config loader, logger. | -| `gemini/` | Gemini Live — `client.py` (one-shot), `script.py` (live brain), `subprocess.py` (supervisor). | +| `gemini/` | Gemini Live — `client.py` (one-shot), `script.py` (live brain: audio + video + motion-state), `subprocess.py` (supervisor + stdin frame/state push). | | `voice/` | `sanad_voice.py` (subprocess entry), `audio_io.py` (mic/speaker), `audio_manager.py`, `local_tts.py`, `live_voice_loop.py`, `typed_replay.py`, `wake_phrase_manager.py`, `text_utils.py`, `model_script.py` (brain template). | +| `vision/` | `camera.py` (RealSense/USB capture daemon, auto-reconnect), `face_gallery.py` (`data/faces/` CRUD), `recognition_state.py` (toggle state file I/O). | | `local/` | Offline pipeline skeleton — Silero VAD, Whisper, Qwen (via Ollama), CosyVoice2. Opt-in via `SANAD_VOICE_BRAIN=local`. | | `motion/` | `arm_controller.py` (main), `sanad_arm_controller.py`, `macro_player.py`, `macro_recorder.py`, `teaching.py`. | | `dashboard/` | FastAPI routes (`dashboard/routes/*.py`) + static UI (`dashboard/static/index.html`). | -| `scripts/` | Persona files — `sanad_script.txt` (voice persona), `sanad_rule.txt`, `sanad_arm.txt` (voice→arm phrases). | -| `data/` | Runtime state — `audio/` (typed-replay WAVs), `motions/` (arm JSONL files), `recordings/` (live-captured turns), `motions/config.json` (dashboard-editable settings). | +| `scripts/` | Persona files — `sanad_v2` (voice persona), `sanad_rule.txt`, `sanad_arm.txt` (voice→arm phrases). | +| `data/` | Runtime state — `audio/` (typed-replay WAVs), `motions/` (arm JSONL files), `recordings/` (live-captured turns), `faces/face_{id}/` (enrolled face galleries), `.recognition_state.json` (vision/face-rec toggle state), `motions/config.json` (dashboard-editable settings). | | `model/` | Place for local SpeechT5 / CosyVoice2 weights when using offline pipeline. | | `logs/` | Per-module rotating logs. | @@ -64,6 +88,16 @@ Then open `http://:8000` in a browser. | `SANAD_LIVE_SCRIPT` | path | auto | Override the subprocess entry script path. | | `SANAD_RECORD` | `0` or `1` | `1` | Record every Gemini turn to `data/recordings/`. | | `SANAD_AEC_ENABLE` | `0` or `1` | `1` | Enable WebRTC AEC3 (if the Python binding is installed). | +| `SANAD_VISION_ENABLE` | `0` or `1` | `0` | Boot default for camera vision. **Runtime truth is the Recognition-tab toggle** → `data/.recognition_state.json`, hot-applied without a restart. | +| `SANAD_FACE_RECOGNITION_ENABLE` | `0` or `1` | `0` | Boot default for Gemini-side face recognition. Also a hot toggle. | +| `SANAD_VISION_SEND_HZ` | float | `2` | Frames/sec the Gemini child relays to Live. | +| `SANAD_CAMERA_WIDTH` / `_HEIGHT` / `_FPS` | int | `424` / `240` / `15` | Capture profile. Also settable per-deploy in `config/core_config.json > camera`. | +| `SANAD_FACES_MAX_SAMPLES` | int | `3` | Max photos per person fed into the gallery primer turn (token budget). | + +> All `SANAD_VISION_*` / `SANAD_CAMERA_*` / `SANAD_FACE_*` vars are **boot +> defaults** forwarded to the Gemini child via `LIVE_TUNE`. Once running, +> the Recognition tab's toggles are the live source of truth — they write +> `data/.recognition_state.json`, which the child polls at 1 Hz. ## Dashboard features @@ -83,6 +117,25 @@ Quick-fire SDK + JSONL arm actions (chip buttons), gestural speaking toggle. - **Replay Manager** — upload `.jsonl` files, test-play with speed, Teaching Mode (kinesthetic record). - **Macro Recorder** — Record new audio+motion pair, OR pick any WAV + any motion (SDK or JSONL) and Play them in parallel. +### Recognition +Camera vision + Gemini-side face recognition. Both are **off by default**; +each is a **hot toggle** — flipping it takes effect on the running Gemini +session within ~1 s, no restart. + +- **Camera Vision** — when on, the `CameraDaemon` captures from a RealSense + (preferred) or USB camera and the supervisor streams JPEG frames to + Gemini Live so it can answer "what do you see?". Live preview panel. +- **Face Recognition** — manage `data/faces/face_{id}/` galleries: enroll + from the live camera or upload photos, rename, download (per-photo or + ZIP), delete. On a session start (and on any gallery change) the child + sends a **primer turn** carrying every enrolled face + a Khaleeji + greeting instruction — Gemini itself does the matching in-context, so + there's **no local face-recognition model**. Recognition needs vision on. +- **Sync Gallery** — force-resend the primer to the live session. + +The camera daemon auto-reconnects on USB unplug / stalled frames and warns +if a RealSense negotiated USB 2.0 (Marcus-ported resilience). + ### Recordings Skill Registry (predefined audio+motion skills from `skills.json`) + Saved Records (Gemini turn recordings). @@ -94,6 +147,48 @@ Skill Registry (predefined audio+motion skills from `skills.json`) + Saved Recor - **Supervisor contract**: each brain ships a sibling supervisor (e.g., `gemini/subprocess.py`) that spawns `sanad_voice.py` with its `SANAD_VOICE_BRAIN` env var and parses the brain's log markers. Template: `voice/model_subprocess.py`. - **Audio routing**: the G1's platform-sound PulseAudio sink is NOT wired to a physical speaker. All dashboard-triggered playback (`play_wav`, typed-replay audio, record playback) routes through DDS `AudioClient.PlayStream` via `audio_manager._play_pcm_via_g1`. The PyAudio path is kept as a desktop/dev fallback only. - **Arm replay**: `motion/arm_controller.py:_replay_file_inner()` is a verbatim port of `G1_Lootah/Manual_Recorder/g1_replay_v4_stable.py:Run()` — ramp-in → settle hold → playback → smooth return → disable SDK. Cancel breaks the play loop; `_return_home()` runs unconditionally afterwards for a jerk-free return. +- **Camera frame transport (stdin push)**: the `CameraDaemon` lives in the parent and caches frames in memory. `GeminiSubprocess` runs a `_frame_forwarder` thread that base64-encodes the latest frame and writes `frame:\n` to the child's stdin (~2 fps). The child's `_stdin_watcher` thread decodes into `_LATEST_FRAME`; `_send_frame_loop` relays it to Gemini Live with a staleness guard. This is the Marcus pattern — chosen over a file drop so the parent owns the camera once and the dashboard preview reads the same in-memory cache. +- **Motion-state channel**: `arm_controller._execute()` emits `motion.action_started` / `_done` / `_error` on the event bus. `main.py` forwards each to `live_sub.send_state()`, which writes `state:\n` to the child's stdin. The child injects `[STATE-START] wave_hand`, `[STATE-DONE] wave_hand (2.3s)`, etc. into Gemini Live as silent text context (`send_realtime_input(text=…)`) so it can honestly answer "what are you doing?". +- **Face recognition is Gemini-side**: no dlib/insightface/onnxruntime. `vision/face_gallery.py` is pure file IO over `data/faces/face_{id}/` (`face_N.jpg|png` samples + optional `meta.json` with a `name`). At session start (and on any gallery change) `gemini/script.py:_send_gallery_primer()` builds one multimodal `send_client_content` turn — every enrolled face's photos + a greeting instruction — and Gemini matches incoming frames against it in-context. + + +## Camera vision on Jetson + +The Recognition tab needs `pyrealsense2` to talk to the Intel RealSense. +**Do not `pip install pyrealsense2` on JetPack 5** — the PyPI wheel is built +against glibc 2.32+ (Ubuntu 22.04) and fails to load on JetPack 5's glibc +2.31 with `ImportError: ... version 'GLIBC_2.32' not found`. + +The native runtime is already there (`apt`-installed `librealsense2`). Build +just the Python binding from source against it, into the `gemini_sdk` env: + +```bash +rs-enumerate-devices # confirm the D435I shows up at OS level first + +source ~/miniconda3/etc/profile.d/conda.sh && conda activate gemini_sdk +pip uninstall -y pyrealsense2 # remove the broken wheel if present +sudo apt install -y cmake build-essential git python3-dev libusb-1.0-0-dev pkg-config libssl-dev + +cd /tmp && rm -rf librealsense +git clone --depth=1 --branch v2.56.5 https://github.com/IntelRealSense/librealsense.git +cd librealsense && mkdir -p build && cd build +cmake .. -DBUILD_PYTHON_BINDINGS=ON -DPYTHON_EXECUTABLE=$(which python3) \ + -DBUILD_EXAMPLES=OFF -DBUILD_GRAPHICAL_EXAMPLES=OFF \ + -DBUILD_UNIT_TESTS=OFF -DCHECK_FOR_UPDATES=OFF -DCMAKE_BUILD_TYPE=Release +make -j$(nproc) pyrealsense2 +SITE=$(python3 -c "import sysconfig; print(sysconfig.get_paths()['purelib'])") +mkdir -p "$SITE/pyrealsense2" +cp wrappers/python/pyrealsense2*.so "$SITE/pyrealsense2/" +cp ../wrappers/python/pyrealsense2/__init__.py "$SITE/pyrealsense2/" 2>/dev/null || true + +python3 -c 'import pyrealsense2 as rs; print([d.get_info(rs.camera_info.name) for d in rs.context().query_devices()])' +``` + +Match the `--branch` tag to the installed runtime (`dpkg -l | grep librealsense2`). +If the build isn't worth it, `CameraDaemon` falls back to `cv2.VideoCapture(0)` +automatically — fine for a plain USB webcam, but note a RealSense exposes its +*depth* stream at `/dev/video0`, not RGB, so a real USB cam is the cleaner +fallback. On x86_64 / Ubuntu 22.04+ desktops, `pip install pyrealsense2` just works. ## Dynamic paths @@ -133,7 +228,10 @@ Then on the robot: `Ctrl+C` the running `main.py` and re-run. | Record playback silent | `audio_mgr.play_wav` only routes to G1 DDS if the Unitree SDK is importable; on desktop it falls back to the PulseAudio sink. | | Live Voice Commands transcript stuck | Deferred trigger was queued but `trigger_enabled` toggle was off. Toggle on — or the pending-trigger poll now fires it automatically once enabled. | | Gemini "no audio" on Typed Replay | Non-deterministic; the retry chain in `voice/typed_replay.py:generate_audio` tries three prompt variants. For reliable TTS, use the offline `local_tts` SpeechT5 path. | -| Dashboard `Not Found` 404s for `/api/vision/*` | Vision module was deleted; HTML still has stale fetches for a few endpoints. Cosmetic — `dashboard/static/index.html` init block already skips most. | +| Recognition tab: "Camera could not start (no backend)" | No camera backend acquired. Check `rs-enumerate-devices` (RealSense at OS level) and `python3 -c 'import pyrealsense2'` in the `gemini_sdk` env. The glibc `ImportError` means the pip wheel is incompatible — see "Camera vision on Jetson" above. | +| Camera badge stuck on "reconnecting…" | `CameraDaemon` lost the device and is retrying with exponential backoff. Re-seat the USB 3 cable; check `logs/camera.log` for the USB-2.0 warning. | +| Gemini doesn't greet an enrolled face | Face Recognition toggle on? Vision on? (Face rec needs frames.) Check `logs/gemini_brain.log` for `face gallery primed: N person(s)`. Hit "Sync Gallery" to force a re-prime. | +| Gemini unaware of motion state | The `motion.action_*` → `send_state` chain only runs when Live Gemini is up. Check `logs/gemini_subprocess.log` and `logs/gemini_brain.log` for `STATE injected:` lines. | ## License / attribution @@ -142,4 +240,7 @@ Internal project for YS Lootah Technology. Reuses/ports patterns from: - `G1_Lootah/Manual_Recorder/g1_replay_v4_stable.py` (arm replay math) - `SanadVoice/gemini_interact` (arm-phrase dispatch, skill registry) - `SanadVoice/gemini_voice_v2` (local SpeechT5 TTS) +- `Project/Marcus` — camera→Gemini stdin-push transport, motion-state + injection, camera daemon resilience (auto-reconnect, USB-2.0 warning), + and the `API/camera_api.py` cache shape (`get_frame_b64` / `get_fresh_frame`). - Unitree `unitree_sdk2py` (G1 low-level SDK, LocoClient, G1ArmActionClient) diff --git a/config.py b/config.py index 376eef4..a25c673 100644 --- a/config.py +++ b/config.py @@ -341,6 +341,24 @@ LIVE_TUNE: dict[str, str] = { # G1 built-in mic — UDP multicast 239.168.123.161:5555. # Requires wake-up conversation mode ON in Unitree app. "SANAD_USE_G1_MIC": "1", + + # ── Recognition (camera vision + face recognition) ── + # All of these are BOOT defaults. The runtime source of truth is the + # state file data/.recognition_state.json — toggled live from the + # Recognition tab and polled by the Gemini child at 1 Hz. + "SANAD_VISION_ENABLE": "0", + "SANAD_VISION_SEND_HZ": "2", + "SANAD_VISION_STALE_MS": "1500", + "SANAD_CAMERA_WIDTH": "424", + "SANAD_CAMERA_HEIGHT": "240", + "SANAD_CAMERA_FPS": "15", + "SANAD_CAMERA_JPEG_QUALITY": "70", + "SANAD_FACE_RECOGNITION_ENABLE": "0", + "SANAD_FACES_DIR": str(DATA_DIR / "faces"), + "SANAD_FACES_MAX_SAMPLES": "3", + "SANAD_FACES_PRIMER_RESIZE": "256", + "SANAD_RECOGNITION_STATE_PATH": str(DATA_DIR / ".recognition_state.json"), + "SANAD_RECOGNITION_POLL_S": "1.0", } # -- Camera -- diff --git a/config/core_config.json b/config/core_config.json index 16b72f1..be31af4 100644 --- a/config/core_config.json +++ b/config/core_config.json @@ -76,5 +76,26 @@ "dds": { "network_interface_default": "eth0" + }, + + "camera": { + "_comment": "Recognition tab camera daemon (parent process reads this). width/height/fps/jpeg_quality + the reconnect knobs configure CameraDaemon. Frames are cached in memory and pushed to the Gemini child over its stdin (no file drop). send_hz/stale_ms are read by the Gemini child via SANAD_VISION_SEND_HZ / SANAD_VISION_STALE_MS env vars (LIVE_TUNE).", + "width": 424, + "height": 240, + "fps": 15, + "jpeg_quality": 70, + "send_hz": 2, + "stale_ms": 1500, + "stale_threshold_s": 10.0, + "reconnect_min_s": 2.0, + "reconnect_max_s": 10.0, + "capture_timeout_ms": 5000 + }, + + "faces": { + "_comment": "Face gallery for Gemini-side recognition. Folder layout: data/faces/face_{id}/{face_1.jpg, ...} + optional meta.json {\"name\": \"...\"}. Gemini does the matching — no local ML model.", + "dir_rel": "data/faces", + "max_samples_per_face": 3, + "primer_resize_long_side": 256 } } diff --git a/config/gemini_config.json b/config/gemini_config.json index 4f56f09..39c84ca 100644 --- a/config/gemini_config.json +++ b/config/gemini_config.json @@ -10,12 +10,13 @@ }, "subprocess": { - "_comment": "gemini/subprocess.py — GeminiSubprocess supervisor. Spawns voice/sanad_voice.py as a child, tails stdout for Gemini-specific log markers, exposes transcript + state to the dashboard.", + "_comment": "gemini/subprocess.py — GeminiSubprocess supervisor. Spawns voice/sanad_voice.py as a child, tails stdout for Gemini-specific log markers, pushes camera frames + motion state to the child over its stdin, exposes transcript + state to the dashboard.", "log_tail_size": 2000, "transcript_tail_size": 30, "log_name": "gemini_subprocess", "stop_timeout_sec": 3.0, "terminate_timeout_sec": 2.0, + "frame_forward_interval_sec": 0.5, "noisy_prefixes": [ "ALSA lib ", "Expression 'alsa_", diff --git a/dashboard/app.py b/dashboard/app.py index 87ddefb..5de8495 100644 --- a/dashboard/app.py +++ b/dashboard/app.py @@ -50,6 +50,7 @@ _REST_ROUTES: list[tuple[str, str, str]] = [ ("live_voice", "/api/live-voice", "live-voice"), ("live_subprocess", "/api/live-subprocess", "live-subprocess"), ("typed_replay", "/api/typed-replay", "typed-replay"), + ("recognition", "/api/recognition", "recognition"), ] _WS_ROUTES: list[str] = ["log_stream"] diff --git a/dashboard/routes/recognition.py b/dashboard/routes/recognition.py new file mode 100644 index 0000000..b1f9d60 --- /dev/null +++ b/dashboard/routes/recognition.py @@ -0,0 +1,457 @@ +"""Recognition tab — camera vision + face gallery + hot toggles. + +Single router covering: + - Vision / Face Recognition toggles (hot — no Gemini restart needed) + - Live camera preview (latest JPEG drop) + - Face gallery CRUD: enroll, upload, capture, rename, delete, ZIP + - Per-photo download + delete + +Toggle changes write data/.recognition_state.json atomically. The Gemini +child polls that file at 1 Hz and applies changes mid-session. +""" + +from __future__ import annotations + +import io +from typing import Optional + +from fastapi import APIRouter, File, HTTPException, Query, UploadFile +from fastapi.responses import FileResponse, Response, StreamingResponse +from pydantic import BaseModel + +from Project.Sanad.config import BASE_DIR +from Project.Sanad.core.logger import get_logger +from Project.Sanad.dashboard.routes._safe_io import check_upload_size +from Project.Sanad.vision import recognition_state + +log = get_logger("recognition_routes") + +router = APIRouter() + + +# ── paths (resolved from BASE_DIR) ────────────────────────── + +STATE_PATH = BASE_DIR / "data" / ".recognition_state.json" +FACES_DIR = BASE_DIR / "data" / "faces" + +ALLOWED_IMAGE_EXTS = {".jpg", ".jpeg", ".png"} + + +# ── helpers ───────────────────────────────────────────────── + +def _get_camera(): + """Lazy import to avoid circular import on dashboard load.""" + try: + from Project.Sanad.main import camera # type: ignore + return camera + except Exception: + return None + + +def _get_gallery(): + """Lazy import — same reason.""" + try: + from Project.Sanad.main import gallery # type: ignore + return gallery + except Exception: + return None + + +def _bump_and_write_state(**changes) -> recognition_state.RecognitionState: + """Apply changes (vision_enabled, face_rec_enabled) and persist.""" + return recognition_state.mutate(STATE_PATH, **changes) + + +def _bump_gallery_version() -> int: + cur = recognition_state.read(STATE_PATH) + new_version = cur.gallery_version + 1 + recognition_state.mutate(STATE_PATH, gallery_version=new_version) + return new_version + + +# ── state + toggles ───────────────────────────────────────── + +@router.get("/state") +async def get_state(): + """Return the current toggle/camera/gallery state.""" + st = recognition_state.read(STATE_PATH) + cam = _get_camera() + gallery = _get_gallery() + faces_count = 0 + photos_count = 0 + if gallery is not None: + try: + entries = gallery.list() + faces_count = len(entries) + photos_count = sum(len(e.sample_paths) for e in entries) + except Exception: + pass + return { + "vision_enabled": st.vision_enabled, + "face_rec_enabled": st.face_rec_enabled, + "gallery_version": st.gallery_version, + "camera": cam.status() if cam is not None else { + "running": False, "backend": None, "error": "camera subsystem unavailable" + }, + "faces_count": faces_count, + "photos_count": photos_count, + } + + +@router.post("/vision") +async def set_vision(on: bool = Query(...)): + """Enable / disable camera vision (hot — no Gemini restart).""" + cam = _get_camera() + if cam is None: + log.warning("vision toggle requested but camera subsystem unavailable") + raise HTTPException(503, "Camera subsystem not available.") + + if on and not cam.is_running(): + ok = cam.start() + if not ok: + log.warning("vision ON requested but camera.start() failed: %s", + cam.error or "no backend") + _bump_and_write_state(vision_enabled=False) + raise HTTPException(503, + f"Camera could not start (no backend). {cam.error or ''}") + elif (not on) and cam.is_running(): + cam.stop() + + st = _bump_and_write_state(vision_enabled=bool(on)) + log.info("vision %s (backend=%s)", "ON" if on else "OFF", + cam.backend if cam.is_running() else "none") + return {"ok": True, "vision_enabled": st.vision_enabled, + "camera": cam.status()} + + +@router.post("/face-rec") +async def set_face_rec(on: bool = Query(...)): + """Enable / disable face recognition (hot — no Gemini restart). + + The Gemini child picks the change up within ~1 s: ON re-sends the + gallery primer and tells Gemini it can recognise people; OFF tells + Gemini to disregard the gallery and stop identifying anyone. Both + take effect on the live session — no reconnect needed. + """ + st = _bump_and_write_state(face_rec_enabled=bool(on)) + log.info("face recognition %s", "ON" if on else "OFF") + return {"ok": True, "face_rec_enabled": st.face_rec_enabled} + + +@router.post("/sync") +async def sync_gallery(): + """Bump gallery_version so the child re-sends the primer if face-rec is ON.""" + v = _bump_gallery_version() + log.info("gallery sync requested → v.%d", v) + return {"ok": True, "gallery_version": v} + + +# ── live preview ──────────────────────────────────────────── + +@router.get("/frame.jpg") +async def latest_frame(): + """Serve the most recent camera frame straight from the daemon's + in-memory cache (no file drop — frames are also pushed to the Gemini + child over its stdin).""" + cam = _get_camera() + if cam is None: + raise HTTPException(503, "Camera subsystem unavailable.") + jpeg = cam.snapshot_jpeg() + if not jpeg: + raise HTTPException(404, "No frame captured yet.") + return Response( + content=jpeg, + media_type="image/jpeg", + headers={"Cache-Control": "no-store, must-revalidate"}, + ) + + +# ── camera resolution / quality ───────────────────────────── + +class CameraConfigPayload(BaseModel): + width: Optional[int] = None + height: Optional[int] = None + fps: Optional[int] = None + jpeg_quality: Optional[int] = None + + +@router.post("/camera-config") +async def set_camera_config(payload: CameraConfigPayload): + """Hot-swap the camera capture profile (resolution / fps / JPEG quality). + + If the camera is running, CameraDaemon.reconfigure() rebuilds the + pipeline at the new profile (~0.5 s gap). If idle, the values just + take effect on the next start. Bounds are sanity-checked here so a + fat-fingered value can't wedge the daemon.""" + cam = _get_camera() + if cam is None: + raise HTTPException(503, "Camera subsystem unavailable.") + if payload.width is not None and not (160 <= payload.width <= 1920): + raise HTTPException(400, "width out of range (160–1920)") + if payload.height is not None and not (120 <= payload.height <= 1080): + raise HTTPException(400, "height out of range (120–1080)") + if payload.fps is not None and not (1 <= payload.fps <= 60): + raise HTTPException(400, "fps out of range (1–60)") + if payload.jpeg_quality is not None and not (10 <= payload.jpeg_quality <= 95): + raise HTTPException(400, "jpeg_quality out of range (10–95)") + profile = cam.reconfigure( + width=payload.width, height=payload.height, + fps=payload.fps, jpeg_quality=payload.jpeg_quality, + ) + log.info("camera reconfigured via dashboard → %s", profile) + return {"ok": True, "profile": profile, "camera": cam.status()} + + +# ── face gallery routes ───────────────────────────────────── + +def _validate_image(content: bytes, filename: str | None = None) -> None: + """Reject non-JPEG/PNG content + oversize uploads.""" + check_upload_size(content) + if len(content) < 16: + raise HTTPException(400, "Image too small / empty.") + is_jpeg = content[:3] == b"\xff\xd8\xff" + is_png = content[:8] == b"\x89PNG\r\n\x1a\n" + if not (is_jpeg or is_png): + raise HTTPException( + 400, + f"Only JPEG/PNG accepted (got {filename or 'unknown'}).", + ) + + +def _entry_to_dict(entry) -> dict: + photos = [] + for p in entry.sample_paths: + try: + photos.append({"name": p.name, "size_bytes": p.stat().st_size}) + except OSError: + continue + return { + "id": entry.id, + "name": entry.name, + "description": entry.description, + "added_at": entry.added_at, + "photos": photos, + } + + +@router.get("/faces") +async def list_faces(): + gallery = _get_gallery() + if gallery is None: + raise HTTPException(503, "Face gallery subsystem unavailable.") + entries = gallery.list() + return {"faces": [_entry_to_dict(e) for e in entries], + "total": len(entries)} + + +class RenamePayload(BaseModel): + name: Optional[str] = None + + +class DescribePayload(BaseModel): + description: Optional[str] = None + + +@router.post("/faces/enroll") +async def enroll_from_camera(name: Optional[str] = Query(default=None), + description: Optional[str] = Query(default=None)): + """Create a new face from the camera's latest snapshot.""" + gallery = _get_gallery() + if gallery is None: + raise HTTPException(503, "Face gallery subsystem unavailable.") + cam = _get_camera() + if cam is None or not cam.is_running(): + raise HTTPException(409, "Camera is not running. Toggle Vision ON first.") + # get_fresh_frame waits briefly for a current frame so the enrolled + # photo is the scene the user is posing for, not a stale buffer. + jpeg = cam.get_fresh_frame(max_age_s=0.5, timeout_s=1.5) + if not jpeg: + raise HTTPException(409, "Camera has no frame yet. Wait a moment and retry.") + entry = gallery.create_face( + [jpeg], + name=name.strip() if name else None, + description=description.strip() if description else None, + ) + v = _bump_gallery_version() + log.info("enrolled face_%d via camera (name=%s, desc=%s, v.%d)", + entry.id, name or "(unnamed)", + "yes" if description else "no", v) + return {"ok": True, "face": _entry_to_dict(entry)} + + +@router.post("/faces/upload") +async def enroll_from_upload( + files: list[UploadFile] = File(...), + name: Optional[str] = Query(default=None), + description: Optional[str] = Query(default=None), +): + """Create a new face from uploaded image file(s).""" + gallery = _get_gallery() + if gallery is None: + raise HTTPException(503, "Face gallery subsystem unavailable.") + if not files: + raise HTTPException(400, "At least one image file required.") + image_bytes: list[bytes] = [] + for f in files: + content = await f.read() + _validate_image(content, f.filename) + image_bytes.append(content) + entry = gallery.create_face( + image_bytes, + name=name.strip() if name else None, + description=description.strip() if description else None, + ) + v = _bump_gallery_version() + log.info("enrolled face_%d via upload (%d photos, name=%s, desc=%s, v.%d)", + entry.id, len(image_bytes), name or "(unnamed)", + "yes" if description else "no", v) + return {"ok": True, "face": _entry_to_dict(entry)} + + +@router.post("/faces/{face_id}/capture") +async def capture_to_face(face_id: int): + """Add a new sample (from the camera) to an existing face.""" + gallery = _get_gallery() + if gallery is None: + raise HTTPException(503, "Face gallery subsystem unavailable.") + cam = _get_camera() + if cam is None or not cam.is_running(): + raise HTTPException(409, "Camera is not running. Toggle Vision ON first.") + jpeg = cam.get_fresh_frame(max_age_s=0.5, timeout_s=1.5) + if not jpeg: + raise HTTPException(409, "Camera has no frame yet.") + try: + fname = gallery.add_photo(face_id, jpeg) + except FileNotFoundError as exc: + raise HTTPException(404, str(exc)) + v = _bump_gallery_version() + log.info("captured new photo for face_%d → %s (v.%d)", face_id, fname, v) + return {"ok": True, "added": fname, "face": _entry_to_dict(gallery.get(face_id))} + + +@router.post("/faces/{face_id}/upload") +async def upload_to_face(face_id: int, files: list[UploadFile] = File(...)): + """Add one or more uploaded samples to an existing face.""" + gallery = _get_gallery() + if gallery is None: + raise HTTPException(503, "Face gallery subsystem unavailable.") + if gallery.get(face_id) is None: + raise HTTPException(404, f"face_{face_id} not found") + added: list[str] = [] + for f in files: + content = await f.read() + _validate_image(content, f.filename) + try: + fname = gallery.add_photo(face_id, content) + added.append(fname) + except FileNotFoundError as exc: + raise HTTPException(404, str(exc)) + v = _bump_gallery_version() + log.info("uploaded %d photo(s) to face_%d (v.%d)", len(added), face_id, v) + return {"ok": True, "added": added, + "face": _entry_to_dict(gallery.get(face_id))} + + +@router.post("/faces/{face_id}/rename") +async def rename_face(face_id: int, payload: RenamePayload): + gallery = _get_gallery() + if gallery is None: + raise HTTPException(503, "Face gallery subsystem unavailable.") + try: + gallery.rename(face_id, payload.name) + except FileNotFoundError as exc: + raise HTTPException(404, str(exc)) + v = _bump_gallery_version() + log.info("renamed face_%d → %s (v.%d)", face_id, + payload.name or "(unnamed)", v) + return {"ok": True, "face": _entry_to_dict(gallery.get(face_id))} + + +@router.post("/faces/{face_id}/describe") +async def describe_face(face_id: int, payload: DescribePayload): + """Set / clear a face's free-text description. The description is + folded into the Gemini primer turn so Gemini can reference it.""" + gallery = _get_gallery() + if gallery is None: + raise HTTPException(503, "Face gallery subsystem unavailable.") + try: + gallery.set_description(face_id, payload.description) + except FileNotFoundError as exc: + raise HTTPException(404, str(exc)) + v = _bump_gallery_version() + log.info("described face_%d (%s, v.%d)", face_id, + "set" if payload.description else "cleared", v) + return {"ok": True, "face": _entry_to_dict(gallery.get(face_id))} + + +@router.delete("/faces/{face_id}") +async def delete_face(face_id: int): + gallery = _get_gallery() + if gallery is None: + raise HTTPException(503, "Face gallery subsystem unavailable.") + try: + gallery.delete_face(face_id) + except FileNotFoundError as exc: + raise HTTPException(404, str(exc)) + v = _bump_gallery_version() + log.info("deleted face_%d (v.%d)", face_id, v) + return {"ok": True, "deleted": face_id} + + +@router.delete("/faces/{face_id}/photo/{photo_name}") +async def delete_photo(face_id: int, photo_name: str): + gallery = _get_gallery() + if gallery is None: + raise HTTPException(503, "Face gallery subsystem unavailable.") + # safe filename — only allow simple file names, no traversal + if "/" in photo_name or ".." in photo_name or "\x00" in photo_name: + raise HTTPException(400, "Invalid photo name.") + try: + gallery.delete_photo(face_id, photo_name) + except FileNotFoundError as exc: + raise HTTPException(404, str(exc)) + except ValueError as exc: + raise HTTPException(400, str(exc)) + v = _bump_gallery_version() + log.info("deleted photo %s from face_%d (v.%d)", photo_name, face_id, v) + return {"ok": True, "deleted": photo_name} + + +@router.get("/faces/{face_id}/photo/{photo_name}") +async def get_photo(face_id: int, photo_name: str, + download: int = Query(default=0)): + """Serve a single photo. Add ?download=1 for attachment disposition.""" + gallery = _get_gallery() + if gallery is None: + raise HTTPException(503, "Face gallery subsystem unavailable.") + if "/" in photo_name or ".." in photo_name or "\x00" in photo_name: + raise HTTPException(400, "Invalid photo name.") + path = gallery.get_photo(face_id, photo_name) + if path is None: + raise HTTPException(404, "Photo not found.") + media = "image/png" if path.suffix.lower() == ".png" else "image/jpeg" + headers = {} + if download: + headers["Content-Disposition"] = ( + f'attachment; filename="face_{face_id}_{photo_name}"' + ) + return FileResponse(path, media_type=media, headers=headers) + + +@router.get("/faces/{face_id}/download.zip") +async def download_face_zip(face_id: int): + gallery = _get_gallery() + if gallery is None: + raise HTTPException(503, "Face gallery subsystem unavailable.") + try: + data = gallery.zip_face(face_id) + except FileNotFoundError as exc: + raise HTTPException(404, str(exc)) + return StreamingResponse( + io.BytesIO(data), + media_type="application/zip", + headers={ + "Content-Disposition": f'attachment; filename="face_{face_id}.zip"', + "Content-Length": str(len(data)), + }, + ) diff --git a/dashboard/static/index.html b/dashboard/static/index.html index 73f6ceb..086b375 100644 --- a/dashboard/static/index.html +++ b/dashboard/static/index.html @@ -123,7 +123,7 @@
Operations
Voice & Audio
Motion & Replay
-
Camera & Vision
+
Recognition
Recordings
Settings & Logs
@@ -211,23 +211,6 @@
- -
-

Live Camera

- -
- - - - -
-
- - - -
-
-

Quick Actions

@@ -288,6 +271,7 @@
+ @@ -470,78 +454,102 @@
- -
+ +
- -
-

Camera Device

-
- - - -
-
-
- All plugged cameras -
-
-
- Pin RealSense serial to slot -
- - - -
-
Use this when you have two RealSense units and want to lock which one is "primary".
-
-
- - - -
-
-
- - -
-

Camera Configuration

-
-
- - x - @ - fps - -
-
- -
- - -
-

YOLO Vision Detector

-
- - - -
-
-
- - +
-

Photo Gallery

-
- - - - +

Camera Vision & Face Recognition

+
+
+ + + -- +
+
+ + + -- +
+
- +
+ Toggles take effect within ~1 second on the running Gemini session — no restart required. +
+
+ + +
+

Live Preview

+
+ +
Camera off — toggle Vision ON to see the live feed.
+
+
--
+
+
Resolution / FPS
+
+ + + + + + +
+
JPEG Quality
+
+ + + +
+
+
+ Each button rebuilds the capture pipeline (~0.5 s). Modes match the + RealSense D435I colour sensor — on USB 2.x, stick to 424×240 or 640×480. + If the feed is grayscale/IR, pin the colour node with SANAD_CAMERA_USB_INDEX. +
+
+ + +
+

Add New Face

+
+ + +
+
+ + +
+
+ + +
+
+ Tip: add 2–3 photos / different angles per person for best recognition. + The description is sent to Gemini with the photos — it can then greet + and talk about the person using what you wrote. +
+
+ + +
+

Enrolled Faces

+
+ + +
+
Loading…
@@ -621,7 +629,14 @@ function btnDone(b){if(b&&b.classList)b.classList.remove('loading');} async function api(m,p,b){const o={method:m,headers:{'Content-Type':'application/json'}};if(b)o.body=JSON.stringify(b);const r=await fetch(API+p,o);const j=await r.json();if(!r.ok){toast(j.detail||j.error||'Error '+r.status,'err');throw new Error(j.detail||j.error);}return j;} // Tabs -function switchTab(name){document.querySelectorAll('.tab').forEach(t=>t.classList.toggle('active',t.textContent.toLowerCase().includes(name.slice(0,4))));document.querySelectorAll('.tab-content').forEach(c=>c.classList.toggle('active',c.id==='tab-'+name));} +function switchTab(name){ + // Match the nav tab by its exact onclick target — NOT a substring of the + // label. "recognition" and "recordings" both start with "reco", so the old + // textContent.includes(name.slice(0,4)) lit up both tabs at once. + const want="switchTab('"+name+"')"; + document.querySelectorAll('.tab').forEach(t=>t.classList.toggle('active',(t.getAttribute('onclick')||'').includes(want))); + document.querySelectorAll('.tab-content').forEach(c=>c.classList.toggle('active',c.id==='tab-'+name)); +} // Emergency Stop async function emergencyStop(){try{await api('POST','/api/replay/cancel');await api('POST','/api/live-voice/stop');toast('EMERGENCY STOP sent','err');}catch(e){}} @@ -894,155 +909,6 @@ async function applyManualAudio(b){ refreshAudioDevices(); } -// Camera -async function capturePhoto(b){btnLoad(b);try{await api('POST','/api/vision/capture');toast('Photo captured','ok');refreshPhotos();}catch(e){}btnDone(b);} -async function captureWithMotion(b){btnLoad(b);const g=document.getElementById('capture-gesture').value;const url=g?'/api/vision/capture?motion_file='+encodeURIComponent(g):'/api/vision/capture';try{await api('POST',url);toast('Captured'+(g?' + gesture':''),'ok');refreshPhotos();}catch(e){}btnDone(b);} -async function refreshCamSources(){try{const r=await api('GET','/api/vision/cameras');const sel=document.getElementById('cam-source');sel.innerHTML=''+(r||[]).map(c=>``).join('');}catch(e){}} -async function populateGestureSelect(){try{const r=await api('GET','/api/replay/files');const sel=document.getElementById('capture-gesture');sel.innerHTML=''+(r.files||[]).map(f=>``).join('');}catch(e){}} -async function setCamSource(v){if(v)try{await api('POST','/api/vision/set-source',{source:v});toast('Camera source set','ok');}catch(e){}} -async function setCamRes(){const w=+document.getElementById('cam-w').value,h=+document.getElementById('cam-h').value,f=+document.getElementById('cam-fps').value;try{await api('POST','/api/vision/set-resolution',{width:w,height:h,fps:f});toast(`${w}x${h}@${f}fps`,'ok');}catch(e){}} -async function setPreferredCam(){const s=document.getElementById('cam-serial').value;if(s)try{await api('POST','/api/vision/set-preferred-camera',{serial:s});toast('Saved','ok');}catch(e){}} - -// Photos -async function refreshPhotos(){try{const r=await api('GET','/api/vision/photos');const el=document.getElementById('photo-gallery');document.getElementById('photo-count').textContent=`${r.total} photos`;if(!r.photos?.length){el.innerHTML='
No photos yet
';return;}el.innerHTML=r.photos.map(p=>{const n=esc(p.name);return``;}).join('');}catch(e){}} -async function deletePhoto(n){try{await api('DELETE','/api/vision/photos/'+encodeURIComponent(n));toast('Deleted','ok');refreshPhotos();}catch(e){}} -function downloadAllPhotos(){window.open('/api/vision/photos/download-zip');} -async function clearPhotos(){if(confirm('Delete ALL photos?'))try{const r=await api('POST','/api/vision/photos/clear');toast(`Cleared ${r.deleted_count}`,'ok');refreshPhotos();}catch(e){}} - -// Camera devices -async function refreshCamDevices(){ - try{ - const r=await api('GET','/api/vision/devices'); - const cur=r.current||{}; - const curDev=cur.device||{}; - const curId=cur.profile?cur.profile.id:''; - const detIds=r.detected_ids||[]; - // Profile dropdown - const profSel=document.getElementById('camdev-profile'); - profSel.innerHTML=(r.profiles||[]).map(p=>{ - const avail=detIds.indexOf(p.id)>=0; - const sel=p.id===curId?' selected':''; - const tag=avail?'':' (no device)'; - return ``; - }).join(''); - // Pin profile dropdown - const pinSel=document.getElementById('camdev-pin-profile'); - pinSel.innerHTML=(r.profiles||[]).filter(p=>p.backend==='realsense').map(p=> - ``).join(''); - // Detected summary - const det=document.getElementById('camdev-detected'); - const counts=r.counts||{}; - det.innerHTML=`Detected: ${counts.realsense||0} RealSense, ${counts.v4l2||0} V4L2 (total ${counts.total||0})`; - // All devices list - const list=document.getElementById('camdev-list'); - if(!(r.all_devices||[]).length){ - list.innerHTML='
No cameras plugged
'; - }else{ - list.innerHTML=''+ - r.all_devices.map(d=>{ - const idVal=d.serial||d.device_path; - const action=d.serial - ?`` - :``; - return ``; - }).join('')+'
BackendNameSerial / Path
${esc(d.backend)}${esc(d.name||'-')}${esc(idVal||'-')}${action}
'; - } - // Status text - const st=document.getElementById('camdev-status'); - if(curDev.name){ - st.innerHTML=`Active: ${esc(curDev.name)}
`+ - (curDev.serial?`Serial: ${esc(curDev.serial)}
`:'')+ - (curDev.device_path?`Path: ${esc(curDev.device_path)}
`:'')+ - `via ${esc(cur.source_kind||'?')}`; - }else{ - st.innerHTML='No camera selected'; - } - }catch(e){} -} -async function scanCameras(b){ - if(b)btnLoad(b); - try{await api('POST','/api/vision/devices/scan');toast('Re-scanned cameras','ok');}catch(e){} - if(b)btnDone(b); - refreshCamDevices(); -} -async function selectCamProfile(profileId){ - if(!profileId)return; - try{ - await api('POST','/api/vision/devices/select-profile',{profile_id:profileId}); - toast('Camera profile switched','ok'); - }catch(e){} - refreshCamDevices(); -} -async function selectCamSerial(serial){ - if(!serial)return; - try{ - await api('POST','/api/vision/devices/select-serial',{serial}); - toast('Camera selected by serial','ok'); - }catch(e){} - refreshCamDevices(); -} -async function selectCamPath(path){ - if(!path)return; - try{ - await api('POST','/api/vision/devices/select-path',{device_path:path}); - toast('Camera selected by path','ok'); - }catch(e){} - refreshCamDevices(); -} -async function startLocalCam(b){ - btnLoad(b); - try{ - const r=await api('POST','/api/vision/local/start',{}); - if(r.ok){ - toast('Camera started: '+(r.backend||'?'),'ok'); - }else{ - toast('Camera start failed: '+(r.error||'unknown'),'err'); - } - }catch(e){} - btnDone(b); - setTimeout(refreshLocalCam, 500); -} -async function stopLocalCam(b){ - btnLoad(b); - try{await api('POST','/api/vision/local/stop');toast('Camera stopped','info');}catch(e){} - btnDone(b); - refreshLocalCam(); -} -async function refreshLocalCam(){ - try{ - const r=await api('GET','/api/vision/local/status'); - const els=[document.getElementById('local-cam-state'),document.getElementById('ops-cam-state')]; - els.forEach(el=>{ - if(!el)return; - if(r.running){ - el.textContent=(r.backend||'on')+(r.serial?(' '+r.serial.slice(-6)):'')+' '+r.width+'x'+r.height+'@'+r.fps; - el.className='badge badge-ok'; - }else if(r.last_error){ - el.textContent='error'; - el.className='badge badge-err'; - el.title=r.last_error; - }else{ - el.textContent='stopped'; - el.className='badge badge-warn'; - } - }); - }catch(e){} -} - -async function pinCamSerial(b){ - const pid=document.getElementById('camdev-pin-profile').value; - const serial=document.getElementById('camdev-pin-serial').value.trim(); - if(!pid||!serial){toast('Pick a profile and enter a serial','err');return;} - btnLoad(b); - try{ - await api('POST','/api/vision/devices/assign-serial',{profile_id:pid,serial:serial}); - toast(`Pinned ${serial} → ${pid}`,'ok'); - document.getElementById('camdev-pin-serial').value=''; - }catch(e){} - btnDone(b); - refreshCamDevices(); -} - // Motion async function toggleGestural(v){try{await api('POST','/api/motion/gestural-speaking?enabled='+v);}catch(e){}} let _armBusy=false,_runId=null; @@ -1212,10 +1078,10 @@ async function stopCombo(b){ async function refreshReplayFiles(){try{const r=await api('GET','/api/replay/files');const el=document.getElementById('replay-files');if(!(r.files||[]).length){el.innerHTML='
No motion files
';return;}el.innerHTML=''+(r.files||[]).map(f=>``).join('')+'
FileFramesDurationSize
${esc(f.name)}${f.frames}${f.duration_sec}s${f.size_kb}KB
';}catch(e){}} async function testReplay(b){const n=document.getElementById('replay-name').value,s=parseFloat(document.getElementById('replay-speed').value);if(!n)return;btnLoad(b);try{await api('POST','/api/replay/test',{name:n,speed:s});toast('Replay: '+n,'ok');pollArmBusy();}catch(e){}btnDone(b);} async function cancelReplay(){try{const r=await api('POST','/api/replay/cancel');toast(r&&r.message?r.message:'Cancelled','info');}catch(e){}} -async function deleteMotionFile(n){if(confirm('Delete '+n+'?'))try{await api('DELETE','/api/replay/files/'+encodeURIComponent(n));toast('Deleted','ok');refreshReplayFiles();populateGestureSelect();}catch(e){}} -async function uploadMotionFile(input){if(!input.files[0])return;const fd=new FormData();fd.append('file',input.files[0]);try{const r=await fetch('/api/replay/files/upload',{method:'POST',body:fd});if(!r.ok){const j=await r.json();toast(j.detail||'Upload failed','err');}else{toast('Uploaded','ok');refreshReplayFiles();populateGestureSelect();}}catch(e){toast('Upload error','err');}input.value='';} +async function deleteMotionFile(n){if(confirm('Delete '+n+'?'))try{await api('DELETE','/api/replay/files/'+encodeURIComponent(n));toast('Deleted','ok');refreshReplayFiles();}catch(e){}} +async function uploadMotionFile(input){if(!input.files[0])return;const fd=new FormData();fd.append('file',input.files[0]);try{const r=await fetch('/api/replay/files/upload',{method:'POST',body:fd});if(!r.ok){const j=await r.json();toast(j.detail||'Upload failed','err');}else{toast('Uploaded','ok');refreshReplayFiles();}}catch(e){toast('Upload error','err');}input.value='';} async function startTeaching(b){const n=document.getElementById('teach-name').value,d=parseFloat(document.getElementById('teach-duration').value);if(!n)return toast('Enter name','err');btnLoad(b);try{await api('POST','/api/replay/teach/start',{name:n,duration_sec:d});toast('Teaching: '+n,'ok');pollTeachStatus();}catch(e){}btnDone(b);} -async function stopTeaching(b){btnLoad(b);try{const r=await api('POST','/api/replay/teach/stop');toast(`Saved: ${r.name} (${r.frames} frames)`,'ok');document.getElementById('teach-status').textContent=`Done: ${r.frames} frames`;refreshReplayFiles();populateGestureSelect();}catch(e){}btnDone(b);} +async function stopTeaching(b){btnLoad(b);try{const r=await api('POST','/api/replay/teach/stop');toast(`Saved: ${r.name} (${r.frames} frames)`,'ok');document.getElementById('teach-status').textContent=`Done: ${r.frames} frames`;refreshReplayFiles();}catch(e){}btnDone(b);} let _teachPoll;function pollTeachStatus(){clearInterval(_teachPoll);_teachPoll=setInterval(async()=>{try{const r=await api('GET','/api/replay/teach/status');document.getElementById('teach-status').textContent=`${r.phase} | ${r.elapsed_sec}s | ${r.frames_recorded} frames`;if(!r.recording){clearInterval(_teachPoll);refreshReplayFiles();}}catch(e){clearInterval(_teachPoll);}},500);} // Scripts @@ -1288,11 +1154,6 @@ async function trReplayLast(b){btnLoad(b);try{await api('POST','/api/typed-repla async function trSaveLast(b){btnLoad(b);try{await api('POST','/api/typed-replay/save-last',{record_name:document.getElementById('tr-name').value});toast('Saved','ok');refreshTR();refreshRecords();}catch(e){}btnDone(b);} async function refreshTR(){try{const r=await api('GET','/api/typed-replay/status');const s=r.session||{};document.getElementById('tr-session').innerHTML=`Text: ${esc(s.text||'--')}
Audio: ${s.has_audio?'Yes':'No'} | Capture: ${s.has_capture?'Yes':'No'}
Replays: ${s.replay_count||0}
Generated: ${s.generated_at||'--'}
Saved: ${esc(s.saved_as||'--')}`;}catch(e){}} -// YOLO -async function loadDetector(b){btnLoad(b);try{const r=await api('POST','/api/detector/load');const el=document.getElementById('yolo-status');el.textContent=r.ok?'Loaded':'Failed';el.className='badge '+(r.ok?'badge-ok':'badge-err');toast(r.ok?'Model loaded':'Failed',r.ok?'ok':'err');}catch(e){}btnDone(b);} -async function runDetection(b){btnLoad(b);try{const r=await api('POST','/api/detector/detect');document.getElementById('yolo-result').innerHTML=`Persons: ${r.person_count} | Faces: ${r.face_count} | Group: ${r.group_detected?'Yes ('+r.group_size+')':'No'} | Intent: ${r.intent_detected?'Yes':'No'} | ${r.detection_ms}ms`;}catch(e){document.getElementById('yolo-result').textContent='Detection failed';}btnDone(b);} -async function refreshDetector(){try{const r=await api('GET','/api/detector/status');const el=document.getElementById('yolo-status');el.textContent=r.loaded?'Loaded':'Not loaded';el.className='badge '+(r.loaded?'badge-ok':'badge-warn');}catch(e){}} - // Wake Phrases async function refreshWakeActions(){try{const r=await api('GET','/api/wake-phrases/');const sel=document.getElementById('wp-action');sel.innerHTML=''+(r.actions||[]).map(a=>``).join('');}catch(e){}} async function loadWakePhrases(action){if(!action)return;try{const r=await api('GET',`/api/wake-phrases/${encodeURIComponent(action)}`);const el=document.getElementById('wp-phrases');if(!(r.phrases||[]).length){el.innerHTML='
No phrases
';return;}el.innerHTML=(r.phrases||[]).map(p=>`
${esc(p)}
`).join('');}catch(e){}} @@ -1386,10 +1247,6 @@ async function refreshStatus(){try{const s=await api('GET','/api/status');docume // WebSocket logs let logWs;function connectLogs(){const p=location.protocol==='https:'?'wss':'ws';logWs=new WebSocket(`${p}://${location.host}/ws/logs`);const box=document.getElementById('log-box');logWs.onmessage=e=>{box.textContent+=e.data+'\n';if(box.childNodes.length>1000)box.textContent=box.textContent.split('\n').slice(-500).join('\n');box.scrollTop=box.scrollHeight;};logWs.onclose=()=>setTimeout(connectLogs,3000);} -// WebSocket camera -let camWs;function connectCamera(){if(camWs&&camWs.readyState<=1)try{camWs.close();}catch(e){}const p=location.protocol==='https:'?'wss':'ws';camWs=new WebSocket(`${p}://${location.host}/ws/camera`);camWs.binaryType='arraybuffer';const canvas=document.getElementById('camera-feed'),ctx=canvas.getContext('2d');camWs.onmessage=e=>{const url=URL.createObjectURL(new Blob([e.data],{type:'image/jpeg'})),img=new Image();img.onload=()=>{ctx.drawImage(img,0,0,canvas.width,canvas.height);URL.revokeObjectURL(url);};img.onerror=()=>URL.revokeObjectURL(url);img.src=url;};camWs.onclose=()=>setTimeout(connectCamera,3000);} -function reconnectCamera(){if(camWs)try{camWs.close();}catch(e){}setTimeout(connectCamera,300);toast('Camera reconnecting...','info');} - // Auto-connect Gemini and auto-start Live Subprocess on page load async function autoConnectGemini(){ try{ @@ -1413,10 +1270,331 @@ async function autoStartLiveSub(){ }catch(e){} } +// ── Recognition tab (camera vision + face recognition) ── +// Mirror of /api/recognition/state.vision_enabled — kept fresh by +// refreshRecognition() so the Live-Gemini-panel Camera button can flip +// it without a round-trip GET. +let _recVisionEnabled=false; +async function refreshRecognition(){ + try{ + const r=await api('GET','/api/recognition/state'); + _recVisionEnabled=!!r.vision_enabled; + const v=document.getElementById('rec-vision-toggle'); + const f=document.getElementById('rec-facerec-toggle'); + if(v) v.checked=!!r.vision_enabled; + if(f) f.checked=!!r.face_rec_enabled; + const cs=document.getElementById('rec-camera-status'); + if(cs){ + const c=r.camera||{}; + cs.title=c.error||''; + if(c.running&&c.backend){ + cs.textContent=c.backend+' '+(c.width||'')+'x'+(c.height||'') + +(c.reconnect_count?(' ↻'+c.reconnect_count):''); + cs.className='badge badge-ok'; + }else if(c.running&&!c.backend){ + // thread alive but between reconnect attempts (camera unplugged) + cs.textContent='reconnecting…';cs.className='badge badge-warn'; + }else if(c.error){ + cs.textContent='error';cs.className='badge badge-warn'; + }else{ + cs.textContent='off';cs.className='badge'; + } + } + // Camera button in the Live Gemini Process panel (Voice & Audio tab) — + // same toggle as the Recognition tab, surfaced where it's handy. + const cb=document.getElementById('ls-cam-btn'); + if(cb){ + const c=r.camera||{}; + if(c.running&&c.backend){ + cb.textContent='Camera: ON'; + cb.className='btn btn-sm btn-success'; + cb.title='Streaming '+(c.backend||'')+' '+(c.width||'')+'x'+(c.height||'')+' to Gemini — click to turn off'; + }else if(c.running&&!c.backend){ + cb.textContent='Camera: …'; + cb.className='btn btn-sm btn-ghost'; + cb.title='Camera reconnecting…'; + }else if(r.vision_enabled&&c.error){ + cb.textContent='Camera: N/A'; + cb.className='btn btn-sm btn-danger'; + cb.title='Vision on but no camera backend: '+(c.error||''); + }else{ + cb.textContent='Camera: OFF'; + cb.className='btn btn-sm btn-ghost'; + cb.title='Click to stream camera frames to Gemini Live'; + } + } + const fs=document.getElementById('rec-facerec-status'); + if(fs){fs.textContent=r.face_rec_enabled?'on':'off';fs.className='badge '+(r.face_rec_enabled?'badge-ok':'');} + const fc=document.getElementById('rec-faces-count'); + if(fc) fc.textContent=`(${r.faces_count} faces, ${r.photos_count} photos)`; + const gv=document.getElementById('rec-gallery-version'); + if(gv) gv.textContent='v.'+r.gallery_version; + // toggle preview visibility — only when actively capturing (has a backend) + const img=document.getElementById('rec-preview-img'); + const empty=document.getElementById('rec-preview-empty'); + const meta=document.getElementById('rec-preview-meta'); + const c2=r.camera||{}; + if(c2.running&&c2.backend){ + img.style.display='inline-block';empty.style.display='none'; + if(meta) meta.textContent=`${c2.width}x${c2.height} @ ${c2.fps}fps · seq=${c2.frame_seq}`; + }else{ + img.style.display='none';empty.style.display='block'; + if(empty) empty.textContent=(c2.running&&!c2.backend) + ? 'Camera reconnecting…' + : 'Camera off — toggle Vision ON to see the live feed.'; + if(meta) meta.textContent='--'; + } + // Highlight the active resolution / quality buttons to match the live + // capture profile (works whether the camera is running or idle). + document.querySelectorAll('#rec-res-buttons button').forEach(btn=>{ + const on = parseInt(btn.dataset.w)===c2.width + && parseInt(btn.dataset.h)===c2.height + && parseInt(btn.dataset.fps)===c2.fps; + btn.className='btn btn-sm '+(on?'btn-primary':'btn-ghost'); + }); + document.querySelectorAll('#rec-quality-buttons button').forEach(btn=>{ + const on = parseInt(btn.dataset.q)===c2.jpeg_quality; + btn.className='btn btn-sm '+(on?'btn-primary':'btn-ghost'); + }); + }catch(e){} +} +// Resolution / FPS button menu — each click POSTs one mode and the +// CameraDaemon rebuilds the pipeline at it. refreshRecognition() then +// highlights whichever button matches the live profile. +async function setCameraMode(btn){ + btnLoad(btn); + try{ + const body={ + width: parseInt(btn.dataset.w), + height: parseInt(btn.dataset.h), + fps: parseInt(btn.dataset.fps), + }; + const r=await api('POST','/api/recognition/camera-config',body); + const p=r.profile||body; + toast(`Camera → ${p.width}×${p.height} @ ${p.fps}fps`,'ok'); + refreshRecognition(); + }catch(e){toast('Resolution change failed: '+(e.message||e),'err');} + btnDone(btn); +} +async function setCameraQuality(btn){ + btnLoad(btn); + try{ + const q=parseInt(btn.dataset.q); + await api('POST','/api/recognition/camera-config',{jpeg_quality:q}); + toast('JPEG quality → '+q,'ok'); + refreshRecognition(); + }catch(e){toast('Quality change failed: '+(e.message||e),'err');} + btnDone(btn); +} +// Camera button in the Live Gemini Process panel — flips the same +// vision toggle the Recognition tab owns. _recVisionEnabled is the +// last-known state (refreshed every 5 s by refreshRecognition). +async function toggleGeminiCamera(b){ + if(b) btnLoad(b); + const next=!_recVisionEnabled; + try{ + const r=await api('POST','/api/recognition/vision?on='+(next?'1':'0')); + _recVisionEnabled=!!(r&&r.vision_enabled); + toast(next?'Camera ON for Gemini':'Camera OFF for Gemini','ok'); + }catch(e){ + toast('Camera toggle failed: '+(e.message||e),'err'); + } + if(b) btnDone(b); + refreshRecognition(); // refresh both the panel button + the Recognition tab +} +async function setVisionEnabled(on){ + try{ + const r=await api('POST','/api/recognition/vision?on='+(on?'1':'0')); + toast(on?'Vision ON':'Vision OFF','ok'); + refreshRecognition(); + }catch(e){ + toast('Vision toggle failed: '+(e.message||e),'err'); + refreshRecognition(); + } +} +async function setFaceRecEnabled(on){ + try{ + const r=await api('POST','/api/recognition/face-rec?on='+(on?'1':'0')); + toast(on?'Face Recognition ON':'Face Recognition OFF','ok'); + if(r&&r.warning) toast(r.warning,'info'); + refreshRecognition(); + }catch(e){ + toast('Face Rec toggle failed: '+(e.message||e),'err'); + refreshRecognition(); + } +} +async function syncGallery(b){ + if(b) btnLoad(b); + try{await api('POST','/api/recognition/sync');toast('Gallery sync requested','ok');refreshRecognition();} + catch(e){toast('Sync failed','err');} + if(b) btnDone(b); +} +// Preview poller — bumps the img src each tick to defeat caching. +let _recPreviewTimer=null; +function startRecPreview(){ + if(_recPreviewTimer) return; + const tick=()=>{ + const img=document.getElementById('rec-preview-img'); + if(img && img.style.display!=='none'){ + img.src='/api/recognition/frame.jpg?t='+Date.now(); + } + }; + tick(); + _recPreviewTimer=setInterval(tick,500); +} +function stopRecPreview(){if(_recPreviewTimer){clearInterval(_recPreviewTimer);_recPreviewTimer=null;}} +// Hook into tab switch — start/stop preview when recognition tab is active. +(function(){ + const origSwitchTab=window.switchTab; + window.switchTab=function(name){ + origSwitchTab(name); + if(name==='recognition'){refreshRecognition();refreshFaces();startRecPreview();} + else{stopRecPreview();} + }; +})(); +// Face CRUD stubs — filled in milestone 5 +async function refreshFaces(){ + const el=document.getElementById('rec-faces-list'); + if(!el) return; + try{ + const r=await api('GET','/api/recognition/faces'); + if(!r.faces||!r.faces.length){el.innerHTML='
No faces enrolled yet
';return;} + el.innerHTML=r.faces.map(f=>renderFaceCard(f)).join(''); + }catch(e){ + el.innerHTML='
(face gallery not yet wired)
'; + } +} +function renderFaceCard(f){ + const name=f.name||`(face_${f.id})`; + const photos=(f.photos||[]).map(p=>{ + const url=`/api/recognition/faces/${f.id}/photo/${encodeURIComponent(p.name)}`; + return `
+ ${esc(p.name)} +
+ + 🗑 +
+
`; + }).join(''); + return `
+
+ face_${f.id} + + ${esc(name)} + + ${(f.photos||[]).length} photo(s) +
+
+ Description: + ${f.description?esc(f.description):''}${f.description?'':'(none — no extra context for Gemini)'} + +
+
${photos}
+
+ + + ⬇ ZIP + +
+
`; +} +// Build the ?name=&description= query string from the Add-New-Face inputs. +function _newFaceQuery(){ + const name=document.getElementById('rec-newface-name').value.trim(); + const desc=document.getElementById('rec-newface-desc').value.trim(); + const qs=[]; + if(name) qs.push('name='+encodeURIComponent(name)); + if(desc) qs.push('description='+encodeURIComponent(desc)); + return qs.length?('?'+qs.join('&')):''; +} +function _clearNewFaceInputs(){ + document.getElementById('rec-newface-name').value=''; + document.getElementById('rec-newface-desc').value=''; +} +async function enrollFromCamera(b){ + btnLoad(b); + try{ + const r=await api('POST','/api/recognition/faces/enroll'+_newFaceQuery()); + toast('Enrolled face_'+r.face.id+(r.face.description?' (with description)':''),'ok'); + _clearNewFaceInputs(); + refreshFaces();refreshRecognition(); + }catch(e){toast('Enroll failed: '+(e.message||e),'err');} + btnDone(b); +} +async function enrollFromUpload(input){ + const files=input.files;if(!files||!files.length)return; + const fd=new FormData();for(const f of files) fd.append('files',f); + try{ + const resp=await fetch('/api/recognition/faces/upload'+_newFaceQuery(),{method:'POST',body:fd}); + if(!resp.ok)throw new Error(await resp.text()); + const r=await resp.json(); + toast('Uploaded face_'+r.face.id+' ('+files.length+' photos'+(r.face.description?', with description':'')+')','ok'); + _clearNewFaceInputs(); + input.value=''; + refreshFaces();refreshRecognition(); + }catch(e){toast('Upload failed: '+(e.message||e),'err');} +} +async function captureToFace(id,b){ + btnLoad(b); + try{await api('POST','/api/recognition/faces/'+id+'/capture');toast('Added photo','ok');refreshFaces();} + catch(e){toast('Capture failed','err');} + btnDone(b); +} +async function uploadToFace(id,input){ + const files=input.files;if(!files||!files.length)return; + const fd=new FormData();for(const f of files) fd.append('files',f); + try{ + const resp=await fetch('/api/recognition/faces/'+id+'/upload',{method:'POST',body:fd}); + if(!resp.ok)throw new Error(await resp.text()); + toast('Uploaded '+files.length+' photo(s)','ok'); + input.value=''; + refreshFaces(); + }catch(e){toast('Upload failed: '+(e.message||e),'err');} +} +async function renameFace(id){ + const el=document.getElementById('rec-name-'+id);if(!el)return; + const cur=el.textContent.replace(/^\((.*)\)$/,'$1'); + const next=prompt('New name (blank to clear):',cur==='face_'+id?'':cur); + if(next===null) return; + try{ + await api('POST','/api/recognition/faces/'+id+'/rename',{name:next}); + toast('Renamed','ok');refreshFaces(); + }catch(e){toast('Rename failed','err');} +} +async function describeFace(id){ + const el=document.getElementById('rec-desc-'+id); + const cur=el?el.textContent.trim():''; + const next=prompt('Description for Gemini — who is this person? '+ + '(blank to clear)',cur); + if(next===null) return; + try{ + await api('POST','/api/recognition/faces/'+id+'/describe',{description:next}); + toast(next.trim()?'Description saved':'Description cleared','ok'); + refreshFaces(); + }catch(e){toast('Save failed: '+(e.message||e),'err');} +} +async function deletePhoto(id,name){ + if(!confirm('Delete photo '+name+'?'))return; + try{ + await api('DELETE','/api/recognition/faces/'+id+'/photo/'+encodeURIComponent(name)); + toast('Photo deleted','ok');refreshFaces(); + }catch(e){toast('Delete failed: '+(e.message||e),'err');} +} +async function deleteFace(id){ + if(!confirm('Delete face_'+id+' and all photos?'))return; + try{ + await api('DELETE','/api/recognition/faces/'+id); + toast('Face deleted','ok');refreshFaces();refreshRecognition(); + }catch(e){toast('Delete failed','err');} +} + // Init — vision/camera/detector fetches removed; those endpoints were deleted. -refreshStatus();refreshSystem();refreshAudio();refreshAudioDevices();refreshSkills();refreshReplayFiles();refreshScripts();refreshPrompt();refreshRecords();populateGestureSelect();refreshLiveVoice();refreshLiveSub();refreshTR();refreshWakeActions();refreshApiKey();refreshCombo();connectLogs(); +refreshStatus();refreshSystem();refreshAudio();refreshAudioDevices();refreshSkills();refreshReplayFiles();refreshScripts();refreshPrompt();refreshRecords();refreshLiveVoice();refreshLiveSub();refreshTR();refreshWakeActions();refreshApiKey();refreshCombo();refreshRecognition();connectLogs(); setTimeout(autoConnectGemini,2000);setTimeout(autoStartLiveSub,3000); -setInterval(refreshStatus,5000);setInterval(refreshSystem,30000);setInterval(refreshLiveVoice,5000);setInterval(refreshLiveSub,5000); +setInterval(refreshStatus,5000);setInterval(refreshSystem,30000);setInterval(refreshLiveVoice,5000);setInterval(refreshLiveSub,5000);setInterval(refreshRecognition,5000); diff --git a/gemini/script.py b/gemini/script.py index 768b668..0eb5130 100644 --- a/gemini/script.py +++ b/gemini/script.py @@ -19,8 +19,13 @@ from __future__ import annotations import array import asyncio +import base64 +import json import os +import sys +import threading import time +from pathlib import Path from typing import Any, Optional import numpy as np @@ -29,6 +34,7 @@ from google import genai from google.genai import types from Project.Sanad.config import ( + BASE_DIR, CHUNK_SIZE, GEMINI_API_KEY, GEMINI_VOICE, @@ -37,6 +43,7 @@ from Project.Sanad.config import ( ) from Project.Sanad.core.config_loader import section as _cfg_section from Project.Sanad.core.logger import get_logger +from Project.Sanad.vision import recognition_state as _recog_state log = get_logger("gemini_brain") @@ -57,6 +64,93 @@ _NO_MESSAGES_TIMEOUT = _SV.get("no_messages_timeout_sec", 30) _CHUNK_BYTES = CHUNK_SIZE * 2 _SILENCE_PCM = b"\x00" * _CHUNK_BYTES +# ── Recognition (camera + face gallery) tunables ── +_RECOG_STATE_PATH = Path(os.environ.get( + "SANAD_RECOGNITION_STATE_PATH", + str(BASE_DIR / "data" / ".recognition_state.json"), +)) +_VISION_SEND_HZ = float(os.environ.get("SANAD_VISION_SEND_HZ", "2")) +_VISION_STALE_MS = int(os.environ.get("SANAD_VISION_STALE_MS", "1500")) +_RECOG_POLL_S = float(os.environ.get("SANAD_RECOGNITION_POLL_S", "1.0")) +_FACES_DIR = Path(os.environ.get( + "SANAD_FACES_DIR", + str(BASE_DIR / "data" / "faces"), +)) +_FACES_MAX_SAMPLES = int(os.environ.get("SANAD_FACES_MAX_SAMPLES", "3")) +_FACES_PRIMER_RESIZE = int(os.environ.get("SANAD_FACES_PRIMER_RESIZE", "256")) + + +# ── stdin push channel (Marcus pattern) ────────────────────── +# The GeminiSubprocess supervisor writes two line types to this process's +# stdin: +# "frame:\n" — a camera frame to relay to Gemini Live +# "state:\n" — a motion-state update to inject as text +# A daemon thread parses them into the caches below; the asyncio tasks +# _send_frame_loop / _send_state_loop drain those caches. + +_LATEST_FRAME_LOCK = threading.Lock() +_LATEST_FRAME: dict = {"bytes": None, "ts": 0.0} + +_STATE_LOCK = threading.Lock() +_STATE_PENDING: list[str] = [] + +_STATE_TAGS = { + "start": "[STATE-START]", + "complete": "[STATE-DONE]", + "interrupted": "[STATE-INTERRUPTED]", + "error": "[STATE-ERROR]", + "paused": "[STATE-PAUSED]", + "resumed": "[STATE-RESUMED]", +} + + +def _stdin_watcher() -> None: + """Daemon thread — parse 'frame:' / 'state:' lines off stdin. + + Best-effort: any malformed line is skipped. Exits when the parent + closes our stdin (subprocess teardown).""" + try: + for line in sys.stdin: + line = line.rstrip("\n") + if not line: + continue + if line.startswith("frame:"): + b64 = line[len("frame:"):] + try: + data = base64.b64decode(b64) + except Exception: + continue + if data: + with _LATEST_FRAME_LOCK: + _LATEST_FRAME["bytes"] = data + _LATEST_FRAME["ts"] = time.time() + elif line.startswith("state:"): + try: + payload = json.loads(line[len("state:"):]) + except Exception: + continue + event = (payload.get("event") or "").strip().lower() + cmd = (payload.get("cmd") or "").strip() + tag = _STATE_TAGS.get(event) + if not tag or not cmd: + continue + msg = f"{tag} {cmd}" + elapsed = payload.get("elapsed_sec") + if isinstance(elapsed, (int, float)): + msg += f" ({float(elapsed):.1f}s)" + reason = payload.get("reason") + if reason and event == "error": + msg += f" — {reason}" + with _STATE_LOCK: + _STATE_PENDING.append(msg) + except Exception: + return + + +# Start the watcher at import time — it blocks harmlessly on sys.stdin +# until the supervisor sends something. Daemon so it never blocks exit. +threading.Thread(target=_stdin_watcher, daemon=True, name="stdin-watcher").start() + def _audio_energy(pcm: bytes) -> int: try: @@ -86,6 +180,19 @@ class GeminiBrain: self._ai_speak_start = 0.0 self._last_ai_audio = 0.0 self._done: Optional[asyncio.Event] = None + # ── Recognition flags — kept in sync with the state file by + # _recognition_state_watcher. Boot defaults come from the file (or + # the SANAD_* env vars if the file is missing). + _initial = _recog_state.read(_RECOG_STATE_PATH) + self._vision_enabled = bool( + _initial.vision_enabled + or os.environ.get("SANAD_VISION_ENABLE", "0") == "1" + ) + self._face_rec_enabled = bool( + _initial.face_rec_enabled + or os.environ.get("SANAD_FACE_RECOGNITION_ENABLE", "0") == "1" + ) + self._gallery_version_primed = -1 # bumped after first successful primer def stop(self) -> None: """Signal the run loop to exit at the next opportunity.""" @@ -116,12 +223,19 @@ class GeminiBrain: consecutive_errors = 0 self._mic.flush() self._done = asyncio.Event() + # Reset per-session primer state so re-priming on reconnect + # actually happens. The state watcher will re-prime as soon + # as it sees vision+face-rec enabled. + self._gallery_version_primed = -1 try: await asyncio.wait_for( asyncio.gather( self._send_mic_loop(session), self._receive_loop(session), + self._send_frame_loop(session), + self._send_state_loop(session), + self._recognition_state_watcher(session), ), timeout=_SESSION_TIMEOUT, ) @@ -368,3 +482,310 @@ class GeminiBrain: log.warning("receive ended: %s", exc) finally: self._done.set() + + # ─── vision-state announcer ─────────────────────────── + # Injects the camera state into the live session as text context. + # On a live toggle Gemini is told to say so out loud ("I can see you + # now" / "I can't see you anymore"); at session start it's silent + # standing context so "can you see me?" is answered honestly. + + async def _announce_vision_state(self, session: Any, enabled: bool, + is_toggle: bool) -> None: + if is_toggle and enabled: + text = ( + "[VISION ON] Your camera was just enabled — you can now see " + "the user through it. Briefly tell them you can see them now, " + "in your normal Khaleeji style (for example: " + "'هلا، الحين أشوفك زين')." + ) + elif is_toggle and not enabled: + text = ( + "[VISION OFF] Your camera was just disabled — you can no " + "longer see anything. Briefly tell the user you can't see " + "them anymore. If they later ask whether you can see them, " + "tell them to enable the camera from the dashboard." + ) + elif enabled: # session start, camera already on + text = ( + "[VISION STATUS] Your camera is ON — you can see the user " + "through it. Do not announce this unprompted; just answer " + "naturally if they ask what you see." + ) + else: # session start, camera off + text = ( + "[VISION STATUS] Your camera is OFF — you cannot see anything " + "right now. If the user asks whether you can see them, tell " + "them to enable the camera from the dashboard. Do not announce " + "this unprompted." + ) + try: + await session.send_realtime_input(text=text) + log.info("vision-state injected (enabled=%s, toggle=%s)", + enabled, is_toggle) + except asyncio.CancelledError: + raise + except Exception as exc: + log.warning("vision-state inject failed: %s", exc) + + # ─── face-recognition-state announcer ───────────────── + # Same idea as _announce_vision_state, for the face-recognition toggle. + # On a live OFF toggle it also tells Gemini to disregard the gallery — + # so OFF takes effect immediately instead of lingering until reconnect. + + async def _announce_facerec_state(self, session: Any, enabled: bool, + is_toggle: bool) -> None: + if is_toggle and enabled: + text = ( + "[FACE RECOGNITION ON] Face recognition was just enabled — " + "you'll be shown the people you know in a moment. Briefly " + "tell the user you can now recognise the people you know, in " + "your normal Khaleeji style." + ) + elif is_toggle and not enabled: + text = ( + "[FACE RECOGNITION OFF] Face recognition was just disabled. " + "Disregard the face gallery you were given earlier — stop " + "greeting people by name and do not identify anyone. Briefly " + "tell the user you'll no longer recognise faces." + ) + elif enabled: # session start, face rec already on + text = ( + "[FACE RECOGNITION STATUS] Face recognition is ON — when you " + "see someone you've been shown in the gallery, greet them by " + "name. Do not announce this unprompted." + ) + else: # session start, face rec off + text = ( + "[FACE RECOGNITION STATUS] Face recognition is OFF — you " + "cannot identify people. If the user asks who someone is or " + "whether you recognise them, tell them to enable face " + "recognition from the dashboard. Do not announce this " + "unprompted." + ) + try: + await session.send_realtime_input(text=text) + log.info("face-rec-state injected (enabled=%s, toggle=%s)", + enabled, is_toggle) + except asyncio.CancelledError: + raise + except Exception as exc: + log.warning("face-rec-state inject failed: %s", exc) + + # ─── recognition state watcher ──────────────────────── + # Polls data/.recognition_state.json at SANAD_RECOGNITION_POLL_S Hz and + # mirrors vision_enabled / face_rec_enabled into in-memory flags so the + # rest of the session can react WITHOUT a Gemini reconnect. + + async def _recognition_state_watcher(self, session: Any) -> None: + last_mtime = 0.0 + last_state = _recog_state.RecognitionState( + vision_enabled=self._vision_enabled, + face_rec_enabled=self._face_rec_enabled, + gallery_version=self._gallery_version_primed, + ) + # Best-effort initial primer if face_rec is already on at session start. + if self._face_rec_enabled and self._vision_enabled: + try: + cur = _recog_state.read(_RECOG_STATE_PATH) + await self._send_gallery_primer(session, cur.gallery_version) + except Exception as exc: + log.warning("initial gallery primer failed: %s", exc) + + # Tell Gemini the current camera + face-recognition state at session + # start — silent standing context so "can you see me?" / "do you know + # who I am?" are answered honestly even if the user never toggles. + await self._announce_vision_state( + session, self._vision_enabled, is_toggle=False, + ) + await self._announce_facerec_state( + session, self._face_rec_enabled, is_toggle=False, + ) + + while not self._done.is_set() and not self._stop_flag.is_set(): + await asyncio.sleep(_RECOG_POLL_S) + try: + st = _RECOG_STATE_PATH.stat() + except FileNotFoundError: + continue + except Exception: + continue + if st.st_mtime == last_mtime: + continue + last_mtime = st.st_mtime + new_state = _recog_state.read(_RECOG_STATE_PATH) + + # Vision toggle — instant. Announce it out loud so Gemini reacts + # ("I can see you now" / "I can't see you anymore"). + if new_state.vision_enabled != last_state.vision_enabled: + self._vision_enabled = new_state.vision_enabled + log.info("vision toggled → %s", self._vision_enabled) + await self._announce_vision_state( + session, self._vision_enabled, is_toggle=True, + ) + + # Face-rec toggle — announce it out loud. The OFF announcement + # also tells Gemini to disregard the gallery, so OFF takes effect + # immediately instead of lingering until the next reconnect. + if new_state.face_rec_enabled != last_state.face_rec_enabled: + self._face_rec_enabled = new_state.face_rec_enabled + if self._face_rec_enabled: + log.info("face rec enabled — announcing + sending primer") + else: + log.info("face rec disabled — telling Gemini to " + "disregard the gallery") + await self._announce_facerec_state( + session, self._face_rec_enabled, is_toggle=True, + ) + + # Conditions for re-priming: + # - face_rec just turned ON (no_face_rec_before) + # - gallery version bumped since the last primer + face_rec_just_on = ( + new_state.face_rec_enabled and not last_state.face_rec_enabled + ) + gallery_changed = ( + new_state.gallery_version != self._gallery_version_primed + ) + if (self._face_rec_enabled + and (face_rec_just_on or gallery_changed) + and self._vision_enabled): + try: + await self._send_gallery_primer( + session, new_state.gallery_version, + ) + except Exception as exc: + log.warning("gallery primer failed: %s", exc) + + last_state = new_state + + # ─── camera frame send loop ─────────────────────────── + # Reads the latest JPEG from the _LATEST_FRAME cache (fed by the + # _stdin_watcher thread, which the GeminiSubprocess supervisor pushes + # 'frame:' lines into) and relays it to Gemini Live at + # _VISION_SEND_HZ. Only active when self._vision_enabled. Skips frames + # older than _VISION_STALE_MS so a stopped/unplugged camera doesn't + # waste tokens on a frozen scene. + + async def _send_frame_loop(self, session: Any) -> None: + period = 1.0 / max(0.5, _VISION_SEND_HZ) + stale_s = _VISION_STALE_MS / 1000.0 + backoff = 0.0 + last_sent_ts = 0.0 + + while not self._done.is_set() and not self._stop_flag.is_set(): + await asyncio.sleep(max(period, backoff)) + if not self._vision_enabled: + continue + with _LATEST_FRAME_LOCK: + data = _LATEST_FRAME.get("bytes") + ts = _LATEST_FRAME.get("ts", 0.0) + if not data: + continue + # Stale — supervisor stopped pushing (camera off / unplugged). + if (time.time() - ts) > stale_s: + continue + # De-dup — don't re-send a frame we already relayed. + if ts == last_sent_ts: + continue + try: + await session.send_realtime_input( + video=types.Blob(data=data, mime_type="image/jpeg"), + ) + last_sent_ts = ts + backoff = 0.0 + except asyncio.CancelledError: + return + except Exception as exc: + log.warning("frame send failed: %s", exc) + backoff = min(backoff * 2 + 0.5, 5.0) + + # ─── motion-state inject loop ───────────────────────── + # Drains _STATE_PENDING (fed by the _stdin_watcher from 'state:' lines + # the supervisor pushes when the arm starts/finishes/errors a motion) + # and injects each as silent text context into the live session, so + # Gemini can answer "what are you doing?" honestly. Per persona, Gemini + # reads these for context but does not narrate them unprompted. + + async def _send_state_loop(self, session: Any) -> None: + while not self._done.is_set() and not self._stop_flag.is_set(): + await asyncio.sleep(0.1) + with _STATE_LOCK: + if not _STATE_PENDING: + continue + pending = list(_STATE_PENDING) + _STATE_PENDING.clear() + for msg in pending: + try: + await session.send_realtime_input(text=msg) + log.info("STATE injected: %s", msg) + except asyncio.CancelledError: + return + except Exception as exc: + # Some SDK versions may not accept text on + # send_realtime_input — log once-ish and keep going; + # motion still works, only this context channel is lost. + log.warning("state inject failed: %s", exc) + + # ─── face gallery primer ────────────────────────────── + # Builds one multimodal turn carrying the entire face gallery + a Khaleeji + # greeting instruction, and sends it via send_client_content. Gemini keeps + # this in session context until reconnect. Re-sent on gallery_version bumps. + + async def _send_gallery_primer(self, session: Any, version: int) -> None: + try: + from Project.Sanad.vision.face_gallery import FaceGallery + except Exception as exc: + log.info("face gallery module unavailable: %s", exc) + return + + gallery = FaceGallery(_FACES_DIR) + try: + entries = gallery.load_for_primer( + max_samples_per_face=_FACES_MAX_SAMPLES, + resize_long_side=_FACES_PRIMER_RESIZE, + ) + except Exception as exc: + log.warning("face gallery load failed: %s", exc) + return + + if not entries: + log.info("face gallery empty — primer skipped (v.%d)", version) + self._gallery_version_primed = version + return + + parts: list[dict[str, Any]] = [{ + "text": ( + "GALLERY PRIMER (do not reply to this turn). " + "Below are people you know. When the live camera shows one of " + "them, greet them warmly by name in UAE Khaleeji dialect " + "(for example: 'هلا والله يا كسام، شحالك؟'), and you may use " + "the notes about them to make the conversation personal. " + "For faces NOT in this gallery, welcome them as a guest " + "without inventing a name. Greet each person only once per " + "minute to avoid repetition." + ), + }] + for entry, jpegs in entries: + label = ( + f"This person is named {entry.name}." + if entry.name + else "This person's name is unknown — greet as guest." + ) + if entry.description: + label += f" Notes about them: {entry.description}" + parts.append({"text": f"\n— {label}"}) + for jpeg in jpegs: + parts.append({ + "inline_data": {"mime_type": "image/jpeg", "data": jpeg}, + }) + + try: + await session.send_client_content( + turns=[{"role": "user", "parts": parts}], + turn_complete=True, + ) + except Exception as exc: + log.warning("primer send failed: %s", exc) + return + self._gallery_version_primed = version + log.info("face gallery primed: %d person(s), v.%d", len(entries), version) diff --git a/gemini/subprocess.py b/gemini/subprocess.py index 0a16fcf..2bafb43 100644 --- a/gemini/subprocess.py +++ b/gemini/subprocess.py @@ -10,15 +10,16 @@ When a new model is added, build its own sibling supervisor (see from __future__ import annotations +import base64 +import json import os import signal import subprocess import sys import threading -import time from collections import deque from datetime import datetime -from typing import Any +from typing import Any, Optional, Union from pathlib import Path @@ -30,6 +31,11 @@ log = get_logger("gemini_subprocess") _LS_CFG = _cfg_section("gemini", "subprocess") +# Camera frame forwarding — push the latest JPEG to the child over stdin +# at this interval (seconds). 0.5 s ≈ 2 fps, matching the child's +# SANAD_VISION_SEND_HZ default. The child de-stales + relays to Gemini. +_FRAME_FORWARD_INTERVAL_S = float(_LS_CFG.get("frame_forward_interval_sec", 0.5)) + def _resolve_live_script() -> Path: """Locate the voice script to run as subprocess. @@ -82,6 +88,22 @@ class GeminiSubprocess: self.state_message = "Idle." self.last_user_text = "" self.suppressed_noise = 0 + # ── stdin push channel (camera frames + motion state) ── + # The child (gemini/script.py) reads "frame:\n" and + # "state:\n" lines off its stdin. Writes are serialised + # because the frame forwarder and the motion-state bus handler + # both call from different threads. + self._stdin_lock = threading.Lock() + self._camera = None # set via attach_camera() + self._frame_thread: threading.Thread | None = None + self._frame_stop = threading.Event() + + # ── camera attach (called once from main.py) ────────────── + + def attach_camera(self, camera) -> None: + """Give the supervisor a reference to the CameraDaemon so it can + forward frames to the child over stdin while a session runs.""" + self._camera = camera def _open_session_log(self, pid: int): """Open (or re-open) the per-day append log file for this session.""" @@ -214,6 +236,7 @@ class GeminiSubprocess: proc = subprocess.Popen( cmd, cwd=str(script.parent), + stdin=subprocess.PIPE, # camera frames + motion state push stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, @@ -221,16 +244,106 @@ class GeminiSubprocess: env=env, ) + # Reap any stale frame forwarder from a previous session that ended + # by a child crash rather than a clean stop() — otherwise it keeps + # spinning on a dead pipe and we'd leak a thread per restart. + stale_ft = self._frame_thread + if stale_ft is not None and stale_ft.is_alive(): + self._frame_stop.set() + stale_ft.join(timeout=2.0) + with self._lock: self.process = proc self.log_tail.append(f"Started: pid={proc.pid}") self._set_state("starting", f"pid={proc.pid}") self._reader_thread = threading.Thread(target=self._reader_loop, daemon=True) self._reader_thread.start() + # Frame forwarder — pushes camera JPEGs to the child over stdin. + self._frame_stop.clear() + self._frame_thread = threading.Thread( + target=self._frame_forwarder, daemon=True, name="gemini-frame-fwd", + ) + self._frame_thread.start() log.info("Live Gemini subprocess started: pid=%d", proc.pid) return {"started": True, "pid": proc.pid} + # ── stdin push channel ──────────────────────────────────── + + def _send_stdin(self, line: str) -> None: + """Serialised stdin write — frame forwarder + motion-state handler + both call this from different threads. Best-effort: a closed pipe + or a not-yet-started process is a silent no-op.""" + proc = self.process + if proc is None or proc.stdin is None: + return + try: + with self._stdin_lock: + if not proc.stdin.closed: + proc.stdin.write(line) + proc.stdin.flush() + except Exception: + # Pipe broke (child exited) — drop silently; the reader thread + # will surface the exit via state="stopped". + pass + + def send_frame(self, jpeg: Union[bytes, str]) -> None: + """Forward one camera frame to the child as 'frame:\\n'. + + Accepts raw JPEG bytes (base64-encoded here) or an already-base64 + ASCII string (e.g. CameraDaemon.get_frame_b64() — no re-encode).""" + if isinstance(jpeg, bytes): + b64 = base64.b64encode(jpeg).decode("ascii") + elif isinstance(jpeg, str): + b64 = jpeg.strip() + else: + return + if b64: + self._send_stdin("frame:" + b64 + "\n") + + def send_state(self, event: str, cmd: str, + elapsed_sec: Optional[float] = None, + reason: Optional[str] = None) -> None: + """Push a motion-state update to the child as 'state:\\n'. + + Events: start | complete | interrupted | error. The child injects + '[STATE-...] ' into the live Gemini session as silent text + context so Gemini can answer "what are you doing?" honestly.""" + if not event or not cmd: + return + payload: dict[str, Any] = {"event": event, "cmd": cmd} + if elapsed_sec is not None: + payload["elapsed_sec"] = round(float(elapsed_sec), 2) + if reason: + payload["reason"] = str(reason)[:200] + try: + line = "state:" + json.dumps(payload, ensure_ascii=False) + "\n" + except Exception: + return + self._send_stdin(line) + + def _frame_forwarder(self) -> None: + """Background thread — push the camera's latest frame to the child. + + Runs for the lifetime of one subprocess session. Gated on the + camera actually running; the child does its own vision-enabled + + staleness checks, so this stays dumb (camera up → push).""" + cam = self._camera + if cam is None: + return + while not self._frame_stop.is_set(): + if self._frame_stop.wait(_FRAME_FORWARD_INTERVAL_S): + break + try: + if not cam.is_running(): + continue + b64 = cam.get_frame_b64() + if b64: + self.send_frame(b64) + except Exception: + # Best-effort — never let a frame hiccup kill the thread. + pass + def stop(self) -> dict[str, Any]: with self._lock: proc = self.process @@ -238,6 +351,13 @@ class GeminiSubprocess: return {"stopped": False, "message": "Not running."} self._set_state("stopping", "Stopping...") + # Halt the frame forwarder before we tear the pipe down. + self._frame_stop.set() + ft = self._frame_thread + if ft is not None: + ft.join(timeout=2.0) + self._frame_thread = None + try: proc.send_signal(signal.SIGINT) proc.wait(timeout=_STOP_TIMEOUT_SEC) @@ -251,6 +371,16 @@ class GeminiSubprocess: rc = proc.returncode + # Close stdin/stdout explicitly — without this each start/stop + # cycle leaks FDs (relied on Popen.__del__ which only runs at GC; + # a reconnect loop would march the FD count to the OS limit). + for pipe in (getattr(proc, "stdin", None), getattr(proc, "stdout", None)): + if pipe is not None: + try: + pipe.close() + except Exception: + pass + with self._lock: self.process = None self.log_tail.append("Stopped.") diff --git a/main.py b/main.py index a3c1884..0f16102 100644 --- a/main.py +++ b/main.py @@ -106,6 +106,8 @@ TypedReplayEngine = _safe_import("TypedReplayEngine", lambda: __import__(" GeminiVoiceClient = _safe_import("GeminiVoiceClient", lambda: __import__("Project.Sanad.gemini.client", fromlist=["GeminiVoiceClient"]).GeminiVoiceClient) GeminiSubprocess = _safe_import("GeminiSubprocess", lambda: __import__("Project.Sanad.gemini.subprocess", fromlist=["GeminiSubprocess"]).GeminiSubprocess) LocalSubprocess = _safe_import("LocalSubprocess", lambda: __import__("Project.Sanad.local.subprocess", fromlist=["LocalSubprocess"]).LocalSubprocess) +CameraDaemon = _safe_import("CameraDaemon", lambda: __import__("Project.Sanad.vision.camera", fromlist=["CameraDaemon"]).CameraDaemon) +FaceGallery = _safe_import("FaceGallery", lambda: __import__("Project.Sanad.vision.face_gallery", fromlist=["FaceGallery"]).FaceGallery) # ── global instances (imported by route modules) ── @@ -131,6 +133,107 @@ else: live_sub = _safe_construct("live_sub", GeminiSubprocess) typed_replay = _safe_construct("typed_replay", (lambda: TypedReplayEngine(voice_client, audio_mgr)) if (TypedReplayEngine and voice_client and audio_mgr) else None) +# ── Recognition (camera + face gallery) ───────────────────────────────────── +# Camera is idle until the dashboard toggles vision on; face gallery is pure +# file IO and always available if the import succeeded. +# +# Config precedence (highest first): explicit env var → config/core_config.json +# section → hardcoded default. The parent process normally has no SANAD_CAMERA_* +# env vars (LIVE_TUNE is only forwarded to the Gemini child), so in practice the +# core_config.json `camera` / `faces` sections are the live source here. +def _build_camera(): + from Project.Sanad.core.config_loader import section as _cfg_section + cam_cfg = _cfg_section("core", "camera") + + def _knob(env_key: str, cfg_key: str, default): + env_val = os.environ.get(env_key) + if env_val is not None and env_val != "": + return type(default)(env_val) + return type(default)(cam_cfg.get(cfg_key, default)) + + # Frames are cached in memory and pushed to the Gemini child over its + # stdin (see GeminiSubprocess._frame_forwarder) — no file drop. + return CameraDaemon( + width=_knob("SANAD_CAMERA_WIDTH", "width", 424), + height=_knob("SANAD_CAMERA_HEIGHT", "height", 240), + fps=_knob("SANAD_CAMERA_FPS", "fps", 15), + jpeg_quality=_knob("SANAD_CAMERA_JPEG_QUALITY", "jpeg_quality", 70), + stale_threshold_s=float(cam_cfg.get("stale_threshold_s", 10.0)), + reconnect_min_s=float(cam_cfg.get("reconnect_min_s", 2.0)), + reconnect_max_s=float(cam_cfg.get("reconnect_max_s", 10.0)), + capture_timeout_ms=int(cam_cfg.get("capture_timeout_ms", 5000)), + ) + +def _build_gallery(): + from Project.Sanad.config import BASE_DIR + from Project.Sanad.core.config_loader import section as _cfg_section + faces_cfg = _cfg_section("core", "faces") + # SANAD_FACES_DIR is set absolute by LIVE_TUNE (the Gemini child reads the + # same var). In the parent it's usually unset → fall back to the JSON's + # dir_rel, then the hardcoded default. Honour absolute paths as-is. + raw = os.environ.get("SANAD_FACES_DIR") or faces_cfg.get("dir_rel", "data/faces") + p = Path(raw) + root = p if p.is_absolute() else (BASE_DIR / raw) + return FaceGallery(root) + +camera = _safe_construct("camera", _build_camera if CameraDaemon else None) +gallery = _safe_construct("gallery", _build_gallery if FaceGallery else None) + +# Restore persisted vision_enabled at boot — start camera if the user left +# it on across a reboot. Face-rec state is read by the Gemini child directly. +try: + from Project.Sanad.vision import recognition_state as _recog_state + from Project.Sanad.config import BASE_DIR as _BD + _state = _recog_state.read(_BD / "data" / ".recognition_state.json") + if _state.vision_enabled and camera is not None: + if camera.start(): + log.info("Camera vision restored from state (backend=%s)", camera.backend) + else: + log.warning("Camera vision was ON but no backend available — leaving OFF") + _recog_state.mutate(_BD / "data" / ".recognition_state.json", + vision_enabled=False) +except Exception: + log.exception("Could not restore recognition state") + +# Hand the camera to the Gemini supervisor so it can forward frames to the +# child over stdin while a live session runs. +if live_sub is not None and camera is not None: + try: + if hasattr(live_sub, "attach_camera"): + live_sub.attach_camera(camera) + log.info("Camera attached to live subprocess supervisor") + except Exception: + log.exception("attach_camera failed") + +# ── Motion-state → Gemini channel ─────────────────────────────────────────── +# The arm controller emits motion.action_started / _done / _error on the bus. +# Forward each to the Gemini child as a 'state:' line so the live session can +# answer "what are you doing?" honestly. Sync handlers, fired via emit_sync +# from the arm's worker thread — send_state just writes to a pipe (cheap). +if live_sub is not None and hasattr(live_sub, "send_state"): + try: + from Project.Sanad.core.event_bus import bus as _bus + + def _on_motion_started(action: str = "", **_kw): + live_sub.send_state("start", action) + + def _on_motion_done(action: str = "", elapsed_sec=None, + failed: bool = False, **_kw): + # action_error already covered the failure case with a reason; + # here just emit complete (skip if it failed to avoid a dup). + if not failed: + live_sub.send_state("complete", action, elapsed_sec=elapsed_sec) + + def _on_motion_error(action: str = "", reason: str = "", **_kw): + live_sub.send_state("error", action, reason=reason) + + _bus.on("motion.action_started", _on_motion_started) + _bus.on("motion.action_done", _on_motion_done) + _bus.on("motion.action_error", _on_motion_error) + log.info("Motion-state → Gemini channel wired") + except Exception: + log.exception("Could not wire motion-state → Gemini channel") + # Wire everything into the Brain (only what was constructed) def _safe_attach(method_name: str, value): if brain is None or value is None: @@ -166,6 +269,8 @@ SUBSYSTEMS = { "live_voice": live_voice, "live_sub": live_sub, "typed_replay": typed_replay, + "camera": camera, + "gallery": gallery, } # Critical subsystems — if any of these are None, log a warning at startup @@ -220,6 +325,13 @@ def _do_shutdown(from_signal: bool = False): except Exception: log.exception("audio_mgr.close() failed") + if camera is not None: + try: + if camera.is_running(): + camera.stop() + except Exception: + log.exception("camera.stop() failed") + log.info("Shutdown complete") diff --git a/motion/arm_controller.py b/motion/arm_controller.py index 6237ea7..f68c280 100644 --- a/motion/arm_controller.py +++ b/motion/arm_controller.py @@ -673,6 +673,8 @@ class ArmController: self._is_busy = True self._cancel.clear() + _start = time.monotonic() + _failed = False try: bus.emit_sync("motion.action_started", action=action.name) if action.file: @@ -680,12 +682,20 @@ class ArmController: else: self._run_sdk_action(action) except Exception as exc: + _failed = True log.error("Action %s failed: %s", action.name, exc) + bus.emit_sync("motion.action_error", action=action.name, + reason=str(exc)) finally: with self._lock: self._is_busy = False self._last_action_time = time.monotonic() - bus.emit_sync("motion.action_done", action=action.name) + # action_done always fires (back-compat for existing listeners); + # action_error above is the extra signal for the Gemini + # motion-state channel. elapsed_sec lets Gemini say "...took 2.3s". + bus.emit_sync("motion.action_done", action=action.name, + elapsed_sec=round(time.monotonic() - _start, 2), + failed=_failed) def _run_sdk_action(self, action: Action): if not _HAS_SDK: diff --git a/requirements.txt b/requirements.txt index 190bb02..ab563c4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,11 +7,30 @@ uvicorn[standard]>=0.29.0 python-multipart>=0.0.9 # Gemini voice +# google-genai: the Gemini Live SDK — used by gemini/script.py (live brain) +# and gemini/client.py. Needs Python 3.10+, which is why the voice loop +# runs in the gemini_sdk conda env. send_realtime_input(video=)/(text=) +# and send_client_content() require a reasonably recent (>=1.x) release. +google-genai>=1.0.0 websockets>=12.0 pyaudio>=0.2.13 -# Camera proxy -httpx>=0.27.0 +# Recognition (camera vision + face gallery for Gemini-side face recognition) +# opencv-python-headless: JPEG encode + USB-camera fallback. Headless wheel — +# the dashboard renders frames; we never need a GUI window. +# Pillow: resize face samples before the Gemini primer turn. +opencv-python-headless>=4.8.0 +Pillow>=10.0.0 +# +# pyrealsense2 — DO NOT `pip install` on Jetson / JetPack 5. +# The PyPI wheel is built against glibc 2.32+ (Ubuntu 22.04); JetPack 5 ships +# glibc 2.31, so the wheel fails to load with: +# ImportError: ... version `GLIBC_2.32' not found +# On Jetson, build the Python binding from source against the apt-installed +# librealsense2 runtime (see README → "Camera vision on Jetson"). +# On x86_64 / Ubuntu 22.04+ desktops, `pip install pyrealsense2` works fine. +# If pyrealsense2 is absent, CameraDaemon falls back to cv2.VideoCapture(0). +# pyrealsense2>=2.50.0 # intentionally commented — see note above # Local TTS (optional — only needed for MBZUAI model) transformers>=4.40.0 diff --git a/vision/__init__.py b/vision/__init__.py new file mode 100644 index 0000000..2a2729a --- /dev/null +++ b/vision/__init__.py @@ -0,0 +1 @@ +"""Vision package — camera daemon + face gallery for Gemini-side recognition.""" diff --git a/vision/camera.py b/vision/camera.py new file mode 100644 index 0000000..7f150f8 --- /dev/null +++ b/vision/camera.py @@ -0,0 +1,560 @@ +"""Camera daemon — single producer, in-memory frame cache. + +Captures frames at fixed FPS from a RealSense (preferred) or any USB +camera (fallback), JPEG-encodes them, and caches the latest frame in +memory in two views (matches Marcus's API/camera_api.py): + + - `_latest_jpeg` raw JPEG bytes — dashboard preview + frame forwarder + - `_latest_b64` base64 ASCII — frame forwarder → Gemini child stdin + +Consumers: + - dashboard preview → `snapshot_jpeg()` (served as an HTTP Response) + - face enrollment → `get_fresh_frame()` for a guaranteed-current capture + - GeminiSubprocess → `get_frame_b64()`, pushed over the child's stdin + +Lifecycle is driven by the Recognition tab toggle. The daemon is idle +until `start()` is called; failures in start() are non-fatal and +reported via `is_running()` / `backend`. Once running it auto-reconnects +on USB unplug / stalled frames (Marcus-style resilience), and supports +hot `reconfigure()` of resolution/FPS without a full restart. +""" + +from __future__ import annotations + +import base64 +import os +import threading +import time +from typing import Optional + +import numpy as np + +from Project.Sanad.core.logger import get_logger + +log = get_logger("camera") + +# How many /dev/video* indices to scan for a USB-style color camera when +# RealSense isn't available. A RealSense exposes ~6 V4L2 nodes (depth, IR, +# color, metadata…) — the color one is rarely index 0, so we probe each +# and accept the first that yields a real 3-channel BGR frame. +_USB_SCAN_RANGE = 10 + + +class CameraDaemon: + """RealSense → USB fallback camera capture with in-memory frame cache.""" + + def __init__( + self, + width: int = 424, + height: int = 240, + fps: int = 15, + jpeg_quality: int = 70, + stale_threshold_s: float = 10.0, + reconnect_min_s: float = 2.0, + reconnect_max_s: float = 10.0, + capture_timeout_ms: int = 5000, + ) -> None: + # Active profile — guarded by _reconfig_lock so reconfigure() can + # hot-swap it from another thread between capture sessions. + self._reconfig_lock = threading.Lock() + self._w = int(width) + self._h = int(height) + self._fps = int(fps) + self._q = max(10, min(95, int(jpeg_quality))) + self._reconfig_pending = False + + # Resilience knobs (Marcus-style) + self._stale_s = float(stale_threshold_s) + self._reconnect_min_s = float(reconnect_min_s) + self._reconnect_max_s = float(reconnect_max_s) + self._capture_timeout_ms = int(capture_timeout_ms) + + self._thread: Optional[threading.Thread] = None + self._stop = threading.Event() + self._backend: Optional[str] = None + self._lock = threading.Lock() + self._latest_jpeg: Optional[bytes] = None + self._latest_b64: Optional[str] = None + self._latest_ts: float = 0.0 + self._frame_seq: int = 0 + self._error: Optional[str] = None + self._reconnect_count: int = 0 + + # ── public API ────────────────────────────────────────── + + @property + def backend(self) -> Optional[str]: + return self._backend + + @property + def error(self) -> Optional[str]: + return self._error + + @property + def frame_seq(self) -> int: + return self._frame_seq + + def is_running(self) -> bool: + return self._thread is not None and self._thread.is_alive() + + def start(self) -> bool: + """Start capture thread. Returns True if a backend was acquired. + + Initial probe is synchronous; if it fails the thread isn't spawned. + Once running, the inner loop auto-reconnects on USB unplug or + stalled frames using exponential backoff (`reconnect_min_s` .. + `reconnect_max_s`). + """ + if self.is_running(): + return True + self._stop.clear() + self._error = None + self._reconnect_count = 0 + + # One-shot USB-2.0 negotiation diagnostic (warns operator if D435I + # came up on USB 2.0 — frame drops would be likely otherwise). + self._check_usb_version() + + backend = self._probe_any() + if backend is None: + log.warning("Camera: no backend available (RealSense + USB both failed)") + self._backend = None + return False + + self._backend = backend["name"] + self._thread = threading.Thread( + target=self._reconnect_loop, args=(backend,), + daemon=True, name="camera-daemon", + ) + self._thread.start() + with self._reconfig_lock: + w, h, f = self._w, self._h, self._fps + log.info("Camera started (backend=%s, %dx%d @ %dfps)", + self._backend, w, h, f) + return True + + def stop(self) -> None: + """Stop the capture thread and release the hardware.""" + if not self.is_running(): + self._backend = None + return + self._stop.set() + t = self._thread + if t is not None: + t.join(timeout=2.0) + self._thread = None + self._backend = None + log.info("Camera stopped") + + def reconfigure(self, width: Optional[int] = None, height: Optional[int] = None, + fps: Optional[int] = None, jpeg_quality: Optional[int] = None) -> dict: + """Hot-swap the capture profile without a full stop/start. + + Sets a pending flag — the capture loop notices it, tears the + pipeline down, and rebuilds at the new resolution (~0.5 s gap). + If the daemon isn't running the new values just take effect on + the next `start()`. Returns the resulting active profile. + """ + with self._reconfig_lock: + if width is not None: + self._w = int(width) + if height is not None: + self._h = int(height) + if fps is not None: + self._fps = int(fps) + if jpeg_quality is not None: + self._q = max(10, min(95, int(jpeg_quality))) + if self.is_running(): + self._reconfig_pending = True + profile = {"width": self._w, "height": self._h, + "fps": self._fps, "jpeg_quality": self._q} + log.info("Camera reconfigure → %s", profile) + return profile + + def snapshot_jpeg(self) -> Optional[bytes]: + """Return the latest JPEG bytes, or None if no frame yet.""" + with self._lock: + return self._latest_jpeg + + def get_frame_b64(self) -> Optional[str]: + """Return the latest frame as a base64 ASCII string (or None). + + Used by the frame forwarder to push frames over the Gemini child's + stdin without re-encoding — base64 is cached alongside the JPEG. + """ + with self._lock: + return self._latest_b64 + + def get_fresh_frame(self, max_age_s: float = 0.5, + timeout_s: float = 1.5) -> Optional[bytes]: + """Return a JPEG frame newer than `max_age_s`, waiting up to `timeout_s`. + + Used by face enrollment so the captured frame is guaranteed to be + the *current* scene, not a stale buffer from before the user got + into position. Falls back to whatever's cached on timeout. + """ + deadline = time.time() + timeout_s + while time.time() < deadline: + with self._lock: + if (self._latest_jpeg is not None + and self._latest_ts > 0 + and (time.time() - self._latest_ts) <= max_age_s): + return self._latest_jpeg + time.sleep(0.03) + with self._lock: + return self._latest_jpeg + + def latest_age_s(self) -> float: + """Seconds since last successful frame; +inf if none.""" + with self._lock: + if self._latest_ts <= 0: + return float("inf") + return time.time() - self._latest_ts + + def status(self) -> dict: + with self._reconfig_lock: + w, h, f, q = self._w, self._h, self._fps, self._q + # latest_age_s() is +inf until the first frame lands. inf is NOT + # JSON-serialisable by Starlette's JSONResponse (allow_nan=False) — + # leaving it as inf would 500 the /api/recognition/* routes. Map + # "running but no frame yet" and "not running" both to None. + age = self.latest_age_s() + age_s = round(age, 2) if (self.is_running() and age != float("inf")) else None + return { + "running": self.is_running(), + "backend": self._backend, + "width": w, + "height": h, + "fps": f, + "jpeg_quality": q, + "frame_seq": self._frame_seq, + "age_s": age_s, + "error": self._error, + "reconnect_count": self._reconnect_count, + } + + # ── helpers ───────────────────────────────────────────── + + def _probe_any(self) -> Optional[dict]: + """Try RealSense first, then USB. Returns backend dict or None.""" + b = self._probe_realsense() + if b is None: + b = self._probe_usb() + return b + + def _check_usb_version(self) -> None: + """Warn if a connected RealSense negotiated USB 2.0 (needs 3.x). + + Marcus has this same check — D435I on USB 2.0 can't deliver + color+depth+IMU and the pipeline silently stalls. Catching it at + startup lets the operator fix the cable/port instead of chasing a + "no frames" loop. Diagnostic only; never blocks startup. + """ + try: + import pyrealsense2 as rs # type: ignore + ctx = rs.context() + for dev in ctx.query_devices(): + try: + usb_type = dev.get_info(rs.camera_info.usb_type_descriptor) + name = dev.get_info(rs.camera_info.name) + except Exception: + continue + if str(usb_type).startswith("2."): + log.warning( + "RealSense %s negotiated USB %s — expected 3.x. " + "Frame drops likely. Try a USB 3 port / shorter cable / " + "powered hub.", name, usb_type, + ) + else: + log.info("RealSense %s on USB %s", name, usb_type) + except Exception: + pass + + # ── backend probing ───────────────────────────────────── + + def _probe_realsense(self) -> Optional[dict]: + with self._reconfig_lock: + w, h, f = self._w, self._h, self._fps + try: + import pyrealsense2 as rs # type: ignore + pipeline = rs.pipeline() + cfg = rs.config() + cfg.enable_stream(rs.stream.color, w, h, rs.format.bgr8, f) + profile = pipeline.start(cfg) + return {"name": "realsense", "pipeline": pipeline, "rs": rs, + "profile": profile} + except Exception as exc: + log.info("RealSense unavailable: %s", exc) + return None + + def _open_usb_index(self, idx: int, w: int, h: int, f: int, + cv2) -> Optional[dict]: + """Open one /dev/video, validate it yields a 3-channel frame, + and classify it as colour vs grayscale/IR. + + A RealSense IR node delivers Y8 — cv2 replicates that single plane + across 3 channels, so the planes come back *bit-identical*. A real + colour sensor never produces bit-identical channels (per-channel + sensor noise differs even on a flat gray scene). That's the test. + Returns a backend dict with `is_color`, or None if the node is + unusable. + """ + cap = None + try: + cap = cv2.VideoCapture(idx) + if not cap.isOpened(): + cap.release() + return None + cap.set(cv2.CAP_PROP_FRAME_WIDTH, w) + cap.set(cv2.CAP_PROP_FRAME_HEIGHT, h) + cap.set(cv2.CAP_PROP_FPS, f) + good = None + for _ in range(5): + ok, frame = cap.read() + if (ok and frame is not None and frame.ndim == 3 + and frame.shape[2] == 3): + good = frame + break + if good is None: + cap.release() + return None + is_color = not ( + np.array_equal(good[:, :, 0], good[:, :, 1]) + and np.array_equal(good[:, :, 1], good[:, :, 2]) + ) + return {"name": "usb", "cap": cap, "cv2": cv2, "index": idx, + "is_color": is_color, + "frame_wh": (good.shape[1], good.shape[0])} + except Exception as exc: + log.info("USB camera index %d: %s", idx, exc) + if cap is not None: + try: + cap.release() + except Exception: + pass + return None + + def _probe_usb(self) -> Optional[dict]: + """Scan /dev/video* for a colour camera node, falling back to a + grayscale/IR node only if no colour node exists. + + On a RealSense, /dev/video0 is the *depth* stream (Z16, cv2 can't + open it as a webcam); the IR nodes deliver Y8 (grayscale); the + *colour* node delivers YUYV/BGR. We can't know the index up front, + so we probe each and prefer the first genuine colour node — that's + why the dashboard preview used to come up grayscale. Pin a node + with SANAD_CAMERA_USB_INDEX= to skip the scan entirely. + """ + with self._reconfig_lock: + w, h, f = self._w, self._h, self._fps + try: + import cv2 # type: ignore + except Exception as exc: + log.info("USB camera unavailable: %s", exc) + return None + + # Pinned index — accept whatever it is (colour or not). + explicit = os.environ.get("SANAD_CAMERA_USB_INDEX", "").strip() + if explicit.isdigit(): + backend = self._open_usb_index(int(explicit), w, h, f, cv2) + if backend is not None: + fw, fh = backend["frame_wh"] + log.info("USB camera: pinned /dev/video%d (%dx%d, %s)", + backend["index"], fw, fh, + "colour" if backend["is_color"] else "grayscale/IR") + return backend + log.warning("USB camera: pinned index %s unusable", explicit) + return None + + # Scan — prefer a real colour node; keep the first grayscale node + # as a last resort so the camera still works if that's all there is. + gray_fallback: Optional[dict] = None + for idx in range(_USB_SCAN_RANGE): + backend = self._open_usb_index(idx, w, h, f, cv2) + if backend is None: + continue + fw, fh = backend["frame_wh"] + if backend["is_color"]: + log.info("USB camera: using /dev/video%d (colour, %dx%d)", + idx, fw, fh) + if gray_fallback is not None: + try: + gray_fallback["cap"].release() + except Exception: + pass + return backend + # grayscale/IR — remember the first, release any extras + if gray_fallback is None: + gray_fallback = backend + else: + try: + backend["cap"].release() + except Exception: + pass + + if gray_fallback is not None: + fw, fh = gray_fallback["frame_wh"] + log.warning("USB camera: no colour node found — falling back to " + "/dev/video%d (grayscale/IR, %dx%d). For a RealSense, " + "build pyrealsense2 or pin the colour node with " + "SANAD_CAMERA_USB_INDEX.", gray_fallback["index"], fw, fh) + return gray_fallback + + log.info("USB camera unavailable: no working /dev/video* node found " + "(scanned %d indices)", _USB_SCAN_RANGE) + return None + + # ── main capture loop ─────────────────────────────────── + + def _reconnect_loop(self, initial_backend: dict) -> None: + """Outer loop — owns reconnect with exponential backoff. + + Inner `_capture_session` runs until the camera goes stale, the + stop flag is set, or a reconfigure is requested. On stall we + sleep + re-probe; on reconfigure we re-probe immediately at the + new resolution. Backoff resets after a successful session. + """ + backend = initial_backend + backoff = self._reconnect_min_s + + while not self._stop.is_set(): + reconfigured = False + try: + reconfigured = self._capture_session(backend) + except Exception as exc: + log.exception("Camera capture session crashed: %s", exc) + self._error = str(exc) + finally: + self._teardown(backend) + + if self._stop.is_set(): + break + + if reconfigured: + # Fast path — rebuild immediately at the new profile. + with self._reconfig_lock: + self._reconfig_pending = False + new_backend = self._probe_any() + if new_backend is None: + self._error = "reconnecting" + log.warning("Camera reconfigure: re-probe failed — " + "retrying in %.1fs", backoff) + if self._stop.wait(backoff): + break + backoff = min(backoff * 2, self._reconnect_max_s) + continue + self._backend = new_backend["name"] + self._error = None + backend = new_backend + backoff = self._reconnect_min_s + log.info("Camera rebuilt after reconfigure (backend=%s)", + self._backend) + continue + + # Capture session ended unexpectedly (stall / crash). Sleep + re-probe. + self._error = "reconnecting" + log.warning("Camera disconnected — reconnecting in %.1fs", backoff) + if self._stop.wait(backoff): # interruptible sleep + break + backoff = min(backoff * 2, self._reconnect_max_s) + + new_backend = self._probe_any() + if new_backend is None: + self._backend = None + continue # stay in the loop; next iteration retries + self._backend = new_backend["name"] + self._reconnect_count += 1 + self._error = None + log.info("Camera reconnected (backend=%s, attempt #%d)", + self._backend, self._reconnect_count) + backend = new_backend + backoff = self._reconnect_min_s # reset on success + + def _capture_session(self, backend: dict) -> bool: + """Inner capture loop — runs until stop, stale-frame timeout, or + a reconfigure request. + + Returns True if it exited because of a reconfigure (caller rebuilds + immediately), False on a stall or clean stop. + """ + import cv2 # always available — used for JPEG encode + + with self._reconfig_lock: + encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), self._q] + last_frame_time = time.time() + consecutive_failures = 0 + + while not self._stop.is_set(): + if self._reconfig_pending: + log.info("Camera reconfigure requested — rebuilding pipeline") + return True + + bgr = self._read_frame(backend) + if bgr is None: + consecutive_failures += 1 + age = time.time() - last_frame_time + if age > self._stale_s: + log.warning( + "Camera stalled %.1fs (%d consecutive timeouts) — " + "rebuilding pipeline", age, consecutive_failures, + ) + return False + # Intermediate warnings so degradation is visible early + if consecutive_failures in (3, 10, 30): + log.warning("Camera slow (%d failures, age %.1fs)", + consecutive_failures, age) + time.sleep(0.05) + continue + + try: + ok, buf = cv2.imencode(".jpg", bgr, encode_params) + except Exception as exc: + log.warning("JPEG encode failed: %s", exc) + continue + if not ok: + continue + jpeg = bytes(buf) + b64 = base64.b64encode(jpeg).decode("ascii") + now = time.time() + with self._lock: + self._latest_jpeg = jpeg + self._latest_b64 = b64 + self._latest_ts = now + self._frame_seq += 1 + last_frame_time = now + consecutive_failures = 0 + + return False + + def _read_frame(self, backend: dict) -> Optional[np.ndarray]: + name = backend["name"] + if name == "realsense": + try: + frames = backend["pipeline"].wait_for_frames( + timeout_ms=self._capture_timeout_ms, + ) + color = frames.get_color_frame() + if not color: + return None + return np.asanyarray(color.get_data()) + except Exception: + # Soft path — single timeouts handled by _capture_session's + # stale-detection logic; don't spam the log per frame. + return None + elif name == "usb": + cap = backend["cap"] + ok, frame = cap.read() + if not ok or frame is None: + return None + return frame + return None + + def _teardown(self, backend: dict) -> None: + name = backend.get("name") + try: + if name == "realsense": + backend["pipeline"].stop() + elif name == "usb": + backend["cap"].release() + except Exception as exc: + log.info("Camera teardown: %s", exc) diff --git a/vision/face_gallery.py b/vision/face_gallery.py new file mode 100644 index 0000000..1b637ab --- /dev/null +++ b/vision/face_gallery.py @@ -0,0 +1,363 @@ +"""Face gallery — pure file IO over data/faces/face_{id}/. + +Layout per face: + face_{id}/ + face_1.jpg ← samples (≥1 required) + face_2.jpg + face_3.png + meta.json ← optional: {"name": "...", "description": "...", "added_at": "..."} + +`description` is free text the operator writes about the person ("lead +engineer, likes coffee") — it's folded into the Gemini primer turn so +Gemini can reference it when it recognises that face. + +No ML — Gemini does the recognition in-context using the samples we feed it +via the primer turn. This module's only jobs are: + - enumerate enrolled faces + - serve & accept JPEG/PNG bytes per face + - rename / describe / delete / zip / load-for-primer + +Thread-safe via a single internal RLock. +""" + +from __future__ import annotations + +import io +import json +import re +import threading +import zipfile +from dataclasses import dataclass, field +from datetime import datetime +from pathlib import Path +from typing import Iterable + +from Project.Sanad.core.logger import get_logger + +log = get_logger("face_gallery") + + +_DIR_RE = re.compile(r"^face_(\d+)$") +ALLOWED_EXTS = {".jpg", ".jpeg", ".png"} +SAMPLE_NAME_RE = re.compile(r"^face_(\d+)\.(jpg|jpeg|png)$", re.IGNORECASE) + + +@dataclass +class PhotoInfo: + name: str + size_bytes: int + path: Path + + +@dataclass +class FaceEntry: + id: int + name: str | None + added_at: str | None + dir: Path + description: str | None = None + sample_paths: list[Path] = field(default_factory=list) + + def to_dict(self) -> dict: + return { + "id": self.id, + "name": self.name, + "description": self.description, + "added_at": self.added_at, + "dir": str(self.dir), + "photos": [ + {"name": p.name, "size_bytes": p.stat().st_size} + for p in self.sample_paths + if p.exists() + ], + } + + +class FaceGallery: + """File-system backed gallery rooted at `root` (e.g. data/faces/).""" + + def __init__(self, root: Path | str) -> None: + self.root = Path(root) + self._lock = threading.RLock() + + # ── read ──────────────────────────────────────────────── + + def _ensure_root(self) -> None: + self.root.mkdir(parents=True, exist_ok=True) + + def _iter_face_dirs(self) -> Iterable[tuple[int, Path]]: + if not self.root.exists(): + return + for child in sorted(self.root.iterdir()): + if not child.is_dir(): + continue + m = _DIR_RE.match(child.name) + if not m: + continue + yield int(m.group(1)), child + + def _samples_in(self, face_dir: Path) -> list[Path]: + out: list[Path] = [] + for p in sorted(face_dir.iterdir()): + if p.is_file() and p.suffix.lower() in ALLOWED_EXTS: + out.append(p) + return out + + def _meta(self, face_dir: Path) -> tuple[str | None, str | None, str | None]: + """Return (name, description, added_at) — any may be None.""" + meta_path = face_dir / "meta.json" + if not meta_path.exists(): + return None, None, None + try: + data = json.loads(meta_path.read_text(encoding="utf-8")) + except Exception: + return None, None, None + name = data.get("name") + description = data.get("description") + added = data.get("added_at") + return (name if name else None), (description if description else None), added + + def list(self) -> list[FaceEntry]: + with self._lock: + entries: list[FaceEntry] = [] + for face_id, face_dir in self._iter_face_dirs(): + name, description, added = self._meta(face_dir) + entries.append(FaceEntry( + id=face_id, + name=name, + description=description, + added_at=added, + dir=face_dir, + sample_paths=self._samples_in(face_dir), + )) + return entries + + def get(self, face_id: int) -> FaceEntry | None: + with self._lock: + face_dir = self.root / f"face_{face_id}" + if not face_dir.is_dir(): + return None + name, description, added = self._meta(face_dir) + return FaceEntry( + id=face_id, name=name, description=description, added_at=added, + dir=face_dir, sample_paths=self._samples_in(face_dir), + ) + + def get_photo(self, face_id: int, photo_name: str) -> Path | None: + with self._lock: + face_dir = self.root / f"face_{face_id}" + if not face_dir.is_dir(): + return None + p = face_dir / photo_name + try: + p.resolve().relative_to(face_dir.resolve()) + except ValueError: + return None + if not p.exists() or p.suffix.lower() not in ALLOWED_EXTS: + return None + return p + + # ── write ─────────────────────────────────────────────── + + def next_id(self) -> int: + with self._lock: + max_id = 0 + for face_id, _ in self._iter_face_dirs(): + if face_id > max_id: + max_id = face_id + return max_id + 1 + + def _next_sample_name(self, face_dir: Path, ext: str) -> str: + """Return next free face_N. filename inside face_dir.""" + existing = self._samples_in(face_dir) + max_n = 0 + for p in existing: + m = SAMPLE_NAME_RE.match(p.name) + if m: + n = int(m.group(1)) + if n > max_n: + max_n = n + return f"face_{max_n + 1}{ext.lower()}" + + @staticmethod + def _detect_ext(jpeg_or_png: bytes) -> str: + """Sniff PNG vs JPEG from the magic bytes.""" + if len(jpeg_or_png) >= 8 and jpeg_or_png[:8] == b"\x89PNG\r\n\x1a\n": + return ".png" + return ".jpg" + + def _write_meta(self, face_dir: Path, name: str | None, + description: str | None = None, + added_at: str | None = None) -> None: + meta: dict[str, str] = {} + if name: + meta["name"] = name + if description: + meta["description"] = description + meta["added_at"] = added_at or datetime.now().isoformat(timespec="seconds") + (face_dir / "meta.json").write_text( + json.dumps(meta, ensure_ascii=False, indent=2), + encoding="utf-8", + ) + + def create_face(self, image_bytes_list: list[bytes], + name: str | None = None, + description: str | None = None) -> FaceEntry: + """Create a new face_{next_id}/ with one or more samples.""" + if not image_bytes_list: + raise ValueError("create_face: empty image list") + with self._lock: + self._ensure_root() + face_id = self.next_id() + face_dir = self.root / f"face_{face_id}" + face_dir.mkdir(parents=True, exist_ok=False) + for idx, data in enumerate(image_bytes_list, start=1): + ext = self._detect_ext(data) + fname = f"face_{idx}{ext}" + (face_dir / fname).write_bytes(data) + clean_name = (name or "").strip() or None + clean_desc = (description or "").strip() or None + self._write_meta(face_dir, clean_name, clean_desc) + log.info("Created face_%d (samples=%d, name=%s, desc=%s)", + face_id, len(image_bytes_list), clean_name or "(unnamed)", + "yes" if clean_desc else "no") + return self.get(face_id) # type: ignore[return-value] + + def add_photo(self, face_id: int, image_bytes: bytes) -> str: + """Append a new sample to an existing face. Returns the filename.""" + with self._lock: + face_dir = self.root / f"face_{face_id}" + if not face_dir.is_dir(): + raise FileNotFoundError(f"face_{face_id} not found") + ext = self._detect_ext(image_bytes) + fname = self._next_sample_name(face_dir, ext) + (face_dir / fname).write_bytes(image_bytes) + log.info("Added sample %s to face_%d", fname, face_id) + return fname + + def rename(self, face_id: int, name: str | None) -> None: + """Update meta.json with a new name (or clear it if name is empty). + + Preserves the existing description + added_at. + """ + with self._lock: + face_dir = self.root / f"face_{face_id}" + if not face_dir.is_dir(): + raise FileNotFoundError(f"face_{face_id} not found") + _, description, added = self._meta(face_dir) + clean = (name or "").strip() or None + self._write_meta(face_dir, clean, description, added_at=added) + log.info("Renamed face_%d → %s", face_id, clean or "(unnamed)") + + def set_description(self, face_id: int, description: str | None) -> None: + """Update meta.json with a free-text description (or clear it). + + Preserves the existing name + added_at. The description is folded + into the Gemini primer turn so Gemini can reference it. + """ + with self._lock: + face_dir = self.root / f"face_{face_id}" + if not face_dir.is_dir(): + raise FileNotFoundError(f"face_{face_id} not found") + name, _, added = self._meta(face_dir) + clean = (description or "").strip() or None + self._write_meta(face_dir, name, clean, added_at=added) + log.info("Set description for face_%d (%s)", face_id, + "cleared" if not clean else f"{len(clean)} chars") + + def delete_photo(self, face_id: int, photo_name: str) -> None: + """Delete one photo. Refuses if it's the only remaining sample.""" + with self._lock: + face_dir = self.root / f"face_{face_id}" + if not face_dir.is_dir(): + raise FileNotFoundError(f"face_{face_id} not found") + samples = self._samples_in(face_dir) + if len(samples) <= 1: + raise ValueError( + "Cannot delete the only photo — delete the face instead." + ) + target = self.get_photo(face_id, photo_name) + if target is None: + raise FileNotFoundError(f"photo {photo_name} not found") + target.unlink() + log.info("Deleted %s from face_%d", photo_name, face_id) + + def delete_face(self, face_id: int) -> None: + """Delete the entire face_{id}/ folder (including meta.json).""" + import shutil + with self._lock: + face_dir = self.root / f"face_{face_id}" + if not face_dir.is_dir(): + raise FileNotFoundError(f"face_{face_id} not found") + shutil.rmtree(face_dir) + log.info("Deleted face_%d", face_id) + + def zip_face(self, face_id: int) -> bytes: + """Return the entire face_{id}/ folder packaged as a ZIP.""" + with self._lock: + face_dir = self.root / f"face_{face_id}" + if not face_dir.is_dir(): + raise FileNotFoundError(f"face_{face_id} not found") + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_DEFLATED) as zf: + for p in sorted(face_dir.iterdir()): + if p.is_file(): + zf.write(p, arcname=f"face_{face_id}/{p.name}") + return buf.getvalue() + + # ── primer support (used by gemini/script.py) ─────────── + + def load_for_primer( + self, max_samples_per_face: int = 3, resize_long_side: int = 256, + ) -> list[tuple[FaceEntry, list[bytes]]]: + """Return [(FaceEntry, [jpeg_bytes,…]), …] for Gemini upload. + + Resizes each sample to longest-side <= resize_long_side, re-encodes + as JPEG (q=85) to keep the token cost manageable. Falls back to + the raw bytes if PIL isn't available. + """ + entries = self.list() + if not entries: + return [] + out: list[tuple[FaceEntry, list[bytes]]] = [] + for e in entries: + paths = e.sample_paths[:max_samples_per_face] + jpegs: list[bytes] = [] + for p in paths: + try: + raw = p.read_bytes() + except OSError: + continue + processed = self._resize_for_primer(raw, resize_long_side) + jpegs.append(processed or raw) + if jpegs: + out.append((e, jpegs)) + return out + + @staticmethod + def _resize_for_primer(raw: bytes, long_side: int) -> bytes | None: + """Resize image to longest-side ≤ long_side, re-encode JPEG q=85. + + Returns None on any failure (caller falls back to raw bytes). + """ + try: + from PIL import Image # type: ignore + except Exception: + return None + try: + img = Image.open(io.BytesIO(raw)) + img.load() + if img.mode not in ("RGB", "L"): + img = img.convert("RGB") + w, h = img.size + scale = long_side / max(w, h) if max(w, h) > long_side else 1.0 + if scale < 1.0: + img = img.resize( + (max(1, int(w * scale)), max(1, int(h * scale))), + Image.LANCZOS, + ) + buf = io.BytesIO() + img.save(buf, format="JPEG", quality=85, optimize=True) + return buf.getvalue() + except Exception: + return None diff --git a/vision/recognition_state.py b/vision/recognition_state.py new file mode 100644 index 0000000..f7a21d3 --- /dev/null +++ b/vision/recognition_state.py @@ -0,0 +1,68 @@ +"""Recognition state file — atomic JSON I/O shared by parent + child. + +The dashboard (parent process) writes this file on every toggle / face +gallery change; the Gemini child (`gemini/script.py`) polls it at 1 Hz +to flip its in-memory flags without a session restart. + +Format (data/.recognition_state.json): + { + "vision_enabled": bool, + "face_rec_enabled": bool, + "gallery_version": int # bumped on every face CRUD + } +""" + +from __future__ import annotations + +import json +import os +import tempfile +from dataclasses import asdict, dataclass +from pathlib import Path + + +@dataclass +class RecognitionState: + vision_enabled: bool = False + face_rec_enabled: bool = False + gallery_version: int = 0 + + +def read(path: Path) -> RecognitionState: + """Return the persisted state, or a default if missing/corrupt.""" + try: + raw = json.loads(Path(path).read_text(encoding="utf-8")) + except (FileNotFoundError, json.JSONDecodeError, OSError): + return RecognitionState() + return RecognitionState( + vision_enabled=bool(raw.get("vision_enabled", False)), + face_rec_enabled=bool(raw.get("face_rec_enabled", False)), + gallery_version=int(raw.get("gallery_version", 0)), + ) + + +def write(path: Path, state: RecognitionState) -> None: + """Write atomically via tempfile + os.replace.""" + p = Path(path) + p.parent.mkdir(parents=True, exist_ok=True) + fd, tmp = tempfile.mkstemp(prefix=f".{p.name}.", suffix=".tmp", dir=str(p.parent)) + try: + with os.fdopen(fd, "w", encoding="utf-8") as fh: + json.dump(asdict(state), fh, ensure_ascii=False, indent=2) + os.replace(tmp, p) + except Exception: + try: + os.unlink(tmp) + except OSError: + pass + raise + + +def mutate(path: Path, **changes) -> RecognitionState: + """Read-modify-write helper. Returns the new state.""" + cur = read(path) + for k, v in changes.items(): + if hasattr(cur, k): + setattr(cur, k, v) + write(path, cur) + return cur