"""
|
||
marcus_imgsearch.py — Image-Guided Search
|
||
==========================================
|
||
Project : Marcus | YS Lootah Technology
|
||
Hardware : Unitree G1 EDU + Jetson Orin NX
|
||
Purpose : Find a specific person or object by comparing camera frames
|
||
to a reference image provided by the user.
|
||
|
||
How it works
|
||
------------
|
||
1. User provides a reference image (photo of person or object)
|
||
2. Marcus starts rotating while scanning camera frames
|
||
3. Every step: LLaVA compares current frame to reference image
|
||
4. When match found: robot stops, reports location
|
||
5. Optional: YOLO pre-filter speeds up search (find person class first,
|
||
then LLaVA verifies it's the right person)
|
||
|
||
Usage in marcus_llava.py
|
||
------------------------
|
||
from marcus_imgsearch import ImageSearch
|
||
searcher = ImageSearch(get_frame_fn=get_frame, send_vel_fn=send_vel,
|
||
gradual_stop_fn=gradual_stop, llava_fn=_call_llava,
|
||
yolo_sees_fn=yolo_sees, model=OLLAMA_MODEL)
|
||
|
||
# Start search with a reference image (base64 JPEG)
|
||
result = searcher.search(ref_img_b64, hint="person in blue shirt", max_steps=60)
|
||
# result: {"found": True, "position": "center", "steps": 12, "description": "..."}
|
||
|
||
# Or from a file path
|
||
result = searcher.search_from_file("/tmp/target.jpg", hint="kassam")
|
||
|
||
Standalone test
|
||
---------------
|
||
python3 ~/Models_marcus/marcus_imgsearch.py --image /path/to/photo.jpg
|
||
|
||
Date : April 2026
|
||
"""
|
||
|
||
import base64
import io
import json
import os
import re
import threading
import time
from pathlib import Path
from typing import Optional

import numpy as np

try:
    from PIL import Image
    PIL_AVAILABLE = True
except ImportError:
    PIL_AVAILABLE = False
|
||
|
||
|
||
# ══════════════════════════════════════════════════════════════════════════════
# CONFIGURATION
# ══════════════════════════════════════════════════════════════════════════════

DEFAULT_MAX_STEPS = 60      # max rotation steps before giving up
STEP_DELAY = 0.4            # seconds between YOLO checks (one search "step")
ROTATE_SPEED = 0.25         # rad/s rotation speed during search
MIN_STEPS_WARMUP = 3        # skip first N steps (camera frame may be stale)
MATCH_CONFIDENCE_THR = 0.6  # LLaVA confidence threshold (not used directly,
                            # but kept for future scoring)
|
||
|
||
|
||
# ══════════════════════════════════════════════════════════════════════════════
|
||
# IMAGE UTILITIES
|
||
# ══════════════════════════════════════════════════════════════════════════════
|
||
|
||
def _load_image_b64(path: str) -> Optional[str]:
    """
    Load an image file and return it as a base64-encoded JPEG string.

    Handles any format PIL can decode (JPEG, PNG, BMP, WEBP, ...).
    Downscales to fit within 336x336 (aspect ratio preserved) for LLaVA
    efficiency.

    Args:
        path: filesystem path to the reference image.

    Returns:
        base64 JPEG string, or None if PIL is unavailable, the file does
        not exist, or the image cannot be decoded.
        (Fix: annotation was ``-> str`` but every failure path returns None.)
    """
    if not PIL_AVAILABLE:
        print(" [ImgSearch] PIL not available — pip install Pillow")
        return None

    try:
        path = Path(path)
        if not path.exists():
            print(f" [ImgSearch] File not found: {path}")
            return None

        img = Image.open(path).convert("RGB")

        # Resize to max 336x336 keeping aspect ratio
        img.thumbnail((336, 336), Image.LANCZOS)

        buf = io.BytesIO()
        img.save(buf, format="JPEG", quality=85)
        return base64.b64encode(buf.getvalue()).decode()

    except Exception as e:
        print(f" [ImgSearch] Cannot load image: {e}")
        return None
|
||
|
||
|
||
def _numpy_to_b64(frame: np.ndarray, quality: int = 80) -> Optional[str]:
    """
    Convert a BGR numpy frame (OpenCV channel order) to a base64 JPEG string.

    Args:
        frame:   HxWx3 image array in BGR order, or None.
        quality: JPEG quality passed to PIL (default 80).

    Returns:
        base64 JPEG string, or None if PIL is unavailable, frame is None,
        or encoding fails.
        (Fix: annotation was ``-> str`` but failure paths return None.)
    """
    if not PIL_AVAILABLE or frame is None:
        return None
    try:
        img = Image.fromarray(frame[:, :, ::-1])  # BGR → RGB
        buf = io.BytesIO()
        img.save(buf, format="JPEG", quality=quality)
        return base64.b64encode(buf.getvalue()).decode()
    except Exception:
        return None
||
|
||
|
||
def _resize_b64(img_b64: str, max_size: int = 336) -> str:
    """
    Downscale a base64-encoded image so it fits within max_size × max_size.

    Best-effort: when PIL is unavailable, the input is empty, or decoding
    fails, the original string is returned unchanged.
    """
    if not PIL_AVAILABLE or not img_b64:
        return img_b64
    try:
        decoded = base64.b64decode(img_b64)
        image = Image.open(io.BytesIO(decoded)).convert("RGB")
        image.thumbnail((max_size, max_size), Image.LANCZOS)
        out = io.BytesIO()
        image.save(out, format="JPEG", quality=85)
        return base64.b64encode(out.getvalue()).decode()
    except Exception:
        return img_b64
||
|
||
|
||
# ══════════════════════════════════════════════════════════════════════════════
|
||
# COMPARISON PROMPTS
|
||
# ══════════════════════════════════════════════════════════════════════════════
|
||
|
||
def _build_compare_prompt(hint: str = "") -> str:
|
||
"""
|
||
Build a LLaVA prompt that compares a reference image to the current camera frame.
|
||
|
||
The model receives TWO images: [reference, current_frame].
|
||
It must answer yes/no whether the target from the reference is visible
|
||
in the current frame.
|
||
"""
|
||
hint_line = f'The target is: "{hint}".' if hint else ""
|
||
return f"""You are helping a robot find a specific target.
|
||
|
||
IMAGE 1 (reference): Shows the target to find.
|
||
IMAGE 2 (current camera): Shows what the robot sees right now.
|
||
|
||
{hint_line}
|
||
|
||
Is the target from IMAGE 1 visible in IMAGE 2?
|
||
|
||
Answer ONLY with this JSON:
|
||
{{"found": false, "confidence": "low|medium|high", "position": "left|center|right|not visible", "description": "one sentence"}}
|
||
|
||
JSON:"""
|
||
|
||
|
||
def _build_single_prompt(hint: str) -> str:
    """
    Build a text-only LLaVA search prompt (no reference image available).

    The target is identified purely by the *hint* description; the model
    inspects the single camera frame and must answer with the same strict
    JSON schema used by the two-image comparison prompt.

    Args:
        hint: text description of the target, e.g. "red backpack".

    Returns:
        Prompt string ending in "JSON:" to coax a JSON-only reply.
    """
    return f"""You are a robot scanning for a target.

Target description: "{hint}"

Look at the camera image. Is the target visible?

Answer ONLY with this JSON:
{{"found": false, "confidence": "low|medium|high", "position": "left|center|right|not visible", "description": "one sentence"}}

JSON:"""
|
||
|
||
|
||
# ══════════════════════════════════════════════════════════════════════════════
|
||
# IMAGE SEARCH CLASS
|
||
# ══════════════════════════════════════════════════════════════════════════════
|
||
|
||
class ImageSearch:
    """
    Image-guided robot search.

    Rotates Marcus while comparing camera frames to a reference image.
    Uses LLaVA for visual matching, YOLO as optional pre-filter.

    Thread-safe. Supports abort via Ctrl+C.

    Usage:
        searcher = ImageSearch(
            get_frame_fn=get_frame,
            send_vel_fn=send_vel,
            gradual_stop_fn=gradual_stop,
            llava_fn=_call_llava,
            yolo_sees_fn=yolo_sees,
            model="qwen2.5vl:3b"
        )
        result = searcher.search(ref_img_b64, hint="person in blue shirt")
    """

    def __init__(self, get_frame_fn, send_vel_fn, gradual_stop_fn,
                 llava_fn, yolo_sees_fn=None, model="qwen2.5vl:3b"):
        # Robot-runtime callbacks are injected so this module stays
        # hardware-agnostic.
        self._get_frame = get_frame_fn        # () -> current camera frame, or None
        self._send_vel = send_vel_fn          # (vyaw=...) velocity command
        self._gradual_stop = gradual_stop_fn  # () -> None, smooth stop
        self._call_llava = llava_fn           # (prompt, image, num_predict=...) -> str
        self._yolo_sees = yolo_sees_fn        # (class_name) -> bool, or None
        self._model = model
        # One-element list so abort() (possibly called from another thread)
        # and the search loop share mutable state without a Lock.
        self._abort = [False]

    def abort(self):
        """Signal the current search to stop."""
        self._abort[0] = True

    def search_from_file(self, image_path: str, hint: str = "",
                         max_steps: int = DEFAULT_MAX_STEPS,
                         direction: str = "left") -> dict:
        """
        Search for a target shown in an image file.

        Args:
            image_path : path to JPEG/PNG reference image
            hint       : text description of the target (optional, improves accuracy)
            max_steps  : max rotation steps
            direction  : "left" or "right"

        Returns:
            {"found": bool, "position": str, "steps": int,
             "description": str, "confidence": str}
        """
        ref_b64 = _load_image_b64(image_path)
        if ref_b64 is None:
            return {"found": False, "position": "error",
                    "steps": 0, "description": "Could not load reference image",
                    "confidence": "none"}
        return self.search(ref_b64, hint=hint, max_steps=max_steps, direction=direction)

    def search(self, ref_img_b64: Optional[str] = None, hint: str = "",
               max_steps: int = DEFAULT_MAX_STEPS,
               direction: str = "left",
               yolo_prefilter: Optional[str] = None) -> dict:
        """
        Search for a target by rotating and comparing camera frames.

        Args:
            ref_img_b64   : reference image as base64 JPEG (None = use hint only)
            hint          : text description e.g. "person in blue shirt", "red backpack"
            max_steps     : max steps before giving up (each step ~0.4s)
            direction     : "left" or "right" rotation
            yolo_prefilter: YOLO class to pre-filter (e.g. "person") before LLaVA check
                            None = always use LLaVA on every step

        Returns:
            dict with keys: found, position, steps, description, confidence

        Edge cases:
            - ref_img_b64 None + hint empty → warns, returns not found
            - Camera not ready → waits up to 3s, then skips frame
            - Keyboard interrupt → graceful stop
            - YOLO not available → skips pre-filter, uses LLaVA only
            - LLaVA error → logs and continues
        """
        self._abort[0] = False

        # Validate inputs
        if ref_img_b64 is None and not hint:
            print(" [ImgSearch] ⚠️ No reference image and no hint — cannot search")
            return {"found": False, "position": "error", "steps": 0,
                    "description": "No reference image or hint provided", "confidence": "none"}

        has_ref = ref_img_b64 is not None

        # Choose prompt builder
        if has_ref:
            prompt = _build_compare_prompt(hint)
            print("\n [ImgSearch] Reference image provided")
        else:
            prompt = _build_single_prompt(hint)
            print(f"\n [ImgSearch] Text-only search: '{hint}'")

        print(f" [ImgSearch] Direction: {direction} | Max steps: {max_steps}")
        if yolo_prefilter:
            print(f" [ImgSearch] YOLO pre-filter: '{yolo_prefilter}'")
        print(" [ImgSearch] Starting rotation...\n")

        # Start continuous rotation in a daemon thread; the shared flag list
        # lets the main loop stop it from the finally block below.
        _keep_rotating = [True]
        vyaw = ROTATE_SPEED if direction == "left" else -ROTATE_SPEED

        def _rotate():
            while _keep_rotating[0]:
                self._send_vel(vyaw=vyaw)
                time.sleep(0.05)

        rot_thread = threading.Thread(target=_rotate, daemon=True)
        rot_thread.start()

        result = {"found": False, "position": "not visible",
                  "steps": 0, "description": "Not found", "confidence": "none"}

        try:
            for step in range(1, max_steps + 1):

                if self._abort[0]:
                    print(" [ImgSearch] Aborted by user")
                    break

                result["steps"] = step
                time.sleep(STEP_DELAY)

                # Warmup — skip first frames (stale)
                if step <= MIN_STEPS_WARMUP:
                    print(f" [{step}/{max_steps}] Warming up...")
                    continue

                # ── YOLO pre-filter ───────────────────────────────────────────
                if yolo_prefilter and self._yolo_sees:
                    if not self._yolo_sees(yolo_prefilter):
                        print(f" [{step}/{max_steps}] YOLO: no {yolo_prefilter} — skip LLaVA")
                        continue
                    print(f" [{step}/{max_steps}] YOLO: {yolo_prefilter} found — running LLaVA comparison")

                # ── Get current frame ─────────────────────────────────────────
                current_frame = self._get_frame()
                if current_frame is None:
                    print(f" [{step}/{max_steps}] ⏳ Camera not ready — skipping")
                    # Wait up to 3s for camera
                    for _ in range(6):
                        time.sleep(0.5)
                        current_frame = self._get_frame()
                        # BUGFIX: was `if current_frame:` — raises ValueError
                        # ("truth value ... is ambiguous") when the frame is a
                        # numpy array; test identity like the checks around it.
                        if current_frame is not None:
                            break
                    if current_frame is None:
                        continue

                # ── LLaVA comparison ─────────────────────────────────────────
                print(f" [{step}/{max_steps}] 🔍 LLaVA comparing...")

                try:
                    if has_ref:
                        # Pass BOTH images: [reference, current_frame]
                        # num_batch/num_ctx mirror llava_api.py — without these
                        # caps the compute graph OOMs the runner on Jetson.
                        # (Lazy import: cached in sys.modules after first step.)
                        import ollama as _ollama
                        from API.llava_api import NUM_BATCH, NUM_CTX
                        r = _ollama.chat(
                            model=self._model,
                            messages=[{
                                "role": "user",
                                "content": prompt,
                                "images": [ref_img_b64, current_frame]
                            }],
                            options={
                                "temperature": 0.0,
                                "num_predict": 60,
                                "num_batch": NUM_BATCH,
                                "num_ctx": NUM_CTX,
                            }
                        )
                        raw = r["message"]["content"].strip()
                    else:
                        # Text-only description search
                        raw = self._call_llava(prompt, current_frame, num_predict=60)

                    # Parse response: strip markdown fences, slice outermost {...}
                    raw_clean = raw.replace("```json", "").replace("```", "").strip()
                    s = raw_clean.find("{"); e = raw_clean.rfind("}") + 1
                    d = json.loads(raw_clean[s:e]) if s != -1 and e > 0 else None

                    if d is None:
                        print(f" [{step}/{max_steps}] ⚠️ Bad JSON: {raw[:60]}")
                        continue

                    found = d.get("found", False)
                    confidence = d.get("confidence", "low")
                    position = d.get("position", "not visible")
                    description = d.get("description", "")

                    # Model sometimes answers "found": "yes"/"true" as a string.
                    if isinstance(found, str):
                        found = found.lower() in ("true", "yes", "1")

                    print(f" [{step}/{max_steps}] {'✅ MATCH' if found else '❌ no match'} "
                          f"| conf={confidence} | pos={position} | {description[:60]}")

                    # Only accept confident matches; low-confidence hits keep searching.
                    if found and confidence in ("medium", "high"):
                        result = {
                            "found": True,
                            "position": position,
                            "steps": step,
                            "description": description,
                            "confidence": confidence,
                        }
                        break

                except json.JSONDecodeError:
                    print(f" [{step}/{max_steps}] ⚠️ JSON parse error")
                except Exception as ex:
                    print(f" [{step}/{max_steps}] ⚠️ LLaVA error: {ex}")

        except KeyboardInterrupt:
            print("\n [ImgSearch] Interrupted by user")

        finally:
            # Always stop the rotation thread and bring the robot to rest,
            # regardless of how the loop exited.
            _keep_rotating[0] = False
            self._gradual_stop()

        # Print summary
        if result["found"]:
            print(f"\n ✅ Target found at step {result['steps']}!")
            print(f" 📍 Position: {result['position']}")
            print(f" 📝 {result['description']}\n")
        else:
            print(f"\n ❌ Target not found after {result['steps']} steps\n")

        return result
|
||
|
||
|
||
# ══════════════════════════════════════════════════════════════════════════════
|
||
# WIRE INTO marcus_llava.py — add to main loop
|
||
# ══════════════════════════════════════════════════════════════════════════════
|
||
|
||
"""
|
||
Add to marcus_llava.py imports:
|
||
from marcus_imgsearch import ImageSearch
|
||
|
||
Add after Memory init:
|
||
_img_searcher = ImageSearch(
|
||
get_frame_fn = get_frame,
|
||
send_vel_fn = send_vel,
|
||
gradual_stop_fn = gradual_stop,
|
||
llava_fn = _call_llava,
|
||
yolo_sees_fn = yolo_sees,
|
||
model = OLLAMA_MODEL,
|
||
)
|
||
|
||
Add to main loop (before standard LLaVA command):
|
||
|
||
# Image search — "search/ path/to/photo.jpg [hint]"
|
||
# or "search/ hint text only"
|
||
if cmd.lower().startswith("search/"):
|
||
args = cmd[7:].strip()
|
||
# Check if first arg is a file path
|
||
parts = args.split(None, 1)
|
||
if parts and os.path.exists(parts[0]):
|
||
img_path = parts[0]
|
||
hint = parts[1] if len(parts) > 1 else ""
|
||
print(f" [Search] Reference image: {img_path}")
|
||
print(f" [Search] Hint: '{hint}'")
|
||
_img_searcher.search_from_file(
|
||
img_path, hint=hint, yolo_prefilter="person" if "person" in hint or not hint else None
|
||
)
|
||
else:
|
||
# Text description only
|
||
hint = args
|
||
print(f" [Search] Text search: '{hint}'")
|
||
_img_searcher.search(ref_img_b64=None, hint=hint)
|
||
continue
|
||
"""
|
||
|
||
|
||
# ══════════════════════════════════════════════════════════════════════════════
|
||
# STANDALONE TEST
|
||
# ══════════════════════════════════════════════════════════════════════════════
|
||
|
||
if __name__ == "__main__":
    import argparse

    # Standalone entry point: self-test without hardware, or print guidance
    # for running a real search (which requires the robot runtime).
    parser = argparse.ArgumentParser(description="Marcus Image Search — standalone test")
    parser.add_argument("--image", type=str, default=None, help="Reference image path")
    parser.add_argument("--hint", type=str, default="", help="Text description of target")
    parser.add_argument("--steps", type=int, default=20, help="Max search steps")
    parser.add_argument("--test", action="store_true", help="Run self-test without robot")
    args = parser.parse_args()

    if args.test:
        # Exercise prompt building and (optionally) image loading offline.
        print("=== Self-test (no robot) ===\n")

        print("1. Prompt builder test:")
        hinted_prompt = _build_compare_prompt("person in blue shirt")
        print(f" With hint: {hinted_prompt[:80]}...")
        bare_prompt = _build_compare_prompt()
        print(f" No hint: {bare_prompt[:80]}...")

        print("\n2. Single prompt test:")
        text_prompt = _build_single_prompt("red backpack near the door")
        print(f" {text_prompt[:80]}...")

        if args.image:
            print("\n3. Image loading test:")
            encoded = _load_image_b64(args.image)
            if encoded:
                # Approximate decoded size: base64 inflates by 4/3.
                kb = len(encoded) * 3 // 4 // 1024
                print(f" Loaded: ~{kb}KB")
            else:
                print(" Failed to load")

        print("\nSelf-test complete.")

    elif args.image or args.hint:
        # A real search needs the robot's frame/velocity callbacks.
        print("Real search requires robot hardware.")
        print("Import ImageSearch into marcus_llava.py instead.")
        print(f" image: {args.image}")
        print(f" hint: {args.hint}")

    else:
        parser.print_help()