# Sanad_lite/voice/text_utils.py

"""Arabic text normalization and voice-command phrase matching.

Ported from gemini_interact/sanad_text_utils.py — unified for Sanad.
"""

from __future__ import annotations

import re
from pathlib import Path
from typing import Any

# Arabic diacritics (tashkeel) — stripped for matching.
_DIACRITICS_RE = re.compile(r"[\u0617-\u061A\u064B-\u0652\u0670\u06D6-\u06ED]")
# Arabic question mark, comma and semicolon — turned into word separators.
_AR_PUNCT = re.compile(r"[؟،؛]")
# Anything that is not a word character, a code point in U+0600–U+06FF,
# or whitespace.
_NON_WORD = re.compile(r"[^\w\u0600-\u06FF\s]", re.UNICODE)
# A run of one or more whitespace characters.
_MULTI_WS = re.compile(r"\s+")
# Single-pass letter folding applied by normalize_arabic.
_CHAR_FOLD = str.maketrans(
    {
        # Hamza variants → bare alif
        "\u0623": "\u0627",  # أ → ا
        "\u0625": "\u0627",  # إ → ا
        "\u0622": "\u0627",  # آ → ا
        # Ta marbuta / alif maqsoora
        "\u0629": "\u0647",  # ة → ه
        "\u0649": "\u064A",  # ى → ي
        # Tatweel (kashida) is dropped entirely
        "\u0640": None,
    }
)


def normalize_arabic(text: str) -> str:
    """Normalize Arabic + English text for matching.

    Steps, in order:
      1. lowercase;
      2. replace Arabic punctuation and every other non-word character
         with a space;
      3. fold hamza-carrying alifs to bare alif, ta marbuta to ha,
         alif maqsoora to ya, and drop tatweel;
      4. strip diacritics;
      5. collapse whitespace runs to a single space and trim both ends.

    Whitespace is collapsed *after* characters are removed so that a
    stand-alone tatweel or diacritic between two words cannot leave a
    double space in the result.

    Args:
        text: Raw input text.

    Returns:
        The normalized string; empty if nothing matchable remains.
    """
    s = text.lower()
    s = _AR_PUNCT.sub(" ", s)
    s = _NON_WORD.sub(" ", s)
    s = s.translate(_CHAR_FOLD)
    s = _DIACRITICS_RE.sub("", s)
    # Collapse last: the removals above may have made separate spaces adjacent.
    return _MULTI_WS.sub(" ", s).strip()


def strip_diacritics(text: str) -> str:
    """Return *text* with every character matched by ``_DIACRITICS_RE`` removed.

    No other normalization (case, punctuation, letter folding) is applied.
    """
    without_marks = re.sub(_DIACRITICS_RE, "", text)
    return without_marks


# Section header, e.g. ``WAKE_PHRASES_shake_hand = {``; group 1 is the command name.
_PHRASE_HEADER_RE = re.compile(r"WAKE_PHRASES_(\w+)\s*=\s*\{")
# One token inside a section: a double-quoted phrase (group 1), or a bare
# ``#`` / ``}`` found outside of quotes (group 2).
_PHRASE_TOKEN_RE = re.compile(r'"([^"]*)"|([#}])')


def load_phrase_map(filepath: str | Path) -> dict[str, set[str]]:
    """Load a phrase file mapping command names to trigger phrases.

    Format (per command):
        WAKE_PHRASES_shake_hand = {
            "مصافحه", "handshake", "shake hands",
        }

    Any number of double-quoted phrases may appear on one line, including
    on the header line itself. A ``#`` outside quotes starts a comment that
    runs to the end of the line, and a ``}`` outside quotes closes the
    section. Blank lines and lines starting with ``#`` are skipped.

    Every phrase is passed through ``normalize_arabic``; phrases that
    normalize to an empty string are dropped, and a section that ends up
    with no phrases is omitted from the result. If a command name appears
    in more than one non-empty section, the last one wins.

    Args:
        filepath: Path to the phrase file.

    Returns:
        {"shake_hand": {"مصافحه", "handshake", ...}, ...}, or an empty dict
        when the file does not exist.
    """
    path = Path(filepath)
    if not path.exists():
        return {}

    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise stop the first header from matching.
    content = path.read_text(encoding="utf-8-sig")
    result: dict[str, set[str]] = {}
    current_name: str | None = None
    current_phrases: set[str] = set()

    for raw_line in content.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue

        header_match = _PHRASE_HEADER_RE.match(line)
        if header_match:
            # A new header implicitly closes a section that was never terminated.
            if current_name and current_phrases:
                result[current_name] = current_phrases
            current_name = header_match.group(1)
            current_phrases = set()
            # Keep scanning: phrases and the closing brace may share this line.
            line = line[header_match.end():]

        for token in _PHRASE_TOKEN_RE.finditer(line):
            marker = token.group(2)
            if marker == "#":
                # Trailing comment: ignore the rest of the line.
                break
            if marker == "}":
                if current_name and current_phrases:
                    result[current_name] = current_phrases
                current_name = None
                current_phrases = set()
                break
            if current_name is None:
                # Quoted text outside any section is not a trigger phrase.
                continue
            phrase = normalize_arabic(token.group(1))
            if phrase:
                current_phrases.add(phrase)

    # End of file reached while a section was still open.
    if current_name and current_phrases:
        result[current_name] = current_phrases

    return result


def match_phrase(text: str, phrase_sets: dict[str, set[str]]) -> str | None:
    """Pick the command whose trigger phrase best matches *text*.

    *text* is normalized with ``normalize_arabic`` and split into a set of
    whole words. A phrase matches when each of its whitespace-separated
    words is in that set, so word order and adjacency are ignored and a
    short phrase such as 'hi' cannot match inside a longer word such as
    'this'. Phrases are used as given; they are not normalized here.

    Among all matching phrases the one with the most characters wins; on a
    tie the first one met while iterating *phrase_sets* is kept.

    Returns:
        The winning command name, or None when *text* normalizes to nothing
        or no phrase matches.
    """
    words = frozenset(normalize_arabic(text).split())
    if not words:
        return None

    def _hits():
        # Yield (phrase length, command) for every phrase fully covered by the text.
        for name, candidates in phrase_sets.items():
            for candidate in candidates:
                parts = candidate.split()
                if parts and words.issuperset(parts):
                    yield len(candidate), name

    # max() keeps the first of several equally long hits.
    top = max(_hits(), key=lambda hit: hit[0], default=None)
    return None if top is None else top[1]