"""Arabic text normalization and voice-command phrase matching. Ported from gemini_interact/sanad_text_utils.py — unified for Sanad. """ from __future__ import annotations import re from pathlib import Path from typing import Any # Arabic diacritics (tashkeel) — stripped for matching. _DIACRITICS_RE = re.compile(r"[\u0617-\u061A\u064B-\u0652\u0670\u06D6-\u06ED]") _AR_PUNCT = re.compile(r"[؟،؛]") _NON_WORD = re.compile(r"[^\w\u0600-\u06FF\s]", re.UNICODE) _MULTI_WS = re.compile(r"\s+") def normalize_arabic(text: str) -> str: """Normalize Arabic + English text for matching.""" s = text.strip().lower() s = _AR_PUNCT.sub(" ", s) s = _NON_WORD.sub(" ", s) s = _MULTI_WS.sub(" ", s) # Hamza variants → bare alif s = s.replace("\u0623", "\u0627") # أ → ا s = s.replace("\u0625", "\u0627") # إ → ا s = s.replace("\u0622", "\u0627") # آ → ا # Ta marbuta / alif maqsoora s = s.replace("\u0629", "\u0647") # ة → ه s = s.replace("\u0649", "\u064A") # ى → ي # Tatweel s = s.replace("\u0640", "") # Strip diacritics last s = _DIACRITICS_RE.sub("", s) return s.strip() def strip_diacritics(text: str) -> str: return _DIACRITICS_RE.sub("", text) def load_phrase_map(filepath: str | Path) -> dict[str, set[str]]: """Load a phrase file mapping command names to trigger phrases. Format (per command): WAKE_PHRASES_shake_hand = { "مصافحه", "handshake", "shake hands", } Returns: {"shake_hand": {"مصافحه", "handshake", ...}, ...} """ path = Path(filepath) if not path.exists(): return {} content = path.read_text(encoding="utf-8") result: dict[str, set[str]] = {} current_name: str | None = None current_phrases: set[str] = set() for raw_line in content.splitlines(): line = raw_line.strip() if not line or line.startswith("#"): continue # Header: WAKE_PHRASES_shake_hand = { header_match = re.match(r"WAKE_PHRASES_(\w+)\s*=\s*\{", line) if header_match: if current_name and current_phrases: result[current_name] = current_phrases current_name = header_match.group(1) current_phrases = set() continue # Closing brace if line == "}": if current_name and current_phrases: result[current_name] = current_phrases current_name = None current_phrases = set() continue # Phrase line: "some phrase", phrase_match = re.match(r'"([^"]+)"', line) if phrase_match and current_name is not None: phrase = normalize_arabic(phrase_match.group(1)) if phrase: current_phrases.add(phrase) if current_name and current_phrases: result[current_name] = current_phrases return result def match_phrase(text: str, phrase_sets: dict[str, set[str]]) -> str | None: """Return the command name if normalized *text* matches any phrase set. Token-set matching: every word of the phrase must appear as a whole word in *text*. Prevents short phrases (e.g. 'hi') from matching longer words (e.g. 'this'). """ norm = normalize_arabic(text) if not norm: return None text_tokens = set(norm.split()) if not text_tokens: return None best_command: str | None = None best_len = 0 for command_name, phrases in phrase_sets.items(): for phrase in phrases: phrase_tokens = phrase.split() if not phrase_tokens: continue if all(t in text_tokens for t in phrase_tokens): if len(phrase) > best_len: best_command = command_name best_len = len(phrase) return best_command # ────────────────────── stateful ASR-buffer matcher ────────────────────── # Port of gemini_interact/sanad_text_utils.py:_maybe_trigger_arm # # Why stateful: Gemini streams short ASR pieces like "مر", "حب", "ا" that # need to be joined across ~2 s to match "مرحبا". This matcher buffers # incoming transcript pieces, dedups repeats, and fires when any phrase # in the wake set is found. import time import asyncio import threading _YA_PREFIX_RE = re.compile(r"^يا\s*") def _strip_ya_prefix(s: str) -> str: s = (s or "").strip() return _YA_PREFIX_RE.sub("", s).strip() def _remove_al_prefix_words(text: str) -> str: if not text: return "" out = [] for w in text.split(): if w.startswith("ال") and len(w) > 2: out.append(w[2:]) else: out.append(w) return " ".join(out).strip() def _is_valid_text(s: str) -> bool: has_ar = bool(re.search(r"[\u0600-\u06FF]", s or "")) has_en = bool(re.search(r"[a-zA-Z]", s or "")) return has_ar or has_en def maybe_trigger_arm( state: Any, transcript_text: str, wake_phrases: set[str], *, fire_on_wake_match: bool = True, arm_trigger_fn=None, ) -> bool: """Buffer-aware wake-phrase matcher. `state` is any object — attributes are lazily initialized on first use. Suitable targets: a session dataclass, or even a plain `types.SimpleNamespace`. On match: - Clears ASR buffer to avoid re-trigger on next chunk - If fire_on_wake_match: runs arm_trigger_fn in a background thread immediately (wrapped in asyncio.to_thread if in a loop, else threading.Thread) - If not fire_on_wake_match: marks _pending_arm_wave=True so the caller can fire it on turn_complete Returns True if a phrase fired, False otherwise. """ if not transcript_text or not wake_phrases: return False # ── lazy state init ──────────────────────────────────────── for attr, default in ( ("_asr_buf", ""), ("_asr_last_time", 0.0), ("ASR_WINDOW_SEC", 2.0), ("ASR_SHORT_TOKEN_BONUS_SEC", 1.0), ("ASR_JOIN_NO_SPACE_MAXLEN", 2), ("ASR_MAX_CHARS", 120), ("_last_trigger_norm", ""), ("_last_trigger_time", 0.0), ("TRIGGER_DEDUP_WINDOW", 2.0), ("_pending_arm_wave", False), ("_pending_arm_wave_fired", False), ("_pending_arm_wave_set_time", 0.0), ("PENDING_ARM_TTL", 6.0), ("_pending_arm_trigger_fn", None), ("_pending_arm_fallback_time", 0.0), ("_last_piece_call_norm", ""), ("_last_piece_call_time", 0.0), ("_asr_stream", ""), ("ASR_STREAM_MAX_CHARS", 80), ): if not hasattr(state, attr): setattr(state, attr, default) dup_call_window = float(getattr(state, "DUP_CALL_WINDOW_SEC", 0.25)) dup_asr_repeat_window = float(getattr(state, "DUP_ASR_REPEAT_WINDOW_SEC", 0.9)) pending_fallback_sec = float(getattr(state, "PENDING_ARM_FALLBACK_SEC", 0.65)) piece_raw = transcript_text.strip() if not piece_raw: return False piece_norm = normalize_arabic(piece_raw) if not piece_norm or not _is_valid_text(piece_norm): return False now = time.time() duplicate_call = ( piece_norm == state._last_piece_call_norm and (now - state._last_piece_call_time) < dup_call_window ) repeated_asr = ( piece_norm == state._last_piece_call_norm and (now - state._last_piece_call_time) < dup_asr_repeat_window ) state._last_piece_call_norm = piece_norm state._last_piece_call_time = now # Buffer update if not duplicate_call and not repeated_asr: if state._asr_last_time: gap = now - state._asr_last_time window = state.ASR_WINDOW_SEC if len(piece_norm) <= state.ASR_JOIN_NO_SPACE_MAXLEN: window += state.ASR_SHORT_TOKEN_BONUS_SEC if gap > window: state._asr_buf = "" state._asr_stream = "" state._asr_last_time = now # Join logic — no-space for very short pieces if state._asr_buf: if len(piece_norm) <= state.ASR_JOIN_NO_SPACE_MAXLEN: state._asr_buf = (state._asr_buf + piece_norm).strip() else: state._asr_buf = (state._asr_buf + " " + piece_norm).strip() else: state._asr_buf = piece_norm compact = piece_norm.replace(" ", "") state._asr_stream = (state._asr_stream + compact)[-state.ASR_STREAM_MAX_CHARS:] if len(state._asr_buf) > state.ASR_MAX_CHARS: state._asr_buf = state._asr_buf[-state.ASR_MAX_CHARS:] buf_norm = normalize_arabic(state._asr_buf) buf_nospace = buf_norm.replace(" ", "") buf_noal = _remove_al_prefix_words(buf_norm) buf_noal_nospace = buf_noal.replace(" ", "") stream = normalize_arabic(state._asr_stream).replace(" ", "") stream_noal = _remove_al_prefix_words(stream) # Dedup — don't fire same buffer twice within TRIGGER_DEDUP_WINDOW if (buf_norm == state._last_trigger_norm and (now - state._last_trigger_time) < state.TRIGGER_DEDUP_WINDOW): return False # Match loop for phrase in wake_phrases: p_norm = _strip_ya_prefix(normalize_arabic(str(phrase))) if not p_norm: continue p_nospace = p_norm.replace(" ", "") p_noal = _remove_al_prefix_words(p_norm) p_noal_nospace = p_noal.replace(" ", "") pattern = r"\b" + re.escape(p_norm) + r"\b" hit_buf = bool(re.search(pattern, buf_norm)) \ or (p_nospace and p_nospace == buf_nospace) \ or (p_noal and (p_noal in buf_noal or (p_noal_nospace and p_noal_nospace in buf_noal_nospace))) hit_stream = bool(p_nospace and p_nospace in stream) \ or bool(p_noal_nospace and p_noal_nospace in stream_noal) if hit_buf or hit_stream: state._last_trigger_norm = buf_norm state._last_trigger_time = now state._asr_buf = "" state._asr_last_time = 0.0 state._asr_stream = "" if fire_on_wake_match: if arm_trigger_fn: _fire_arm_trigger(arm_trigger_fn) state._pending_arm_wave = False state._pending_arm_wave_fired = False state._pending_arm_wave_set_time = 0.0 state._pending_arm_trigger_fn = None state._pending_arm_fallback_time = 0.0 else: state._pending_arm_wave = True state._pending_arm_wave_fired = False state._pending_arm_wave_set_time = now state._pending_arm_trigger_fn = arm_trigger_fn state._pending_arm_fallback_time = now + pending_fallback_sec return True return False def _fire_arm_trigger(fn) -> None: """Run the arm trigger callback in a background thread, regardless of whether we're inside an asyncio loop.""" try: asyncio.get_running_loop() asyncio.create_task(asyncio.to_thread(fn)) except RuntimeError: threading.Thread(target=fn, daemon=True).start() def load_arm_phrase_dispatch( sanad_arm_txt: str | Path, option_list: list, ) -> dict[int, set[str]]: """Build {action_id: set_of_phrases} from sanad_arm.txt × OPTION_LIST. Each OPTION has .id and .name. The sanad_arm.txt file defines WAKE_PHRASES_. """ phrase_map = load_phrase_map(sanad_arm_txt) # {name_var: set[phrase]} dispatch: dict[int, set[str]] = {} for opt in option_list: var = opt.name.replace(" ", "_").replace("-", "_") phrases = phrase_map.get(var) if phrases: dispatch[opt.id] = phrases return dispatch