""" canonical_normalizer.py — translate Arabic / dialect motion phrasings to canonical English form before regex matching. Runs in the dispatcher pipeline AFTER normalise_numbers (which handles spelled-out numbers in both languages) and BEFORE the parametric regex scan. The output is English-shaped text the existing English parametric regexes can match — replacing the previous approach of maintaining 14+ dialect-specific Arabic regexes per command type. Pipeline: raw chunk ──► _STATE_ECHO_RE strip ──► _QUOTED_RE strip ──► _QUESTION_RE strip ──► normalise_numbers ──► to_canonical_english ←──── HERE ──► English regex scan ──► dispatch Input/output examples: أمشي خطوة واحدة. → walk 1 step. أمشي خطوتين. → walk 2 steps. لف يسار 90 درجة. → turn left 90 degree. أستدير يميناً 100 درجة. → turn right 100 degree. أمشي للأمام 5 خطوات. → walk forward 5 steps. أمشي للخلف 3 خطوات. → walk backward 3 steps. أرجع 3 خطوات. → walk backward 3 steps. أتي إليك. → (unchanged — bot_phrase fixed canonical) Compound chains (مع conjunctions) are translated in place: أمشي للخلف 3 خطوات، ثم أستدير يميناً 90 درجة. → walk backward 3 steps, then turn right 90 degree. Idempotent for already-English text: Walking forward 5 steps. → Walking forward 5 steps. Ambiguity note: tokens like 'left' / 'right' have surface ambiguity ('I left the room' vs 'turn left'). The current normalizer uses context-free token substitution and accepts that risk — same behaviour as today's regex layer. A future memory + LiDAR-aware disambiguation hook (using prior dialogue context and current spatial state) belongs ABOVE this layer; see TODO at the bottom. """ from __future__ import annotations import re # ─── Vocabulary tables loaded from Config/language_tables.json ──── # # Single source of truth for verb / direction / unit / dual / conj / # connective dictionaries. Adding a new dialect = JSON edit. See # Voice/_language_tables.py for the loader and flattening helpers. 
from Voice._language_tables import (
    flat_arabic_verbs,
    flat_arabic_directions,
    flat_arabic_units,
    flat_arabic_duals,
    flat_arabic_conjunctions,
    flat_arabic_connectives,
)

# Verb roots: Arabic → English gerund. Loaded once at module import.
_AR_VERB = flat_arabic_verbs()
# Direction words: Arabic → English direction.
_AR_DIR = flat_arabic_directions()
# Units: Arabic → English unit (singular/plural preserved).
_AR_UNIT = flat_arabic_units()
# Dual forms: single Arabic word → 'N units' English target.
_AR_DUAL = flat_arabic_duals()
# Conjunctions for compound chains: Arabic → space-padded English.
_AR_CONJ = flat_arabic_conjunctions()
# Connectives (prepositions / determiners): Arabic → English.
_AR_CONNECTIVES = flat_arabic_connectives()

# Any code point in the Arabic block (U+0600-U+06FF); used for the
# "nothing to translate" early exit.
_HAS_ARABIC_RE = re.compile(r"[\u0600-\u06FF]")

# In-word characters: Arabic letters (U+0621-U+064A) plus diacritics
# (U+064B-U+065F). Anything outside this class is a token boundary —
# whitespace, ASCII, and Arabic punctuation such as '،' / '؟' / '؛'.
_AR_INWORD_CLASS = r"[\u0621-\u065F]"

# Compiled boundary pattern per Arabic token, filled lazily. The tables
# are fixed after import, so each token is compiled at most once instead
# of once per call.
_AR_TOKEN_RE_CACHE = {}

# Spin-in-place idioms ('حول نفسي', 'على نفسك', 'بنفسي', ...).
_SPIN_PHRASE_RE = re.compile(
    r"\s*(?:حوال?ين|حول|على)\s+(?:نفسي|نفسك|نفسه|نفسها|نفسنا)\b"
)
_SPIN_PREFIX_RE = re.compile(r"\s*ب(?:نفسي|نفسك|نفسه|نفسها|نفسنا)\b")

# Repeated adjacent direction tokens: 'backward backward' → 'backward'.
_DOUBLE_DIR_RE = re.compile(
    r"\b(forward|backward|back|left|right|around)(?:\s+\1)+\b",
    re.IGNORECASE,
)

# 'turn(ing) backward' is not meaningful motion vocabulary.
_TURN_BACKWARD_RE = re.compile(r"\bturn(?:ing)?\s+backward\b", re.IGNORECASE)

# '<verb> [direction]? <unit> <number>' — unit and number get swapped.
_WORD_ORDER_RE = re.compile(
    r"\b(walk(?:ing)?|turn(?:ing)?|mov(?:e|ing))"
    r"(\s+(?:forward|backward|back|left|right|around))?"
    r"\s+(steps?|meters?|degrees?)"
    r"\s+(\d+(?:\.\d+)?)",
    re.IGNORECASE,
)


def _ar_token_pattern(ar):
    """Return the cached compiled pattern matching `ar` as a whole token."""
    compiled = _AR_TOKEN_RE_CACHE.get(ar)
    if compiled is None:
        compiled = re.compile(
            r"(?<!" + _AR_INWORD_CLASS + r")"
            + re.escape(ar)
            + r"(?!" + _AR_INWORD_CLASS + r")"
        )
        _AR_TOKEN_RE_CACHE[ar] = compiled
    return compiled


def _ar_word_sub(text: str, mapping: dict) -> str:
    """Substitute Arabic tokens from `mapping` keys with their English values.

    A key only matches as a whole token: it must not be preceded or
    followed by an Arabic letter or diacritic (U+0621-U+065F). The full
    Arabic block is deliberately NOT used for the boundary, because Arabic
    punctuation ('،' U+060C, '؟' U+061F, '؛' U+061B) lives inside that
    block and must count as a boundary so 'خطوات،' still substitutes.
    Diacritics count as in-word so 'البَاب' (with fatha) isn't split.

    Keys are applied longest-first so a key is tried before any shorter
    key contained in it (e.g. 'ألف' before its substring 'لف').

    Args:
        text: Input string, possibly mixing Arabic and English.
        mapping: Arabic token → English replacement.

    Returns:
        `text` with every whole-token occurrence of a key replaced.
    """
    # NOTE(review): the tail of this function was lost in extraction and
    # is reconstructed from the docstring contract. The value is inserted
    # verbatim (no added padding) — confirm against the original file.
    for ar in sorted(mapping, key=len, reverse=True):
        if not ar:
            # An empty key would match at every boundary position.
            continue
        en = mapping[ar]
        # Function replacement: the value is inserted literally, so a
        # backslash in a table value is never read as a group reference.
        text = _ar_token_pattern(ar).sub(lambda _m, _en=en: _en, text)
    return text


def to_canonical_english(text: str) -> str:
    """Translate Arabic / dialect motion phrasings to English canonical form.

    The output is meant for the existing English parametric regex layer.

    Order of operations:
      1. Spin-in-place idioms ('حول نفسي', 'على نفسك', 'بنفسي', ...)
         → ' around'.
      2. DUAL substitution (single-word counts: خطوتين → 2 steps).
      3. CONJ substitution (ثم → then) — before verbs so a conjunction
         between two motions is already English for the compound parser.
      4. VERB substitution (أمشي → walk, لف → turn, ...).
      5. DIR substitution (يمين → right, ...).
      6. UNIT substitution (خطوة → step, متر → meter, ...).
      7. CONNECTIVES (إلى → to, etc.).
      8. Arabic comma → ASCII comma (preserves compound parsing).
      9. Double-direction collapse, 'turn backward' → 'turning around',
         then a second collapse.
     10. Word-order fix: 'walk step 1' → 'walk 1 step',
         'walk forward step 1' → 'walk forward 1 step'.
     11. Whitespace collapse.

    Args:
        text: One chunk of text, after number normalisation.

    Returns:
        The translated text. Empty input and input containing no Arabic
        characters are returned unchanged.
    """
    if not text or not _HAS_ARABIC_RE.search(text):
        return text

    # 1. Spin-in-place idioms. Field case: user says 'لف حول نفسك' →
    # Gemini replies 'أستدير حول نفسي' → must end up as 'turning around'
    # so the turn_around canonical regex catches it; otherwise the bot
    # phrase doesn't dispatch and the robot stands still.
    # This pass runs BEFORE token substitution because it matches the
    # Arabic prepositions themselves; once a table has translated them
    # the idiom can no longer be recognised.
    # NOTE(review): presumably 'على' is in the connectives table (the
    # smoke check expects 'لف على اليمين' → 'turn on right') — confirm.
    text = _SPIN_PHRASE_RE.sub(" around", text)
    text = _SPIN_PREFIX_RE.sub(" around", text)

    # 2-7. Token tables, in a fixed order: duals first because a single
    # word carries both count and unit ('خطوتين' = '2 steps'), then
    # conjunctions, verbs, directions, units, connectives.
    for table in (
        _AR_DUAL,
        _AR_CONJ,
        _AR_VERB,
        _AR_DIR,
        _AR_UNIT,
        _AR_CONNECTIVES,
    ):
        text = _ar_word_sub(text, table)

    # 8. Arabic comma → ASCII comma (so the English regex's compound
    # detection sees the boundary).
    text = text.replace("،", ",")

    # 9. Direction clean-up. A verb that already implies a direction
    # ('أرجع' = walking backward) plus an explicit direction ('للخلف')
    # yields 'walking backward backward 2 steps'; collapse the repeat.
    # Field-observed cases:
    #   'أرجع للخلف خطوتين' → 'walking backward backward 2 steps' →
    #                          'walking backward 2 steps'
    #   'أتقدم للأمام خطوة' → 'walking forward forward 1 step' →
    #                          'walking forward 1 step'
    # This must run before the word-order fix, whose optional direction
    # group accepts only ONE direction token.
    text = _DOUBLE_DIR_RE.sub(r"\1", text)
    # 'turning backward' (from 'استدر للخلف' / 'لف ورا') → 'turning
    # around', the canonical 180° rotation phrase.
    text = _TURN_BACKWARD_RE.sub("turning around", text)
    # The rewrite above can itself create 'around around'.
    text = _DOUBLE_DIR_RE.sub(r"\1", text)

    # 10. Word-order fix. Token substitution can leave the unit before
    # the number ('walk [direction]? step 1'); the English regex expects
    # the number first, so swap them.
    text = _WORD_ORDER_RE.sub(r"\1\2 \4 \3", text)

    # 11. Collapse extra whitespace introduced by substitutions.
    return re.sub(r"\s+", " ", text).strip()


# ────────────────────────────────────────────────────────────────
# TODO (future) — AMBIGUITY DISAMBIGUATION HOOK
#
# 'left' / 'right' / 'forward' have surface ambiguity (direction vs
# past-tense / surname / general adverb). When LiDAR + dialogue memory
# are wired in, a higher-level resolver should sit ABOVE this module
# and decide: is the current chunk a motion intent or descriptive
# language?
# Inputs to that resolver could include:
#   - command_history.json (recent commands — context for repeat/follow)
#   - LiDAR snapshot (is the path clear? is something blocking?)
#   - prior 1-2 Gemini turns (what was discussed? still on motion topic?)
#   - mic energy + duration (long quiet vs decisive utterance)
# The resolver gates whether to call to_canonical_english at all OR
# returns the original text unchanged when context says 'this is chat,
# not motion'.
#
# Current behaviour: context-free token substitution. Same risk profile
# as today's regex layer.
# ────────────────────────────────────────────────────────────────


# Standalone smoke check: runs each sample through number normalisation
# and then the canonical normalizer, and prints a pass/fail line per case.
if __name__ == "__main__":
    # Number normalisation runs first, mirroring the production pipeline.
    from Voice.number_words import normalise_numbers as _nn

    def pipeline(text):
        """Normalise numbers, then translate to canonical English."""
        return to_canonical_english(_nn(text))

    # (input, expected output) pairs.
    cases = [
        ("أمشي خطوة واحدة.", "walk 1 step."),
        ("أمشي خطوتين.", "walk 2 steps."),
        ("لف يسار 90 درجة.", "turn left 90 degree."),
        ("أستدير يميناً 100 درجة.", "turn right 100 degree."),
        ("أمشي للأمام 5 خطوات.", "walk forward 5 steps."),
        ("أمشي للخلف 3 خطوات.", "walk backward 3 steps."),
        ("أرجع 3 خطوات.", "walk backward 3 steps."),
        ("أمشي خطوة 1.", "walk 1 step."),
        ("أستدير يساراً 1 خطوات.", "turn left 1 steps."),
        (
            "أمشي للخلف 3 خطوات، ثم أستدير يميناً 90 درجة.",
            "walk backward 3 steps, then turn right 90 degree.",
        ),
        ("Walking forward 5 steps.", "Walking forward 5 steps."),
        ("Turning right 90 degrees.", "Turning right 90 degrees."),
        ("لف على اليمين.", "turn on right."),
        # Spelled-out + structural together
        ("أستدير يساراً تسعين درجة.", "turn left 90 degree."),
        ("أمشي خمس خطوات.", "walk 5 steps."),
        # Compound chain
        ("أمشي خطوة واحدة، ثم أستدير يميناً.", "walk 1 step, then turn right."),
    ]

    passed = 0
    for sample, wanted in cases:
        actual = pipeline(sample)
        if actual == wanted:
            passed += 1
            print(f" ✓ {sample!r:55s}")
            print(f" → {actual!r}")
        else:
            print(f" ✗ {sample!r:55s}")
            print(f" → {actual!r}")
            print(f" expected: {wanted!r}")

    # Every case is counted exactly once, so the total is len(cases).
    print(f"\n{passed}/{len(cases)} passed")