# Marcus/Voice/canonical_normalizer.py

"""
canonical_normalizer.py — translate Arabic / dialect motion phrasings
to canonical English form before regex matching.

Runs in the dispatcher pipeline AFTER normalise_numbers (which handles
spelled-out numbers in both languages) and BEFORE the parametric regex
scan. The output is English-shaped text the existing English parametric
regexes can match — replacing the previous approach of maintaining 14+
dialect-specific Arabic regexes per command type.

Pipeline:

    raw chunk  ──►  _STATE_ECHO_RE strip  ──►  _QUOTED_RE strip
              ──►  _QUESTION_RE strip   ──►  normalise_numbers
              ──►  to_canonical_english  ←──── HERE
              ──►  English regex scan  ──►  dispatch

Input/output examples:

    أمشي خطوة واحدة.            → walk 1 step.
    أمشي خطوتين.                 → walk 2 steps.
    لف يسار 90 درجة.             → turn left 90 degree.
    أستدير يميناً 100 درجة.      → turn right 100 degree.
    أمشي للأمام 5 خطوات.         → walk forward 5 steps.
    أمشي للخلف 3 خطوات.          → walk backward 3 steps.
    أرجع 3 خطوات.                → walk backward 3 steps.
    أتي إليك.                    → (unchanged — bot_phrase fixed canonical)

Compound chains (with conjunctions) are translated in place:

    أمشي للخلف 3 خطوات، ثم أستدير يميناً 90 درجة.
        → walk backward 3 steps, then turn right 90 degree.

Idempotent for already-English text:

    Walking forward 5 steps.    → Walking forward 5 steps.

Ambiguity note: tokens like 'left' / 'right' have surface ambiguity
('I left the room' vs 'turn left'). The current normalizer uses
context-free token substitution and accepts that risk — same
behaviour as today's regex layer. A future memory + LiDAR-aware
disambiguation hook (using prior dialogue context and current
spatial state) belongs ABOVE this layer; see TODO at the bottom.
"""
from __future__ import annotations

import re

# ─── Vocabulary tables loaded from Config/language_tables.json ────
#
# Single source of truth for verb / direction / unit / dual / conj /
# connective dictionaries. Adding a new dialect = JSON edit. See
# Voice/_language_tables.py for the loader and flattening helpers.
from Voice._language_tables import (
    flat_arabic_verbs, flat_arabic_directions, flat_arabic_units,
    flat_arabic_duals, flat_arabic_conjunctions, flat_arabic_connectives,
)


# Each table below is built once, at module import, by the matching
# flattening helper from Voice._language_tables, and is consumed by
# _ar_word_sub() inside to_canonical_english(). Keys are the Arabic
# surface forms to look for; values are the English text spliced in.

# Verb roots: Arabic → English gerund. Loaded once at module import.
_AR_VERB = flat_arabic_verbs()
# Direction words: Arabic → English direction.
_AR_DIR = flat_arabic_directions()
# Units: Arabic → English unit (singular/plural preserved).
_AR_UNIT = flat_arabic_units()
# Dual forms: single Arabic word → 'N units' English target.
_AR_DUAL = flat_arabic_duals()
# Conjunctions for compound chains: Arabic → space-padded English.
_AR_CONJ = flat_arabic_conjunctions()
# Connectives (prepositions / determiners): Arabic → English.
_AR_CONNECTIVES = flat_arabic_connectives()


# Cheap "is there anything to translate?" gate: matches one character
# from the range in the class below (presumably the whole Arabic block,
# U+0600-U+06FF — confirm). to_canonical_english() returns its input
# untouched when this finds no match.
_HAS_ARABIC_RE = re.compile(r"[؀-ۿ]")


def _ar_word_sub(text: str, mapping: dict) -> str:
    """Substitute Arabic tokens from `mapping` keys to their English
    values. Uses Arabic-LETTER boundary on both sides (NOT the full
    Arabic block — Arabic punctuation '،' / '؟' / '؛' is U+060C/061F/061B
    and is INSIDE the Arabic block but should be treated as a boundary
    so 'خطوات،' substitutes correctly). Letter range is U+0621-U+064A
    (ء to ي); also include diacritics U+064B-U+065F as in-word so
    'البَاب' (with fatha) isn't split.
    Longest-first ordering resolves substring conflicts (e.g. match
    'ألف' before its substring 'لف')."""
    keys = sorted(mapping.keys(), key=len, reverse=True)
    for ar in keys:
        en = mapping[ar]
        # In-word characters: Arabic letters + diacritics. Outside this
        # class is a boundary (whitespace, ASCII, Arabic punctuation).
        pattern = (
            r"(?<![ء-يً-ٟ])"
            + re.escape(ar)
            + r"(?![ء-يً-ٟ])"
        )
        text = re.sub(pattern, en, text)
    return text


def to_canonical_english(text: str) -> str:
    """Translate Arabic / dialect structural motion phrasings to
    English canonical form so the existing English parametric regex
    layer can match them.

    Args:
        text: One dispatcher chunk, already passed through number
            normalisation by the caller.

    Returns:
        The translated chunk. Input that is empty or contains no
        character matched by _HAS_ARABIC_RE is returned unchanged
        (not even whitespace-collapsed), which keeps the function
        idempotent for pure-English text.

    Order of operations:
      1. Spin-in-place idiom ('حول نفسي', 'على نفسك', 'بنفسي', ...
         → ' around'). Runs BEFORE any table substitution so the
         vocabulary passes cannot rewrite 'على' / 'حول' on their own
         and leave the idiom unmatchable.
      2. DUAL substitution (single-word counts: خطوتين → 2 steps)
      3. CONJ substitution (ثم → then) — done early so subsequent
         passes don't accidentally treat conjunctions as words.
      4. VERB substitution (أمشي → walk, لف → turn, ...)
      5. DIR substitution (يمين → right, ...)
      6. UNIT substitution (خطوة → step, متر → meter, ...)
      7. CONNECTIVES (إلى → to, etc.)
      8. Arabic comma → English comma (preserves compound parsing)
      9. Word-order fix: 'walk step 1' → 'walk 1 step'
                         'walk forward step 1' → 'walk forward 1 step'
     10. 'turn(ing) backward' → 'turning around'
     11. Repeated adjacent direction collapse
         ('backward backward' → 'backward')
     12. Whitespace collapse.
    """
    if not text or not _HAS_ARABIC_RE.search(text):
        return text

    # 1. Spin-in-place idiom — Arabic phrases for turning around oneself
    # (`حول نفسي`, `حول نفسك`, `حوالين نفسي`, `حوالين نفسك`, `على نفسي`,
    # `على نفسك`, `بنفسي`, `بنفسك`) all mean 360°/180° self-rotation.
    # Translated to ' around' so the turn_around canonical regex
    # catches the result.
    # Field case: user says 'لف حول نفسك' → Gemini replies 'أستدير حول
    # نفسي' → this pass = 'أستدير around' → after verb sub = 'turning
    # around' → dispatch turn_around.
    # This has to happen before the table passes: they translate 'على'
    # by itself (the smoke case 'لف على اليمين' comes out as
    # 'turn on right'), after which 'على نفسك' can no longer be
    # recognised. Running first also lets the direction collapse below
    # clean up a resulting 'around around'.
    text = re.sub(
        r"\s*(?:حوال?ين|حول|على)\s+(?:نفسي|نفسك|نفسه|نفسها|نفسنا)\b",
        " around",
        text,
    )
    text = re.sub(r"\s*ب(?:نفسي|نفسك|نفسه|نفسها|نفسنا)\b", " around", text)

    # 2. Dual forms — these are single Arabic words that carry both
    # count and unit ('خطوتين' = '2 steps').
    text = _ar_word_sub(text, _AR_DUAL)

    # 3. Conjunctions — translate before verbs so 'ثم' between two
    # motions becomes ' then ' for the regex layer's compound parser.
    text = _ar_word_sub(text, _AR_CONJ)

    # 4. Verbs.
    text = _ar_word_sub(text, _AR_VERB)

    # 5. Directions.
    text = _ar_word_sub(text, _AR_DIR)

    # 6. Units.
    text = _ar_word_sub(text, _AR_UNIT)

    # 7. Connectives.
    text = _ar_word_sub(text, _AR_CONNECTIVES)

    # 8. Arabic comma → ASCII comma (so the English regex's compound
    # detection sees the boundary).
    text = text.replace("،", ",")

    # 9. Word-order fix. After token substitution we may have
    #     'walk [direction]? step 1' / 'walk [direction]? meter 2'
    # which has the unit before the number. The English regex expects
    # '[verb] [direction]? <number> <unit>'. Swap them.
    text = re.sub(
        r"\b(walk(?:ing)?|turn(?:ing)?|mov(?:e|ing))"
        r"(\s+(?:forward|backward|back|left|right|around))?"
        r"\s+(steps?|meters?|degrees?)"
        r"\s+(\d+(?:\.\d+)?)",
        r"\1\2 \4 \3",
        text,
        flags=re.IGNORECASE,
    )

    # 10. Idiomatic post-fix: 'turning backward' (which is what we get
    # from 'استدر للخلف' / 'لف ورا') should map to 'turning around'.
    # 'turning backward' isn't a meaningful phrase in English motion
    # vocab; the canonical is 'turn around' (180° rotation).
    text = re.sub(
        r"\bturn(?:ing)?\s+backward\b",
        "turning around",
        text,
        flags=re.IGNORECASE,
    )

    # 11. Double-direction collapse. After substitution we sometimes get
    # 'walking backward backward 2 steps' because the Arabic verb
    # ('أرجع' = walking-backward gerund) AND the explicit direction
    # ('للخلف'/'لورا' = backward) both translated to 'backward'. Collapse
    # repeated adjacent direction tokens so the parametric regex matches.
    # Field-observed cases:
    #   'أرجع للخلف خطوتين' → 'walking backward backward 2 steps' →
    #                         'walking backward 2 steps'
    #   'أتقدم للأمام خطوة'  → 'walking forward forward 1 step' →
    #                         'walking forward 1 step'
    text = re.sub(
        r"\b(forward|backward|back|left|right|around)(?:\s+\1)+\b",
        r"\1",
        text,
        flags=re.IGNORECASE,
    )

    # 12. Collapse extra whitespace introduced by substitutions.
    text = re.sub(r"\s+", " ", text).strip()

    return text


# ────────────────────────────────────────────────────────────────
# TODO (future) — AMBIGUITY DISAMBIGUATION HOOK
#
# 'left' / 'right' / 'forward' have surface ambiguity (direction vs
# past-tense / surname / general adverb). When LiDAR + dialogue memory
# are wired in, a higher-level resolver should sit ABOVE this module
# and decide: is the current chunk a motion intent or descriptive
# language? Inputs to that resolver could include:
#   - command_history.json (recent commands — context for repeat/follow)
#   - LiDAR snapshot (is the path clear? is something blocking?)
#   - prior 1-2 Gemini turns (what was discussed? still on motion topic?)
#   - mic energy + duration (long quiet vs decisive utterance)
# The resolver gates whether to call to_canonical_english at all OR
# returns the original text unchanged when context says 'this is chat,
# not motion'.
#
# Current behaviour: context-free token substitution. Same risk profile
# as today's regex layer.
# ────────────────────────────────────────────────────────────────


# Standalone smoke check: run the module directly to print a pass/fail
# line for each sample phrase plus a final tally.
if __name__ == "__main__":
    # Production normalises spelled-out numbers before this module sees
    # the text, so the smoke check chains the two in the same order.
    from Voice.number_words import normalise_numbers as _nn

    def pipeline(text):
        """Number normalisation followed by canonical translation."""
        numbered = _nn(text)
        return to_canonical_english(numbered)

    # (input phrase, expected canonical output)
    cases = (
        ("أمشي خطوة واحدة.", "walk 1 step."),
        ("أمشي خطوتين.", "walk 2 steps."),
        ("لف يسار 90 درجة.", "turn left 90 degree."),
        ("أستدير يميناً 100 درجة.", "turn right 100 degree."),
        ("أمشي للأمام 5 خطوات.", "walk forward 5 steps."),
        ("أمشي للخلف 3 خطوات.", "walk backward 3 steps."),
        ("أرجع 3 خطوات.", "walk backward 3 steps."),
        ("أمشي خطوة 1.", "walk 1 step."),
        ("أستدير يساراً 1 خطوات.", "turn left 1 steps."),
        (
            "أمشي للخلف 3 خطوات، ثم أستدير يميناً 90 درجة.",
            "walk backward 3 steps, then turn right 90 degree.",
        ),
        ("Walking forward 5 steps.", "Walking forward 5 steps."),
        ("Turning right 90 degrees.", "Turning right 90 degrees."),
        ("لف على اليمين.", "turn on right."),
        # Spelled-out + structural together
        ("أستدير يساراً تسعين درجة.", "turn left 90 degree."),
        ("أمشي خمس خطوات.", "walk 5 steps."),
        # Compound chain
        (
            "أمشي خطوة واحدة، ثم أستدير يميناً.",
            "walk 1 step, then turn right.",
        ),
    )

    passed = 0
    attempted = 0
    for phrase, wanted in cases:
        actual = pipeline(phrase)
        attempted += 1
        matched = actual == wanted
        if matched:
            passed += 1
        mark = "✓" if matched else "✗"
        print(f"  {mark} {phrase!r:55s}")
        print(f"      → {actual!r}")
        if not matched:
            print(f"      expected: {wanted!r}")
    print(f"\n{passed}/{attempted} passed")