# Marcus/Voice/canonical_normalizer.py
#
# 277 lines
# 12 KiB
# Python

"""
canonical_normalizer.py — translate Arabic / dialect motion phrasings
to canonical English form before regex matching.
Runs in the dispatcher pipeline AFTER normalise_numbers (which handles
spelled-out numbers in both languages) and BEFORE the parametric regex
scan. The output is English-shaped text the existing English parametric
regexes can match — replacing the previous approach of maintaining 14+
dialect-specific Arabic regexes per command type.
Pipeline:
raw chunk ──► _STATE_ECHO_RE strip ──► _QUOTED_RE strip
──► _QUESTION_RE strip ──► normalise_numbers
──► to_canonical_english ←──── HERE
──► English regex scan ──► dispatch
Input/output examples:
أمشي خطوة واحدة. → walk 1 step.
أمشي خطوتين. → walk 2 steps.
لف يسار 90 درجة. → turn left 90 degree.
أستدير يميناً 100 درجة. → turn right 100 degree.
أمشي للأمام 5 خطوات. → walk forward 5 steps.
أمشي للخلف 3 خطوات. → walk backward 3 steps.
أرجع 3 خطوات. → walk backward 3 steps.
أتي إليك. → (unchanged — bot_phrase fixed canonical)
Compound chains (مع conjunctions) are translated in place:
أمشي للخلف 3 خطوات، ثم أستدير يميناً 90 درجة.
→ walk backward 3 steps, then turn right 90 degree.
Idempotent for already-English text:
Walking forward 5 steps. → Walking forward 5 steps.
Ambiguity note: tokens like 'left' / 'right' have surface ambiguity
('I left the room' vs 'turn left'). The current normalizer uses
context-free token substitution and accepts that risk — same
behaviour as today's regex layer. A future memory + LiDAR-aware
disambiguation hook (using prior dialogue context and current
spatial state) belongs ABOVE this layer; see TODO at the bottom.
"""
from __future__ import annotations
import re
# ─── Vocabulary tables loaded from Config/language_tables.json ────
#
# Single source of truth for verb / direction / unit / dual / conj /
# connective dictionaries. Adding a new dialect = JSON edit. See
# Voice/_language_tables.py for the loader and flattening helpers.
from Voice._language_tables import (
flat_arabic_verbs, flat_arabic_directions, flat_arabic_units,
flat_arabic_duals, flat_arabic_conjunctions, flat_arabic_connectives,
)
# Each table below is built exactly once, at import time, by calling the
# corresponding loader imported from Voice._language_tables. They are passed
# as the `mapping` argument of _ar_word_sub() by to_canonical_english().
# NOTE(review): the loaders' return type is not visible here — presumably
# dict[str, str] (Arabic token -> English replacement); confirm in the loader.

# Verb roots: Arabic → English gerund. Loaded once at module import.
_AR_VERB = flat_arabic_verbs()
# Direction words: Arabic → English direction.
_AR_DIR = flat_arabic_directions()
# Units: Arabic → English unit (singular/plural preserved).
_AR_UNIT = flat_arabic_units()
# Dual forms: single Arabic word → 'N units' English target.
_AR_DUAL = flat_arabic_duals()
# Conjunctions for compound chains: Arabic → space-padded English.
_AR_CONJ = flat_arabic_conjunctions()
# Connectives (prepositions / determiners): Arabic → English.
_AR_CONNECTIVES = flat_arabic_connectives()
# Presence test: matches any single character inside the literal range in
# the class. to_canonical_english() uses it as its early-exit check so text
# without such characters is returned untouched.
# NOTE(review): the range endpoints are written as literal characters; they
# appear to be U+0600 and U+06FF (the Arabic Unicode block) — confirm, and
# consider spelling them as \u escapes so the range is readable in editors.
_HAS_ARABIC_RE = re.compile(r"[؀-ۿ]")
def _ar_word_sub(text: str, mapping: dict) -> str:
"""Substitute Arabic tokens from `mapping` keys to their English
values. Uses Arabic-LETTER boundary on both sides (NOT the full
Arabic block — Arabic punctuation '،' / '؟' / '؛' is U+060C/061F/061B
and is INSIDE the Arabic block but should be treated as a boundary
so 'خطوات،' substitutes correctly). Letter range is U+0621-U+064A
(ء to ي); also include diacritics U+064B-U+065F as in-word so
'البَاب' (with fatha) isn't split.
Longest-first ordering resolves substring conflicts (e.g. match
'ألف' before its substring 'لف')."""
keys = sorted(mapping.keys(), key=len, reverse=True)
for ar in keys:
en = mapping[ar]
# In-word characters: Arabic letters + diacritics. Outside this
# class is a boundary (whitespace, ASCII, Arabic punctuation).
pattern = (
r"(?<![ء-يً-ٟ])"
+ re.escape(ar)
+ r"(?![ء-يً-ٟ])"
)
text = re.sub(pattern, en, text)
return text
# Patterns used by to_canonical_english(), compiled once at import.

# Spin-in-place idiom, two-word form: preposition + reflexive pronoun
# ('حول نفسي', 'حوالين نفسك', 'على نفسه', ...).
_SPIN_IDIOM_RE = re.compile(
    r"\s*(?:حوال?ين|حول|على)\s+(?:نفسي|نفسك|نفسه|نفسها|نفسنا)\b"
)
# Spin-in-place idiom, single-word form with the attached 'ب' prefix
# ('بنفسي', 'بنفسك', ...).
_SPIN_PREFIXED_RE = re.compile(r"\s*ب(?:نفسي|نفسك|نفسه|نفسها|نفسنا)\b")
# '<verb> [direction]? <unit> <number>' — unit precedes the number.
_UNIT_BEFORE_NUMBER_RE = re.compile(
    r"\b(walk(?:ing)?|turn(?:ing)?|mov(?:e|ing))"
    r"(\s+(?:forward|backward|back|left|right|around))?"
    r"\s+(steps?|meters?|degrees?)"
    r"\s+(\d+(?:\.\d+)?)",
    re.IGNORECASE,
)
# 'turn backward' / 'turning backward'.
_TURN_BACKWARD_RE = re.compile(r"\bturn(?:ing)?\s+backward\b", re.IGNORECASE)
# The same direction word repeated two or more times in a row.
_REPEATED_DIRECTION_RE = re.compile(
    r"\b(forward|backward|back|left|right|around)(?:\s+\1)+\b",
    re.IGNORECASE,
)


def to_canonical_english(text: str) -> str:
    """Translate Arabic / dialect motion phrasings to canonical English.

    The result is English-shaped text that the existing English
    parametric regex layer can match.

    Order of operations:
        1. Spin-in-place idioms ('حول نفسي', 'على نفسك', 'بنفسك', ...)
           → ' around'. Done FIRST, on the raw Arabic, so the
           single-token passes below cannot rewrite one half of the
           idiom before it is recognised.
        2. DUAL substitution (single-word counts: خطوتين → 2 steps).
        3. CONJ substitution (ثم → then), ahead of verbs so later passes
           do not treat conjunctions as words.
        4. VERB substitution (أمشي → walk, لف → turn, ...).
        5. DIR substitution (يمين → right, ...).
        6. UNIT substitution (خطوة → step, متر → meter, ...).
        7. CONNECTIVES (إلى → to, etc.).
        8. Arabic comma → ASCII comma (preserves compound parsing).
        9. Word-order fix: 'walk step 1' → 'walk 1 step',
           'walk forward step 1' → 'walk forward 1 step'.
        10. 'turn(ing) backward' → 'turning around'.
        11. Collapse repeated adjacent direction words
            ('backward backward' → 'backward').
        12. Whitespace collapse and strip.

    Args:
        text: Chunk of bot/user text, normally already passed through
            normalise_numbers.

    Returns:
        The translated text. ``text`` is returned unchanged when it is
        empty/None or contains nothing matched by ``_HAS_ARABIC_RE``, so
        pure-English input passes through untouched.
    """
    if not text or not _HAS_ARABIC_RE.search(text):
        return text

    # 1. Spin-in-place idiom. Field case: user says 'لف حول نفسك', the
    #    bot replies 'أستدير حول نفسي', and that must end up as
    #    'turning around' so turn_around dispatches. This runs before
    #    the vocabulary passes because they work token by token: the
    #    module's smoke check expects 'على' on its own to become 'on',
    #    and once that happens 'على نفسك' is no longer recognisable.
    text = _SPIN_IDIOM_RE.sub(" around", text)
    text = _SPIN_PREFIXED_RE.sub(" around", text)

    # 2-7. Vocabulary passes, in a fixed order: duals first (one Arabic
    #      word carries both count and unit), then conjunctions, verbs,
    #      directions, units and finally connectives.
    for table in (
        _AR_DUAL,
        _AR_CONJ,
        _AR_VERB,
        _AR_DIR,
        _AR_UNIT,
        _AR_CONNECTIVES,
    ):
        text = _ar_word_sub(text, table)

    # 8. Arabic comma → ASCII comma so the English regex's compound
    #    detection sees the clause boundary.
    text = text.replace("،", ",")

    # 9. Token substitution can leave the unit before the number
    #    ('walk [direction]? step 1'); the English regex expects
    #    '[verb] [direction]? <number> <unit>'. Swap them.
    text = _UNIT_BEFORE_NUMBER_RE.sub(r"\1\2 \4 \3", text)

    # 10. 'turning backward' (from 'استدر للخلف' / 'لف ورا') is not a
    #     meaningful motion phrase; the canonical form is 'turning
    #     around' (180° rotation).
    text = _TURN_BACKWARD_RE.sub("turning around", text)

    # 11. Double-direction collapse. A verb that already implies a
    #     direction ('أرجع') followed by an explicit one ('للخلف') gives
    #     'walking backward backward 2 steps'; keep a single direction
    #     word so the parametric regex matches. Because it runs after
    #     steps 1 and 10, 'turning backward around' also ends up as
    #     'turning around'.
    text = _REPEATED_DIRECTION_RE.sub(r"\1", text)

    # 12. Collapse whitespace introduced by the substitutions.
    return re.sub(r"\s+", " ", text).strip()
# ────────────────────────────────────────────────────────────────
# TODO (future) — AMBIGUITY DISAMBIGUATION HOOK
#
# 'left' / 'right' / 'forward' have surface ambiguity (direction vs
# past-tense / surname / general adverb). When LiDAR + dialogue memory
# are wired in, a higher-level resolver should sit ABOVE this module
# and decide: is the current chunk a motion intent or descriptive
# language? Inputs to that resolver could include:
# - command_history.json (recent commands — context for repeat/follow)
# - LiDAR snapshot (is the path clear? is something blocking?)
# - prior 1-2 Gemini turns (what was discussed? still on motion topic?)
# - mic energy + duration (long quiet vs decisive utterance)
# The resolver gates whether to call to_canonical_english at all OR
# returns the original text unchanged when context says 'this is chat,
# not motion'.
#
# Current behaviour: context-free token substitution. Same risk profile
# as today's regex layer.
# ────────────────────────────────────────────────────────────────
# Standalone smoke check: run this module directly to print a pass/fail
# report for a fixed list of (input, expected canonical output) pairs.
if __name__ == "__main__":
    # Apply normalise_numbers FIRST to match production pipeline
    from Voice.number_words import normalise_numbers as _nn

    def pipeline(text):
        """Run number normalisation, then canonical translation."""
        return to_canonical_english(_nn(text))

    cases = [
        ("أمشي خطوة واحدة.", "walk 1 step."),
        ("أمشي خطوتين.", "walk 2 steps."),
        ("لف يسار 90 درجة.", "turn left 90 degree."),
        ("أستدير يميناً 100 درجة.", "turn right 100 degree."),
        ("أمشي للأمام 5 خطوات.", "walk forward 5 steps."),
        ("أمشي للخلف 3 خطوات.", "walk backward 3 steps."),
        ("أرجع 3 خطوات.", "walk backward 3 steps."),
        ("أمشي خطوة 1.", "walk 1 step."),
        ("أستدير يساراً 1 خطوات.", "turn left 1 steps."),
        ("أمشي للخلف 3 خطوات، ثم أستدير يميناً 90 درجة.",
         "walk backward 3 steps, then turn right 90 degree."),
        ("Walking forward 5 steps.", "Walking forward 5 steps."),
        ("Turning right 90 degrees.", "Turning right 90 degrees."),
        ("لف على اليمين.", "turn on right."),
        # Spelled-out + structural together
        ("أستدير يساراً تسعين درجة.", "turn left 90 degree."),
        ("أمشي خمس خطوات.", "walk 5 steps."),
        # Compound chain
        ("أمشي خطوة واحدة، ثم أستدير يميناً.",
         "walk 1 step, then turn right."),
    ]

    ok = bad = 0
    for inp, exp in cases:
        got = pipeline(inp)
        success = got == exp
        # Distinct ASCII markers: the previous code used the same empty
        # string for both outcomes, so failures did not stand out.
        mark = "PASS" if success else "FAIL"
        if success:
            ok += 1
        else:
            bad += 1
        print(f" {mark} {inp!r:55s}")
        print(f"      got:      {got!r}")
        if not success:
            print(f"      expected: {exp!r}")
    print(f"\n{ok}/{ok+bad} passed")