277 lines
12 KiB
Python
277 lines
12 KiB
Python
"""
|
|
canonical_normalizer.py — translate Arabic / dialect motion phrasings
to canonical English form before regex matching.

Runs in the dispatcher pipeline AFTER normalise_numbers (which handles
spelled-out numbers in both languages) and BEFORE the parametric regex
scan. The output is English-shaped text that the existing English
parametric regexes can match — replacing the previous approach of
maintaining 14+ dialect-specific Arabic regexes per command type.

Pipeline:

    raw chunk ──► _STATE_ECHO_RE strip ──► _QUOTED_RE strip
              ──► _QUESTION_RE strip ──► normalise_numbers
              ──► to_canonical_english   ←──── HERE
              ──► English regex scan ──► dispatch

Input/output examples:

    أمشي خطوة واحدة. → walk 1 step.
    أمشي خطوتين. → walk 2 steps.
    لف يسار 90 درجة. → turn left 90 degree.
    أستدير يميناً 100 درجة. → turn right 100 degree.
    أمشي للأمام 5 خطوات. → walk forward 5 steps.
    أمشي للخلف 3 خطوات. → walk backward 3 steps.
    أرجع 3 خطوات. → walk backward 3 steps.
    أتي إليك. → (unchanged — bot_phrase fixed canonical)

Compound chains (with conjunctions) are translated in place:

    أمشي للخلف 3 خطوات، ثم أستدير يميناً 90 درجة.
    → walk backward 3 steps, then turn right 90 degree.

Idempotent for already-English text:

    Walking forward 5 steps. → Walking forward 5 steps.

Ambiguity note: tokens like 'left' / 'right' have surface ambiguity
('I left the room' vs 'turn left'). The current normalizer uses
context-free token substitution and accepts that risk — the same
behaviour as today's regex layer. A future memory + LiDAR-aware
disambiguation hook (using prior dialogue context and current
spatial state) belongs ABOVE this layer; see the TODO at the bottom.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
|
|
# ─── Vocabulary tables loaded from Config/language_tables.json ────
|
|
#
|
|
# Single source of truth for verb / direction / unit / dual / conj /
|
|
# connective dictionaries. Adding a new dialect = JSON edit. See
|
|
# Voice/_language_tables.py for the loader and flattening helpers.
|
|
from Voice._language_tables import (
|
|
flat_arabic_verbs, flat_arabic_directions, flat_arabic_units,
|
|
flat_arabic_duals, flat_arabic_conjunctions, flat_arabic_connectives,
|
|
)
|
|
|
|
|
|
# Lookup tables, each built once when this module is imported by calling
# the corresponding flattening helper from Voice._language_tables.
# Every table maps an Arabic surface form to its English replacement and
# is consumed by to_canonical_english via _ar_word_sub.

# Verb roots: Arabic → English verb form (described as a gerund by the
# table's authors — confirm against Config/language_tables.json).
_AR_VERB = flat_arabic_verbs()
# Direction words: Arabic → English direction.
_AR_DIR = flat_arabic_directions()
# Units: Arabic → English unit (singular/plural preserved).
_AR_UNIT = flat_arabic_units()
# Dual forms: a single Arabic word → an 'N units' English phrase.
_AR_DUAL = flat_arabic_duals()
# Conjunctions for compound chains: Arabic → space-padded English.
_AR_CONJ = flat_arabic_conjunctions()
# Connectives (prepositions / determiners): Arabic → English.
_AR_CONNECTIVES = flat_arabic_connectives()
|
|
|
|
|
|
|
|
# Matches any character in the Arabic Unicode block (U+0600–U+06FF); used
# as the cheap "does this chunk need translating at all?" gate.
# The bounds are written as \u escapes on purpose: U+0600 is an invisible
# format character, and if it is lost when the file is copied or
# re-encoded the class silently degrades to "hyphen or U+06FF".
_HAS_ARABIC_RE = re.compile(r"[\u0600-\u06FF]")
|
|
|
|
|
|
def _ar_word_sub(text: str, mapping: dict[str, str]) -> str:
    """Replace whole Arabic tokens in ``text`` according to ``mapping``.

    A key only matches when it is not glued to another in-word Arabic
    character on either side. "In-word" means Arabic letters
    U+0621–U+064A (ء to ي) plus diacritics U+064B–U+065F, so a word
    carrying a fatha such as 'البَاب' is not split. Everything else is a
    boundary — whitespace, ASCII, and Arabic punctuation ('،' U+060C,
    '؛' U+061B, '؟' U+061F), which lives inside the Arabic block but
    must still end a word so that 'خطوات،' is substituted.

    Keys are applied longest-first, one pass per key, so a longer
    phrase wins over a shorter key it contains.

    Args:
        text: Input string, possibly mixing Arabic and English.
        mapping: Arabic token → English replacement. Replacements are
            inserted verbatim. Empty keys are ignored.

    Returns:
        ``text`` with every matching token replaced.
    """
    # Explicit escapes keep the character class stable even if the file
    # is re-encoded or passed through a tool that mangles Arabic text.
    in_word = r"\u0621-\u064A\u064B-\u065F"
    left_boundary = f"(?<![{in_word}])"
    right_boundary = f"(?![{in_word}])"

    for ar in sorted(mapping, key=len, reverse=True):
        if not ar:
            # An empty key compiles to a boundary-only pattern that
            # matches between ordinary characters and would splice its
            # value throughout the text.
            continue
        en = mapping[ar]
        pattern = left_boundary + re.escape(ar) + right_boundary
        # A callable replacement is inserted as-is; a plain string would
        # be parsed as a template, so a backslash or '\1' in a table
        # value would corrupt the output or raise re.error.
        text = re.sub(pattern, lambda _match, _en=en: _en, text)
    return text
|
|
|
|
|
|
def to_canonical_english(text: str) -> str:
    """Translate Arabic / dialect motion phrasings to canonical English.

    The result is English-shaped text that the existing English
    parametric regex layer can match. Empty input and input without any
    Arabic character are returned unchanged, so the function is
    idempotent for pure English.

    Order of operations:
      1. Spin-in-place idiom ('حول نفسي', 'على نفسك', 'بنفسي', ...)
         → ' around'. Runs first, on the untouched Arabic.
      2. Table substitution, in this order: DUAL (خطوتين → 2 steps),
         CONJ (ثم → then), VERB, DIR, UNIT, CONNECTIVES.
      3. Arabic comma → ASCII comma (keeps compound parsing working).
      4. Word-order fix: 'walk step 1' → 'walk 1 step',
         'walk forward step 1' → 'walk forward 1 step'.
      5. 'turn(ing) backward' → 'turning around'.
      6. Collapse of repeated adjacent direction words.
      7. Whitespace collapse and strip.

    Args:
        text: One chunk of text, expected to have been through
            normalise_numbers already.

    Returns:
        The translated chunk, or ``text`` itself when nothing applies.
    """
    if not text or not _HAS_ARABIC_RE.search(text):
        return text

    # 1. Spin-in-place idiom — Arabic phrases for turning around oneself
    # (`حول نفسي`, `حول نفسك`, `حوالين نفسي`, `حوالين نفسك`, `على نفسي`,
    # `على نفسك`, `بنفسي`, `بنفسك`). Rewritten to ' around' so that
    # 'أستدير حول نفسي' ends up as '<verb> around' and the turn_around
    # canonical regex catches it.
    #
    # This pass has to see the original Arabic. The smoke cases in this
    # module expect 'لف على اليمين' → 'turn on right', which implies a
    # later table pass rewrites 'على' to 'on' (presumably the connective
    # table — confirm in Config/language_tables.json). If the idiom pass
    # ran after the tables, 'على نفسك' would already read 'on نفسك' and
    # never match. Running it first also lets step 6 clean up a
    # resulting 'around around'.
    text = re.sub(
        r"\s*(?:حوال?ين|حول|على)\s+(?:نفسي|نفسك|نفسه|نفسها|نفسنا)\b",
        " around",
        text,
    )
    text = re.sub(r"\s*ب(?:نفسي|نفسك|نفسه|نفسها|نفسنا)\b", " around", text)

    # 2. Token tables. Dual forms go first because one Arabic word
    # carries both count and unit ('خطوتين' = '2 steps'); conjunctions
    # go before verbs so 'ثم' between two motions becomes ' then ' for
    # the regex layer's compound parser.
    for table in (
        _AR_DUAL,
        _AR_CONJ,
        _AR_VERB,
        _AR_DIR,
        _AR_UNIT,
        _AR_CONNECTIVES,
    ):
        text = _ar_word_sub(text, table)

    # 3. Arabic comma → ASCII comma, so the English regex's compound
    # detection sees the boundary.
    text = text.replace("،", ",")

    # 4. Word-order fix. Token substitution can leave the unit before
    # the number ('walk [direction]? step 1'); the English regex expects
    # '[verb] [direction]? <number> <unit>', so swap them.
    text = re.sub(
        r"\b(walk(?:ing)?|turn(?:ing)?|mov(?:e|ing))"
        r"(\s+(?:forward|backward|back|left|right|around))?"
        r"\s+(steps?|meters?|degrees?)"
        r"\s+(\d+(?:\.\d+)?)",
        r"\1\2 \4 \3",
        text,
        flags=re.IGNORECASE,
    )

    # 5. Idiomatic post-fix: 'turning backward' (what 'استدر للخلف' /
    # 'لف ورا' translate to) is not a meaningful motion phrase; the
    # canonical form is 'turning around' (180° rotation).
    text = re.sub(
        r"\bturn(?:ing)?\s+backward\b",
        "turning around",
        text,
        flags=re.IGNORECASE,
    )

    # 6. Double-direction collapse. When both the Arabic verb and an
    # explicit direction word translate to the same direction we get
    # e.g. 'walking backward backward 2 steps'. Field-observed cases:
    #   'أرجع للخلف خطوتين' → 'walking backward backward 2 steps'
    #                        → 'walking backward 2 steps'
    #   'أتقدم للأمام خطوة' → 'walking forward forward 1 step'
    #                        → 'walking forward 1 step'
    text = re.sub(
        r"\b(forward|backward|back|left|right|around)(?:\s+\1)+\b",
        r"\1",
        text,
        flags=re.IGNORECASE,
    )

    # 7. Collapse extra whitespace introduced by the substitutions.
    return re.sub(r"\s+", " ", text).strip()
|
|
|
|
|
|
# ────────────────────────────────────────────────────────────────
|
|
# TODO (future) — AMBIGUITY DISAMBIGUATION HOOK
|
|
#
|
|
# 'left' / 'right' / 'forward' have surface ambiguity (direction vs
|
|
# past-tense / surname / general adverb). When LiDAR + dialogue memory
|
|
# are wired in, a higher-level resolver should sit ABOVE this module
|
|
# and decide: is the current chunk a motion intent or descriptive
|
|
# language? Inputs to that resolver could include:
|
|
# - command_history.json (recent commands — context for repeat/follow)
|
|
# - LiDAR snapshot (is the path clear? is something blocking?)
|
|
# - prior 1-2 Gemini turns (what was discussed? still on motion topic?)
|
|
# - mic energy + duration (long quiet vs decisive utterance)
|
|
# The resolver gates whether to call to_canonical_english at all OR
|
|
# returns the original text unchanged when context says 'this is chat,
|
|
# not motion'.
|
|
#
|
|
# Current behaviour: context-free token substitution. Same risk profile
|
|
# as today's regex layer.
|
|
# ────────────────────────────────────────────────────────────────
|
|
|
|
|
|
# Standalone smoke check
|
|
if __name__ == "__main__":
    # Apply normalise_numbers FIRST to match the production pipeline.
    from Voice.number_words import normalise_numbers as _nn

    def pipeline(text):
        """Run number normalisation, then the canonical translation."""
        return to_canonical_english(_nn(text))

    # (input, expected output) pairs.
    cases = [
        ("أمشي خطوة واحدة.", "walk 1 step."),
        ("أمشي خطوتين.", "walk 2 steps."),
        ("لف يسار 90 درجة.", "turn left 90 degree."),
        ("أستدير يميناً 100 درجة.", "turn right 100 degree."),
        ("أمشي للأمام 5 خطوات.", "walk forward 5 steps."),
        ("أمشي للخلف 3 خطوات.", "walk backward 3 steps."),
        ("أرجع 3 خطوات.", "walk backward 3 steps."),
        ("أمشي خطوة 1.", "walk 1 step."),
        ("أستدير يساراً 1 خطوات.", "turn left 1 steps."),
        ("أمشي للخلف 3 خطوات، ثم أستدير يميناً 90 درجة.",
         "walk backward 3 steps, then turn right 90 degree."),
        ("Walking forward 5 steps.", "Walking forward 5 steps."),
        ("Turning right 90 degrees.", "Turning right 90 degrees."),
        ("لف على اليمين.", "turn on right."),
        # Spelled-out + structural together
        ("أستدير يساراً تسعين درجة.", "turn left 90 degree."),
        ("أمشي خمس خطوات.", "walk 5 steps."),
        # Compound chain
        ("أمشي خطوة واحدة، ثم أستدير يميناً.",
         "walk 1 step, then turn right."),
    ]

    ok = bad = 0
    for inp, exp in cases:
        got = pipeline(inp)
        success = got == exp
        mark = "✓" if success else "✗"
        if success:
            ok += 1
        else:
            bad += 1
        print(f" {mark} {inp!r:55s}")
        print(f" → {got!r}")
        if not success:
            print(f" expected: {exp!r}")
    print(f"\n{ok}/{ok+bad} passed")

    # Report failure through the exit status so scripts and CI can tell
    # a regression from a clean run without parsing the output.
    if bad:
        raise SystemExit(1)
|