# Marcus/Voice/number_words.py
#
# 308 lines
# 13 KiB
# Python

"""
number_words.py — convert spelled-out numbers to digits.
Used by Voice/marcus_voice.py to preprocess Gemini's spoken text
before the dispatcher scans it for parametric motion phrases. Reason:
some Gemini Live voices occasionally speak 'ninety degrees' instead
of '90 degrees' even when the persona prompt asks for digits, and
the parametric regexes only recognise digits.
Scope (intentional):
- English compound numbers up to 999 ('three hundred sixty', 'one
hundred and eighty', 'forty-five').
- Arabic ones/tens/hundreds with و-conjunction ('تسعون', 'تسعين',
'مائة وثمانون', 'ثلاثمائة وستون', 'خمس', 'خمسة').
- Word-boundary-aware so it does NOT eat the substring 'one' inside
'someone' or 'thirty' inside 'thirtysomething'.
Out-of-scope:
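- Spoken decimals ('two point five'). Simple fractions ('3 and a half',
'half a meter') are handled separately by the fraction pass.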
- Numbers above 999 (motion vocab maxes around 360 degrees / 30
meters / 100 steps; 999 is plenty of headroom).
- Ordinals ('first', 'الأول') — those don't appear in motion text.
API:
>>> normalise_numbers("Walking forward five steps.")
'Walking forward 5 steps.'
>>> normalise_numbers("أستدير تسعين درجة")
'أستدير 90 درجة'
>>> normalise_numbers("turn three hundred sixty degrees")
'turn 360 degrees'
"""
from __future__ import annotations
import re
# ─── English ─────────────────────────────────────────────────────
#
# Number-word tables loaded from Config/language_tables.json. Adding
# a new word (e.g. 'twenty-five' as a single hyphenated token, or a
# new dialectal form) is a JSON edit, not a Python change.
from Voice._language_tables import (
english_numbers_ones, english_numbers_tens,
english_numbers_scale, english_numbers_glue,
)
# Lookup tables, fetched once at import time. The ones/tens/scale tables
# are indexed by a lowercase word and yield the number that word stands
# for; the glue collection holds connector tokens that carry no value.
_EN_ONES = english_numbers_ones()
_EN_TENS = english_numbers_tens()
_EN_SCALE = english_numbers_scale()
_EN_GLUE = english_numbers_glue()

# Every token that may appear inside a number phrase (value words + glue).
_EN_NUMBER_TOKEN = {*_EN_ONES, *_EN_TENS, *_EN_SCALE} | _EN_GLUE
def _en_words_to_int(tokens: list) -> int:
    """Fold a sequence of English number words into a single integer.

    ``tokens`` are expected to be lowercase, stripped members of
    _EN_NUMBER_TOKEN. Glue tokens contribute nothing; a token found in
    none of the tables is skipped silently.

    Ones and tens accumulate into a pending group. A scale word
    multiplies the pending group (treated as 1 when empty, so a bare
    'hundred' counts as 100), moves the product into the running sum and
    starts a new group.
    """
    finished = 0  # value of groups already closed by a scale word
    pending = 0   # ones/tens seen since the last scale word
    for word in tokens:
        if word in _EN_GLUE:
            continue
        if word in _EN_ONES:
            pending += _EN_ONES[word]
            continue
        if word in _EN_TENS:
            pending += _EN_TENS[word]
            continue
        if word in _EN_SCALE:
            finished += max(pending, 1) * _EN_SCALE[word]
            pending = 0
    return finished + pending
# _EN_RUN matches a maximal run of number-word tokens: one token, then any
# number of further tokens each separated by whitespace and/or hyphens
# ('twenty-five', 'one hundred and eighty'). The run must be preceded by
# start-of-text or one of the boundary characters below, and followed by
# end-of-text or one of them, so 'one' inside 'someone' and 'ten' inside
# 'often' are left alone. Matching is case-insensitive.

# Longest token first so a longer word is tried before any prefix of it.
_EN_TOKEN_ALT = "|".join(
    re.escape(word)
    for word in sorted(_EN_NUMBER_TOKEN, key=len, reverse=True)
)
# Whitespace and punctuation accepted on either side of a run.
_EN_EDGE_CLASS = r"[\s\.,!?;:\"\'\(\)\[\]\{\}\-]"
_EN_RUN = re.compile(
    "".join([
        r"(?:(?<=^)|(?<=", _EN_EDGE_CLASS, r"))",
        r"((?:", _EN_TOKEN_ALT, r")(?:[\s\-]+(?:", _EN_TOKEN_ALT, r"))*)",
        r"(?=$|", _EN_EDGE_CLASS, r")",
    ]),
    re.IGNORECASE,
)
def _normalise_english(text: str) -> str:
    """Replace every run of English number words in ``text`` with digits.

    Runs are located with _EN_RUN and converted by _en_words_to_int.
    Glue tokens (connectors such as 'and' or '-') are legal anywhere in a
    run, including its first and last position. Only the span from the
    first value word to the last value word is replaced; glue and
    separators outside that span are written back unchanged. This keeps
    'walk five and then turn' as 'walk 5 and then turn', and keeps
    '3 and a half' intact for the fraction pass, which needs the 'and'.

    A run with no value word at all (a lone 'and') is returned untouched.
    A single ones-word is always digitised, even where it may not be a
    count ('one of the things'): over-digitising is harmless because the
    dispatcher only reacts to digits inside a parametric phrase.
    """
    def _sub(match):
        run = match.group(1)
        # Split on whitespace/hyphens but keep the separators: tokens sit
        # at even indices, separators at odd ones, so anything we decide
        # not to consume can be re-emitted exactly as it was spoken.
        pieces = re.split(r"([\s\-]+)", run)
        value_idx = [
            i for i in range(0, len(pieces), 2)
            if pieces[i] and pieces[i].lower() not in _EN_GLUE
        ]
        if not value_idx:
            return run
        first, last = value_idx[0], value_idx[-1]
        digit_toks = [pieces[i].lower() for i in value_idx]
        try:
            value = _en_words_to_int(digit_toks)
        except Exception:
            # Deliberate best effort: a malformed table entry must not
            # take down the voice pipeline, so leave the words as spoken.
            return run
        leading = "".join(pieces[:first])
        trailing = "".join(pieces[last + 1:])
        return leading + str(value) + trailing

    return _EN_RUN.sub(_sub, text)
# ─── Arabic ──────────────────────────────────────────────────────
#
# Arabic spelled-out numbers. The list lives in
# Config/language_tables.json under arabic_numbers.literals. Adding
# new dialectal variants (Egyptian / Maghrebi specific forms) is a
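# JSON edit, not a Python change. The JSON order is not relied on for
# correctness: _normalise_arabic re-sorts the literals longest-first (a
# stable sort, so JSON order only breaks ties between equal-length
# entries) so multi-token phrases match before their prefixes.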
from Voice._language_tables import arabic_number_literals
# Arabic literal table, loaded once at import time. _normalise_arabic
# unpacks each entry as a (word, value) pair and substitutes str(value)
# for the word.
_AR_LITERALS = arabic_number_literals()
def _normalise_arabic(text: str) -> str:
    """Replace Arabic number literals in ``text`` with their digit values.

    Literals are tried longest first so that a multi-word phrase claims
    its text before any shorter literal contained in it. A literal only
    matches when neither neighbouring character lies in the range ء–ي,
    which stops 'خمس' from being replaced inside 'خمسة'.
    """
    longest_first = sorted(
        _AR_LITERALS, key=lambda entry: len(entry[0]), reverse=True
    )
    result = text
    for literal, number in longest_first:
        bounded = "".join((r"(?<![ء-ي])", re.escape(literal), r"(?![ء-ي])"))
        result = re.sub(bounded, str(number), result)
    return result
# ─── Public ─────────────────────────────────────────────────────
def normalise_numbers(text: str) -> str:
    """Rewrite spelled-out numbers in ``text`` as digits.

    Three passes run in a fixed order: English number words, Arabic
    number words, then fractions. The integer passes go first because
    the additive fraction patterns only match after a digit
    ('3 and a half' → '3.5'), so number words must already have been
    converted by the time the fraction pass runs.

    Falsy input (empty string, None) is returned as-is. The function is
    meant to be idempotent: text that already uses digits passes through
    unchanged.
    """
    if not text:
        return text
    return _apply_fractions(_normalise_arabic(_normalise_english(text)))
# ─── Fraction parser (English + Arabic) ──────────────────────────
#
# Tables loaded from Config/language_tables.json. Two flavours:
# - additive : combines with a preceding digit ('3 and a half'
# / '3 ونصف' → 3.5). Includes a special-case
# handler for Arabic 'N <unit> ونصف' where the
# fraction trails after the unit noun.
# - leading : standalone before a unit ('half a meter' / 'نصف
# متر' → 0.5 meter / 0.5 متر).
from Voice._language_tables import (
english_fractions_additive, english_fractions_leading,
arabic_fractions_additive, arabic_fractions_leading,
arabic_unit_words,
)
# Fraction tables, each loaded once at import time.
# Additive tables are indexed by the matched fraction word; the value is
# added to the number that precedes it ('3 and a half').
_EN_FRAC_ADD = english_fractions_additive()
# Leading tables are indexed by the matched fraction word; the value is
# formatted on its own ('half a meter').
_EN_FRAC_LEADING = english_fractions_leading()
_AR_FRAC_ADD = arabic_fractions_additive()
_AR_FRAC_LEADING = arabic_fractions_leading()
# Arabic unit nouns, used only to recognise the 'N <unit> و<fraction>'
# word order in _apply_fractions.
_AR_UNITS = arabic_unit_words()
def _fmt_decimal(v: float) -> str:
"""Format a float without trailing zeros: 3.0 → '3', 3.5 → '3.5',
1.79 → '1.79'."""
if v == int(v):
return str(int(v))
s = "{:.4f}".format(v).rstrip("0").rstrip(".")
return s
def _apply_fractions(text: str) -> str:
    """Convert fractional expressions in ``text`` to decimal digits.

    Five passes run in this order, each on the output of the previous
    one (the examples assume the fraction and unit tables contain the
    words shown):

      1. EN additive  '3 and a half steps' / '3 and half' → '3.5 steps'
      2. EN leading   'half a meter' / 'half meter' → '0.5 meter',
                      'a quarter step' → '0.25 step'
      3. AR trailing  '3 خطوات ونصف' / '3 خطوات و نصف' → '3.5 خطوات'
      4. AR additive  '3 ونصف خطوات' / '3 و نصف' → '3.5 خطوات'
      5. AR leading   'نصف متر' → '0.5 متر'

    The additive and trailing forms only match after a digit, so number
    words must already have been converted. The leading forms need a
    following word (a Latin letter for English, an Arabic letter for
    Arabic). The Arabic leading form also refuses to match when the
    fraction word is directly preceded by an Arabic letter or diacritic,
    so a و-prefixed fraction with no number before it ('متر ونصف') is not
    rewritten through the plain 'نصف' entry.

    Digit-only text is left alone; falsy input is returned as-is.
    """
    if not text:
        return text

    def _alt(words) -> str:
        # Longest alternative first so an entry wins over its own prefix.
        return "|".join(
            re.escape(w) for w in sorted(words, key=len, reverse=True)
        )

    number = r"(\d+(?:\.\d+)?)"  # integer or decimal already in digit form

    # ── ENGLISH additive: '<N> and (a/an)? <frac>' ─────────────────
    en_add_alt = _alt(_EN_FRAC_ADD)
    if en_add_alt:
        def _en_add(m):
            base = float(m.group(1))
            return _fmt_decimal(base + _EN_FRAC_ADD[m.group(2).lower()])
        text = re.sub(
            r"\b" + number + r"\s+and\s+(?:an?\s+)?(" + en_add_alt + r")\b",
            _en_add, text, flags=re.IGNORECASE,
        )

    # ── ENGLISH leading: '(a/an)? <frac> (a/an)? <noun>' ───────────
    en_lead_alt = _alt(_EN_FRAC_LEADING)
    if en_lead_alt:
        def _en_lead(m):
            # The surrounding articles are consumed; the noun is only
            # looked at, so re-add the single space that separates them.
            return _fmt_decimal(_EN_FRAC_LEADING[m.group(1).lower()]) + " "
        text = re.sub(
            r"\b(?:an?\s+)?(" + en_lead_alt + r")\s+(?:an?\s+)?(?=[A-Za-z])",
            _en_lead, text, flags=re.IGNORECASE,
        )

    if _AR_FRAC_ADD:
        ar_add_alt = _alt(_AR_FRAC_ADD)

        # ── ARABIC trailing fraction with unit: 'N <unit> و<frac>' ──
        # Must run before the plain additive pass. Whitespace after the
        # conjunction is optional, matching what the additive pass accepts.
        if _AR_UNITS:
            unit_alt = _alt(_AR_UNITS)

            def _ar_trail(m):
                total = float(m.group(1)) + _AR_FRAC_ADD[m.group(3)]
                return "{} {}".format(_fmt_decimal(total), m.group(2))
            text = re.sub(
                number + r"\s+(" + unit_alt + r")\s+و\s*(" + ar_add_alt + r")\b",
                _ar_trail, text,
            )

        # ── ARABIC additive: 'N ونصف' / 'N و نصف' ──────────────────
        def _ar_add(m):
            return _fmt_decimal(float(m.group(1)) + _AR_FRAC_ADD[m.group(2)])
        text = re.sub(
            number + r"\s*و\s*(" + ar_add_alt + r")\b",
            _ar_add, text,
        )

    # ── ARABIC leading: '<frac> <noun>' ────────────────────────────
    if _AR_FRAC_LEADING:
        ar_lead_alt = _alt(_AR_FRAC_LEADING)

        def _ar_lead(m):
            return _fmt_decimal(_AR_FRAC_LEADING[m.group(1)]) + " "
        # Boundary: previous char is not an Arabic letter or diacritic,
        # next word starts with an Arabic letter (so 'نصف' inside another
        # word is preserved).
        text = re.sub(
            r"(?<![ء-يً-ٟ])(" + ar_lead_alt + r")\s+(?=[ء-ي])",
            _ar_lead, text,
        )
    return text
# Standalone smoke check: run the module directly to print one line per
# case. Exits with status 1 if any case does not produce its expected text.
if __name__ == "__main__":
    cases = [
        ("Walking forward five steps.", "Walking forward 5 steps."),
        ("turn ninety degrees", "turn 90 degrees"),
        ("turn left ninety degrees.", "turn left 90 degrees."),
        ("turn three hundred sixty degrees", "turn 360 degrees"),
        ("turn one hundred and eighty deg", "turn 180 deg"),
        ("walk forty-five meters", "walk 45 meters"),
        ("أستدير تسعين درجة", "أستدير 90 درجة"),
        ("لف يمين تسعون درجة", "لف يمين 90 درجة"),
        ("أمشي خمس خطوات", "أمشي 5 خطوات"),
        ("أستدير مائة وثمانين درجة", "أستدير 180 درجة"),
        ("turn 90 degrees", "turn 90 degrees"),  # already digits
        # 'one' inside 'someone' must NOT be eaten
        ("I see someone there.", "I see someone there."),
        # 'ten' inside 'often' must NOT be eaten
        ("often", "often"),
    ]
    failures = 0
    for inp, expected in cases:
        got = normalise_numbers(inp)
        passed = got == expected
        if not passed:
            failures += 1
        # Visible marker so a failing case stands out in the listing.
        mark = "PASS" if passed else "FAIL"
        line = f" {mark} {inp!r:45s} -> {got!r}"
        if not passed:
            line += f" (expected {expected!r})"
        print(line)
    print(f"\n{len(cases) - failures}/{len(cases)} passed")
    if failures:
        raise SystemExit(1)