308 lines
13 KiB
Python
308 lines
13 KiB
Python
"""
|
|
number_words.py — convert spelled-out numbers to digits.
|
|
|
|
Used by Voice/marcus_voice.py to preprocess Gemini's spoken text
|
|
before the dispatcher scans it for parametric motion phrases. Reason:
|
|
some Gemini Live voices occasionally speak 'ninety degrees' instead
|
|
of '90 degrees' even when the persona prompt asks for digits, and
|
|
the parametric regexes only recognise digits.
|
|
|
|
Scope (intentional):
|
|
- English compound numbers up to 999 ('three hundred sixty', 'one
|
|
hundred and eighty', 'forty-five').
|
|
- Arabic ones/tens/hundreds with و-conjunction ('تسعون', 'تسعين',
|
|
'مائة وثمانون', 'ثلاثمائة وستون', 'خمس', 'خمسة').
|
|
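- Word-boundary-aware so it does NOT eat the substring 'one' inside
'someone' or 'thirty' inside 'thirtysomething'.
- Simple fractions next to a digit count or a unit noun, applied after
the integer pass ('3 and a half' → '3.5', 'half a meter' → '0.5 meter').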
|
|
|
|
Out-of-scope:
|
|
- Decimals ('two point five').
|
|
- Numbers above 999 (motion vocab maxes around 360 degrees / 30
|
|
meters / 100 steps; 999 is plenty of headroom).
|
|
- Ordinals ('first', 'الأول') — those don't appear in motion text.
|
|
|
|
API:
|
|
>>> normalise_numbers("Walking forward five steps.")
|
|
'Walking forward 5 steps.'
|
|
>>> normalise_numbers("أستدير تسعين درجة")
|
|
'أستدير 90 درجة'
|
|
>>> normalise_numbers("turn three hundred sixty degrees")
|
|
'turn 360 degrees'
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
|
|
# ─── English ─────────────────────────────────────────────────────
|
|
#
|
|
# Number-word tables loaded from Config/language_tables.json. Adding
|
|
# a new word (e.g. 'twenty-five' as a single hyphenated token, or a
|
|
# new dialectal form) is a JSON edit, not a Python change.
|
|
from Voice._language_tables import (
|
|
english_numbers_ones, english_numbers_tens,
|
|
english_numbers_scale, english_numbers_glue,
|
|
)
|
|
|
|
# English lookup tables, fetched once when the module is imported.
_EN_ONES = english_numbers_ones()
_EN_TENS = english_numbers_tens()
_EN_SCALE = english_numbers_scale()
_EN_GLUE = english_numbers_glue()

# Every word that may appear inside a spelled-out number: the keys of the
# three value tables plus the glue words.
_EN_NUMBER_TOKEN = {*_EN_ONES, *_EN_TENS, *_EN_SCALE} | _EN_GLUE
|
|
|
|
|
|
def _en_words_to_int(tokens: list) -> int:
    """Fold a sequence of English number words into one integer.

    ``tokens`` are expected to be lowercase, stripped, and members of
    ``_EN_NUMBER_TOKEN`` (the caller is responsible for that).

    Glue tokens contribute nothing. Ones and tens words are added to a
    pending amount. A scale word multiplies the pending amount (taken as 1
    when nothing is pending, so a bare scale word counts once), banks the
    product and clears the pending amount. The result is banked + pending.
    """
    banked = 0
    pending = 0
    for word in tokens:
        if word in _EN_GLUE:
            continue
        if word in _EN_ONES:
            pending += _EN_ONES[word]
            continue
        if word in _EN_TENS:
            pending += _EN_TENS[word]
            continue
        if word in _EN_SCALE:
            banked += max(pending, 1) * _EN_SCALE[word]
            pending = 0
    return banked + pending
|
|
|
|
|
|
# One match = one maximal run of English number-word tokens, captured as
# group 1. Tokens inside a run are separated by whitespace and/or hyphens
# ('twenty-five', 'one hundred and eighty'). The run must be preceded by
# start-of-text or a whitespace/punctuation character and followed by
# end-of-text or the same kind of character, so 'one' inside 'someone' and
# 'ten' inside 'often' are not matched. Glue words are part of the token
# set, so a run may begin or end with one.
_EN_TOKEN_ALTERNATION = "|".join(
    re.escape(word)
    for word in sorted(_EN_NUMBER_TOKEN, key=len, reverse=True)
)
_EN_EDGE_CLASS = r"[\s\.,!?;:\"\'\(\)\[\]\{\}\-]"
_EN_RUN = re.compile(
    r"(?:(?<=^)|(?<=" + _EN_EDGE_CLASS + r"))"
    + r"((?:" + _EN_TOKEN_ALTERNATION
    + r")(?:[\s\-]+(?:" + _EN_TOKEN_ALTERNATION
    + r"))*)"
    + r"(?=$|" + _EN_EDGE_CLASS + r")",
    re.IGNORECASE,
)
|
|
|
|
|
|
def _normalise_english(text: str) -> str:
    """Replace every run of English number words in ``text`` with digits.

    Runs are located with ``_EN_RUN`` and evaluated by
    ``_en_words_to_int``. A lone number word ('one' in 'one of the
    things') is digitised as well: over-digitising is considered safer for
    the downstream parametric scan than under-digitising.

    Glue words ('and', '-') belong to a number only when they sit between
    number words. ``_EN_RUN`` can also capture glue at either edge of a run
    ('three and' in 'three and a half', 'and five' in 'left and five').
    That edge glue is emitted unchanged, so the surrounding sentence
    survives and the later fraction pass still sees '3 and a half'.

    Args:
        text: Input text.

    Returns:
        ``text`` with each number-word run replaced by its decimal digits.
        A run that holds only glue, or that cannot be evaluated, is left
        as it was.
    """

    def _sub(match):
        run = match.group(1)
        # Split on whitespace/hyphens but keep the separators (odd indices)
        # so the untouched edges can be re-emitted exactly as written.
        pieces = re.split(r"([\s\-]+)", run)
        number_positions = [
            i for i in range(0, len(pieces), 2)
            if pieces[i] and pieces[i].lower() not in _EN_GLUE
        ]
        # A run made only of glue ('and', '-') is not a number.
        if not number_positions:
            return run
        digit_toks = [pieces[i].lower() for i in number_positions]
        try:
            value = _en_words_to_int(digit_toks)
        except (KeyError, TypeError, ValueError):
            # Malformed table entry: keep the words rather than fail.
            return run
        first, last = number_positions[0], number_positions[-1]
        leading = "".join(pieces[:first])
        trailing = "".join(pieces[last + 1:])
        return leading + str(value) + trailing

    return _EN_RUN.sub(_sub, text)
|
|
|
|
|
|
# ─── Arabic ──────────────────────────────────────────────────────
|
|
#
|
|
# Arabic spelled-out numbers. The list lives in
|
|
# Config/language_tables.json under arabic_numbers.literals. Adding
|
|
# new dialectal variants (Egyptian / Maghrebi specific forms) is a
|
|
# JSON edit, not a Python change. The JSON does not have to be ordered
# longest-first: _normalise_arabic re-sorts the literals by length so
# multi-token phrases match before their prefixes.
|
|
|
|
from Voice._language_tables import arabic_number_literals
|
|
|
|
# Iterable of (word, value) pairs: _normalise_arabic unpacks each entry
# that way and inserts str(value) in place of the word.
_AR_LITERALS = arabic_number_literals()
|
|
|
|
|
|
def _normalise_arabic(text: str) -> str:
    """Replace Arabic number literals in ``text`` with their digit values.

    Literals are tried longest first so a multi-word phrase is consumed
    before any shorter literal it contains. A literal only matches when it
    is not directly preceded or followed by a character in the range
    ء-ي, which keeps 'خمس' from matching inside 'خمسة'.
    """
    longest_first = sorted(
        _AR_LITERALS, key=lambda pair: len(pair[0]), reverse=True
    )
    for literal, number in longest_first:
        bounded = r"(?<![ء-ي])" + re.escape(literal) + r"(?![ء-ي])"
        text = re.sub(bounded, str(number), text)
    return text
|
|
|
|
|
|
# ─── Public ─────────────────────────────────────────────────────
|
|
|
|
def normalise_numbers(text: str) -> str:
    """Rewrite spelled-out numbers in ``text`` as digits.

    Falsy input (empty string, None) is returned as-is. Otherwise three
    passes run in a fixed order:

      1. English number words,
      2. Arabic number words,
      3. fractions.

    The integer passes must come first: 'three and a half' has to become
    '3 and a half' before the fraction pass can turn it into '3.5',
    because that pass only attaches fractions to digits. The function is
    intended to be idempotent, and text without recognisable number words
    is meant to come back unchanged.
    """
    if not text:
        return text
    for stage in (_normalise_english, _normalise_arabic, _apply_fractions):
        text = stage(text)
    return text
|
|
|
|
|
|
# ─── Fraction parser (English + Arabic) ──────────────────────────
|
|
#
|
|
# Tables loaded from Config/language_tables.json. Two flavours:
|
|
# - additive : combines with a preceding digit ('3 and a half'
|
|
# / '3 ونصف' → 3.5). Includes a special-case
|
|
# handler for Arabic 'N <unit> ونصف' where the
|
|
# fraction trails after the unit noun.
|
|
# - leading : standalone before a unit ('half a meter' / 'نصف
|
|
# متر' → 0.5 meter / 0.5 متر).
|
|
|
|
from Voice._language_tables import (
|
|
english_fractions_additive, english_fractions_leading,
|
|
arabic_fractions_additive, arabic_fractions_leading,
|
|
arabic_unit_words,
|
|
)
|
|
|
|
# Fraction word -> amount added to a preceding digit count; looked up as
# `n + table[word]` in _apply_fractions.
_EN_FRAC_ADD = english_fractions_additive()
# Fraction word -> value emitted when the word stands alone before a noun.
_EN_FRAC_LEADING = english_fractions_leading()
# Arabic counterparts of the two tables above.
_AR_FRAC_ADD = arabic_fractions_additive()
_AR_FRAC_LEADING = arabic_fractions_leading()
# Arabic unit nouns that may sit between a digit count and a trailing
# 'و<fraction>'; only used to build a regex alternation.
_AR_UNITS = arabic_unit_words()
|
|
|
|
|
|
def _fmt_decimal(v: float) -> str:
    """Format a number for insertion into text, without trailing zeros.

    Whole values print as integers; anything else is rounded to at most
    four decimal places: 3.0 → '3', 3.5 → '3.5', 1.79 → '1.79'.

    Args:
        v: The value to format.

    Returns:
        The shortest of the forms above. Non-finite values (inf, nan),
        which cannot be converted to int, are returned as ``str(v)``
        instead of raising. A negative value that rounds to zero is
        returned as '0', never '-0'.
    """
    try:
        whole = int(v)
    except (OverflowError, ValueError):
        # inf / nan, e.g. from an absurdly long digit string upstream.
        return str(v)
    if v == whole:
        return str(whole)
    formatted = f"{v:.4f}".rstrip("0").rstrip(".")
    return "0" if formatted == "-0" else formatted
|
|
|
|
|
|
def _apply_fractions(text: str) -> str:
    """Convert fractional expressions in ``text`` to decimal digits.

    Counts must already be digits (this runs after the integer passes).
    The passes below run in this order. The example values assume the
    tables map 'half' / 'نصف' to 0.5 and 'quarter' to 0.25:

      EN additive   '3 and a half steps' / '3 and half'  → '3.5 steps'
      EN leading    'half a meter' / 'half meter'        → '0.5 meter'
                    'a quarter step'                     → '0.25 step'
      AR trailing   '3 خطوات ونصف' / '3 خطوات و نصف'      → '3.5 خطوات'
      AR additive   '3 ونصف خطوات' / '3 و نصف خطوات'      → '3.5 خطوات'
      AR leading    'نصف متر'                             → '0.5 متر'

    Not handled: 'متر ونصف' with no digit before the unit. Both Arabic
    additive patterns require a preceding digit, and the leading pattern
    rejects a fraction word that directly follows another Arabic letter
    (here 'و'), so that phrase passes through unchanged. Use the explicit
    form '1 ونصف متر' instead.

    Args:
        text: Input text, normally the output of the integer passes.

    Returns:
        ``text`` with the matched expressions rewritten. Falsy input is
        returned as-is.
    """
    if not text:
        return text

    def _alternation(words):
        # Longest first, so a word is never shadowed by one of its prefixes.
        return "|".join(
            re.escape(w) for w in sorted(words, key=len, reverse=True)
        )

    number = r"(\d+(?:\.\d+)?)"

    # ── ENGLISH additive: '<N> and (a/an)? <frac>' ─────────────────
    en_add_alt = _alternation(_EN_FRAC_ADD)
    if en_add_alt:
        # The match is case-insensitive, so the lookup has to be as well.
        en_add_values = {k.lower(): v for k, v in _EN_FRAC_ADD.items()}

        def _en_add(m):
            amount = en_add_values.get(m.group(2).lower())
            if amount is None:
                return m.group(0)
            return _fmt_decimal(float(m.group(1)) + amount)

        text = re.sub(
            r"\b" + number + r"\s+and\s+(?:an?\s+)?(" + en_add_alt + r")\b",
            _en_add, text, flags=re.IGNORECASE,
        )

    # ── ENGLISH leading: '(a/an)? <frac> (a/an)? <noun>' ───────────
    en_lead_alt = _alternation(_EN_FRAC_LEADING)
    if en_lead_alt:
        en_lead_values = {k.lower(): v for k, v in _EN_FRAC_LEADING.items()}

        def _en_lead(m):
            amount = en_lead_values.get(m.group(1).lower())
            if amount is None:
                return m.group(0)
            return _fmt_decimal(amount) + " "

        # The lookahead keeps the noun itself out of the match.
        text = re.sub(
            r"\b(?:an?\s+)?(" + en_lead_alt + r")\s+(?:an?\s+)?(?=[A-Za-z])",
            _en_lead, text, flags=re.IGNORECASE,
        )

    ar_add_alt = _alternation(_AR_FRAC_ADD) if _AR_FRAC_ADD else ""

    # ── ARABIC trailing-fraction with unit: 'N <unit> ونصف' ─────────
    # Must run before the plain additive pass. 'و' may be written attached
    # to the fraction word or separated from it by whitespace, the same
    # two spellings the additive pass accepts.
    if _AR_FRAC_ADD and _AR_UNITS:
        unit_alt = _alternation(_AR_UNITS)

        def _ar_trail(m):
            total = float(m.group(1)) + _AR_FRAC_ADD[m.group(3)]
            return "{} {}".format(_fmt_decimal(total), m.group(2))

        text = re.sub(
            number + r"\s+(" + unit_alt + r")\s+و\s*(" + ar_add_alt + r")\b",
            _ar_trail, text,
        )

    # ── ARABIC additive: 'N ونصف' / 'N و نصف' ──────────────────────
    if _AR_FRAC_ADD:
        def _ar_add(m):
            return _fmt_decimal(float(m.group(1)) + _AR_FRAC_ADD[m.group(2)])

        text = re.sub(
            number + r"\s*و\s*(" + ar_add_alt + r")\b",
            _ar_add, text,
        )

    # ── ARABIC leading: '<frac> <noun>' ────────────────────────────
    if _AR_FRAC_LEADING:
        ar_lead_alt = _alternation(_AR_FRAC_LEADING)

        def _ar_lead(m):
            return _fmt_decimal(_AR_FRAC_LEADING[m.group(1)]) + " "

        # Boundary: the previous char is not an Arabic letter or mark and
        # the next word starts with an Arabic letter, so a fraction word
        # inside a longer word is preserved.
        text = re.sub(
            r"(?<![ء-يً-ٟ])(" + ar_lead_alt + r")\s+(?=[ء-ي])",
            _ar_lead, text,
        )

    return text
|
|
|
|
|
|
# Standalone smoke check. Prints one line per case, shows the expected
# value for any mismatch, and exits non-zero when a case fails so the
# script can be used from automation.
if __name__ == "__main__":
    cases = [
        ("Walking forward five steps.", "Walking forward 5 steps."),
        ("turn ninety degrees", "turn 90 degrees"),
        ("turn left ninety degrees.", "turn left 90 degrees."),
        ("turn three hundred sixty degrees", "turn 360 degrees"),
        ("turn one hundred and eighty deg", "turn 180 deg"),
        ("walk forty-five meters", "walk 45 meters"),
        ("أستدير تسعين درجة", "أستدير 90 درجة"),
        ("لف يمين تسعون درجة", "لف يمين 90 درجة"),
        ("أمشي خمس خطوات", "أمشي 5 خطوات"),
        ("أستدير مائة وثمانين درجة", "أستدير 180 درجة"),
        ("turn 90 degrees", "turn 90 degrees"),  # already digits
        # 'one' inside 'someone' must NOT be eaten
        ("I see someone there.", "I see someone there."),
        # 'ten' inside 'often' must NOT be eaten
        ("often", "often"),
    ]
    failures = 0
    for inp, expected in cases:
        got = normalise_numbers(inp)
        passed = got == expected
        mark = "✓" if passed else "✗"
        print(f" {mark} {inp!r:45s} -> {got!r}")
        if not passed:
            failures += 1
            print(f"     expected {expected!r}")
    print(f"\n{len(cases) - failures}/{len(cases)} passed")
    raise SystemExit(1 if failures else 0)
|