# Marcus/Voice/_language_tables.py
#
# 199 lines
# 6.9 KiB
# Python

"""
_language_tables.py — load Config/language_tables.json once and cache.
All Voice/* modules that need vocabulary data import from here instead
of hardcoding tables. The JSON file is the single source of truth for
Arabic verbs / directions / units / duals / conjunctions / connectives /
numbers / fractions, English number words / fractions, motion inverses,
and the sequence-never-record list.
Loud failure on missing/malformed file (matches Marcus's 'no silent
config degradation' policy from earlier rounds).
Public API:
    LANG = load()                                  ← dict with all top-level sections
    LANG["english_numbers"]["ones"]                → {word: int}
    LANG["arabic_verbs"]["walking"]                → list of Arabic root strings
    LANG["arabic_duals"]["2 steps"]                → list of dual-form strings
    LANG["motion_inverses"]                        → flat dict of pairs
    LANG["sequence_never_record"]["canonicals"]    → list of canonical names

Convenience flatteners (build the inverse maps consumers usually want):
    flat_arabic_verbs()        → {ar_root: en_gerund}
    flat_arabic_directions()   → {ar_word: en_direction}
    flat_arabic_units()        → {ar_unit: en_unit}
    flat_arabic_duals()        → {ar_dual_word: 'N units'}
    flat_arabic_conjunctions() → {ar_conj: ' english_glue '}
    flat_arabic_connectives()  → {ar_word: en_word}
"""
from __future__ import annotations
import json
import os
import sys
from typing import Dict, List
# Project root: two directory levels above this file's absolute path.
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Put the project root at the front of sys.path (only if it is not already
# listed) so it is searched first when imports are resolved.
if _PROJECT_DIR not in sys.path:
    sys.path.insert(0, _PROJECT_DIR)
# Absolute path of the JSON file that load() reads.
_CONFIG_PATH = os.path.join(_PROJECT_DIR, "Config", "language_tables.json")
# Module-level cache filled by load(); stays empty until a load succeeds.
_CACHE: dict = {}
def load() -> dict:
    """Load and cache the language tables.

    The first successful call parses Config/language_tables.json and
    stores its top-level sections in the module-level cache; later calls
    return that same cached dict without reading the file again.

    Voice processing is unusable without this file, so every problem is
    reported loudly at startup rather than during a user demo.

    Returns:
        The cached dict of top-level sections. It is the shared cache
        object itself, so callers should treat it as read-only.

    Raises:
        RuntimeError: if the file is missing, cannot be read or parsed,
            is not a JSON object at the top level, lacks a required
            section, or has a required section that is not a JSON
            object. Nothing is cached when any of these checks fail.
    """
    if _CACHE:
        return _CACHE

    if not os.path.isfile(_CONFIG_PATH):
        raise RuntimeError(
            "Config/language_tables.json missing at {} — voice motion "
            "vocabulary cannot load without it. Restore from git or "
            "rebuild from the documented schema.".format(_CONFIG_PATH)
        )

    try:
        with open(_CONFIG_PATH, "r", encoding="utf-8") as f:
            # An empty document (null / {} / []) is treated as "no sections"
            # and reported by the missing-sections check below.
            data = json.load(f) or {}
    except Exception as e:
        # Boundary handler: any read/parse failure is re-raised as the single
        # exception type callers are documented to expect, with the cause kept.
        raise RuntimeError(
            "Config/language_tables.json malformed: {}".format(e),
        ) from e

    # A bare number/string/list at the top level would otherwise fail with a
    # TypeError (or a misleading message) in the membership checks below.
    if not isinstance(data, dict):
        raise RuntimeError(
            "Config/language_tables.json malformed: top-level value must be "
            "a JSON object, got {}".format(type(data).__name__)
        )

    # Quick sanity — make sure the top-level keys we care about exist
    required = (
        "english_numbers", "arabic_numbers",
        "english_fractions", "arabic_fractions",
        "arabic_verbs", "arabic_directions", "arabic_units",
        "arabic_duals", "arabic_conjunctions", "arabic_connectives",
        "motion_inverses", "sequence_never_record",
    )
    missing = [k for k in required if k not in data]
    if missing:
        raise RuntimeError(
            "Config/language_tables.json missing top-level sections: {}"
            .format(missing)
        )

    # Every accessor in this module indexes or iterates these sections as
    # mappings, so a null/list/scalar section must fail here, not mid-demo.
    not_objects = [k for k in required if not isinstance(data[k], dict)]
    if not_objects:
        raise RuntimeError(
            "Config/language_tables.json sections must be JSON objects: {}"
            .format(not_objects)
        )

    _CACHE.update(data)
    return _CACHE
def _invert_grouped(grouped: dict, drop_keys: tuple = ("_description",)) -> Dict[str, str]:
    """Turn {english_label: [arabic_word, ...]} into {arabic_word: english_label}.

    Labels listed in drop_keys are ignored (this is how the JSON's
    '_description' annotation is skipped), as are labels whose value is
    not a list. Within a list, only non-blank strings become keys. When
    the same Arabic word appears under several labels, the last label
    encountered wins.
    """
    flipped: Dict[str, str] = {}
    for label, words in grouped.items():
        if label not in drop_keys and isinstance(words, list):
            flipped.update(
                (word, label)
                for word in words
                if isinstance(word, str) and word.strip()
            )
    return flipped
def flat_arabic_verbs() -> Dict[str, str]:
    """Return the 'arabic_verbs' section inverted into {ar_root: en_gerund}."""
    verbs_by_label = load()["arabic_verbs"]
    return _invert_grouped(verbs_by_label)
def flat_arabic_directions() -> Dict[str, str]:
    """Return the 'arabic_directions' section inverted into {ar_word: en_direction}."""
    directions_by_label = load()["arabic_directions"]
    return _invert_grouped(directions_by_label)
def flat_arabic_units() -> Dict[str, str]:
    """Return the 'arabic_units' section inverted into {ar_unit: en_unit}."""
    units_by_label = load()["arabic_units"]
    return _invert_grouped(units_by_label)
def flat_arabic_duals() -> Dict[str, str]:
    """Return the 'arabic_duals' section inverted into {ar_dual_word: 'N units'}."""
    duals_by_label = load()["arabic_duals"]
    return _invert_grouped(duals_by_label)
def flat_arabic_conjunctions() -> Dict[str, str]:
    """Return the 'arabic_conjunctions' section inverted into {ar_conj: english_glue}."""
    conjunctions_by_label = load()["arabic_conjunctions"]
    return _invert_grouped(conjunctions_by_label)
def flat_arabic_connectives() -> Dict[str, str]:
    """Return the 'arabic_connectives' section inverted into {ar_word: en_word}."""
    connectives_by_label = load()["arabic_connectives"]
    return _invert_grouped(connectives_by_label)
def english_numbers_ones() -> Dict[str, int]:
    """Return english_numbers['ones'] as {word: int}.

    The '_description' annotation key is skipped; every other value is
    converted with int().
    """
    ones_table = load()["english_numbers"]["ones"]
    words: Dict[str, int] = {}
    for word, value in ones_table.items():
        if word != "_description":
            words[word] = int(value)
    return words
def english_numbers_tens() -> Dict[str, int]:
    """Return english_numbers['tens'] as {word: int}.

    The '_description' annotation key is skipped; every other value is
    converted with int().
    """
    tens_table = load()["english_numbers"]["tens"]
    words: Dict[str, int] = {}
    for word, value in tens_table.items():
        if word != "_description":
            words[word] = int(value)
    return words
def english_numbers_scale() -> Dict[str, int]:
    """Return english_numbers['scale'] as {word: int}.

    The '_description' annotation key is skipped; every other value is
    converted with int().
    """
    scale_table = load()["english_numbers"]["scale"]
    words: Dict[str, int] = {}
    for word, value in scale_table.items():
        if word != "_description":
            words[word] = int(value)
    return words
def english_numbers_glue() -> set:
    """Return the entries of english_numbers['glue'] as a set."""
    english_numbers = load()["english_numbers"]
    glue_words = english_numbers["glue"]
    return set(glue_words)
def arabic_number_literals() -> List[tuple]:
    """List of (arabic_word, integer_value) pairs, in declaration order.

    Reads arabic_numbers['literals'] (treated as empty when the key is
    absent). Only two-element list entries whose first item is a str and
    whose second item is a genuine int are kept; anything else is
    skipped. JSON booleans are rejected explicitly: bool is a subclass
    of int in Python, so a stray `true` would otherwise be accepted as
    the number 1.

    Caller is expected to keep ordering (longest-first) when applying.
    """
    out: List[tuple] = []
    for entry in load()["arabic_numbers"].get("literals", []):
        if not (isinstance(entry, list) and len(entry) == 2):
            continue
        ar, val = entry
        # bool must be excluded by hand because isinstance(True, int) is True.
        if (
            isinstance(ar, str)
            and isinstance(val, int)
            and not isinstance(val, bool)
        ):
            out.append((ar, val))
    return out
def motion_inverses() -> Dict[str, str]:
    """Return a copy of the 'motion_inverses' section without its
    '_description' annotation entry."""
    inverses = dict(load()["motion_inverses"])
    inverses.pop("_description", None)
    return inverses
def sequence_never_record() -> set:
    """Return sequence_never_record['canonicals'] as a set (empty when
    the 'canonicals' key is absent)."""
    section = load()["sequence_never_record"]
    canonicals = section.get("canonicals", [])
    return set(canonicals)
def english_fractions_additive() -> Dict[str, float]:
    """English fraction words that COMBINE with a preceding integer
    ('3 and a half' → 3 + 0.5).

    Built from english_fractions['additive']; the '_description' key is
    skipped and every other value is converted with float().
    """
    additive = load()["english_fractions"]["additive"]
    fractions: Dict[str, float] = {}
    for word, value in additive.items():
        if word != "_description":
            fractions[word] = float(value)
    return fractions
def english_fractions_leading() -> Dict[str, float]:
    """English fraction words that STAND ALONE before a unit
    ('half a meter' → 0.5 meter).

    Built from english_fractions['leading']; the '_description' key is
    skipped and every other value is converted with float().
    """
    leading = load()["english_fractions"]["leading"]
    fractions: Dict[str, float] = {}
    for word, value in leading.items():
        if word != "_description":
            fractions[word] = float(value)
    return fractions
def arabic_fractions_additive() -> Dict[str, float]:
    """Arabic fraction words that COMBINE with a preceding digit
    via و conjunction ('3 ونصف' → 3.5).

    Built from arabic_fractions['additive']; the '_description' key is
    skipped and every other value is converted with float().
    """
    additive = load()["arabic_fractions"]["additive"]
    fractions: Dict[str, float] = {}
    for word, value in additive.items():
        if word != "_description":
            fractions[word] = float(value)
    return fractions
def arabic_fractions_leading() -> Dict[str, float]:
    """Arabic fraction words STANDING ALONE before a unit
    ('نصف متر' → 0.5 meter).

    Built from arabic_fractions['leading']; the '_description' key is
    skipped and every other value is converted with float().
    """
    leading = load()["arabic_fractions"]["leading"]
    fractions: Dict[str, float] = {}
    for word, value in leading.items():
        if word != "_description":
            fractions[word] = float(value)
    return fractions
def arabic_unit_words() -> set:
    """Collect every Arabic unit word listed in the 'arabic_units' section.

    Only list-valued entries are scanned, and only non-blank strings
    inside them are kept. Used by the fraction parser to detect
    'N <unit> ونصف' patterns.
    """
    units_section = load()["arabic_units"]
    return {
        word
        for words in units_section.values()
        if isinstance(words, list)
        for word in words
        if isinstance(word, str) and word.strip()
    }