199 lines
6.9 KiB
Python
199 lines
6.9 KiB
Python
"""
_language_tables.py — load Config/language_tables.json once and cache.

All Voice/* modules that need vocabulary data import from here instead
of hardcoding tables. The JSON file is the single source of truth for
Arabic verbs / directions / units / numbers / English number words /
motion inverses / sequence-never-record list.

Loud failure on missing/malformed file (matches Marcus's 'no silent
config degradation' policy from earlier rounds).

Public API:
    LANG = load()                                ← dict with all top-level sections
    LANG["english_numbers"]["ones"]              → {word: int}
    LANG["arabic_verbs"]["walking"]              → list of Arabic root strings
    LANG["arabic_duals"]["2 steps"]              → list of dual-form strings
    LANG["motion_inverses"]                      → flat dict of pairs
    LANG["sequence_never_record"]["canonicals"]  → list of canonical names

Convenience flatteners (build the inverse maps consumers usually want):
    flat_arabic_verbs()        → {ar_root: en_gerund}
    flat_arabic_directions()   → {ar_word: en_direction}
    flat_arabic_units()        → {ar_unit: en_unit}
    flat_arabic_duals()        → {ar_dual_word: 'N units'}
    flat_arabic_conjunctions() → {ar_conj: ' english_glue '}
    flat_arabic_connectives()  → {ar_word: en_word}
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
from typing import Dict, List
|
|
|
|
# Project root: two directory levels above this file's absolute path
# (i.e. the parent of the directory that contains this module).
_PROJECT_DIR = os.path.dirname(
    os.path.dirname(
        os.path.abspath(__file__)
    )
)

# Put the project root at the front of sys.path exactly once.
# NOTE(review): presumably so project-level packages resolve regardless of
# the current working directory — confirm against the importing modules.
if _PROJECT_DIR not in sys.path:
    sys.path.insert(0, _PROJECT_DIR)

# Absolute location of the JSON vocabulary file read by load().
_CONFIG_PATH = os.path.join(_PROJECT_DIR, "Config", "language_tables.json")

# Process-wide cache filled by load(); empty until the first successful load.
_CACHE: dict = {}
|
|
|
|
|
|
def load() -> dict:
    """Load and cache the language tables.

    The first successful call reads the JSON file at ``_CONFIG_PATH`` and
    stores its top-level sections in the module-level ``_CACHE``; later
    calls return that cache without touching the disk.

    Voice processing is unusable without this file, so every failure is
    loud: it surfaces at startup rather than during a user demo.

    Returns:
        The module-level cache dict itself (not a copy), holding every
        top-level section of the JSON file.

    Raises:
        RuntimeError: if the file is missing, cannot be read or parsed,
            does not contain a JSON object at the top level, or lacks any
            of the required top-level sections.
    """
    if _CACHE:
        return _CACHE

    if not os.path.isfile(_CONFIG_PATH):
        raise RuntimeError(
            "Config/language_tables.json missing at {} — voice motion "
            "vocabulary cannot load without it. Restore from git or "
            "rebuild from the documented schema.".format(_CONFIG_PATH)
        )

    try:
        with open(_CONFIG_PATH, "r", encoding="utf-8") as f:
            # Falsy payloads (null, {}, [], 0, "") collapse to an empty dict
            # and are reported below as missing sections.
            data = json.load(f) or {}
    except Exception as e:
        # Deliberately broad: any read/decode/parse failure must reach the
        # caller as the documented RuntimeError, with the cause chained.
        raise RuntimeError(
            "Config/language_tables.json malformed: {}".format(e),
        ) from e

    # A truthy non-object payload (number, string, list) would otherwise
    # escape as TypeError/ValueError from the checks and update() below.
    if not isinstance(data, dict):
        raise RuntimeError(
            "Config/language_tables.json malformed: top-level JSON value "
            "must be an object, got {}".format(type(data).__name__)
        )

    # Quick sanity — make sure the top-level keys we care about exist
    required = (
        "english_numbers", "arabic_numbers",
        "english_fractions", "arabic_fractions",
        "arabic_verbs", "arabic_directions", "arabic_units",
        "arabic_duals", "arabic_conjunctions", "arabic_connectives",
        "motion_inverses", "sequence_never_record",
    )
    missing = [k for k in required if k not in data]
    if missing:
        raise RuntimeError(
            "Config/language_tables.json missing top-level sections: {}"
            .format(missing)
        )

    _CACHE.update(data)
    return _CACHE
|
|
|
|
|
|
def _invert_grouped(grouped: dict, drop_keys: tuple = ("_description",)) -> Dict[str, str]:
    """Invert ``{english_label: [arabic_word, ...]}`` into ``{arabic_word: english_label}``.

    Args:
        grouped: mapping from a label to a list of words.
        drop_keys: labels to ignore entirely (by default the
            ``_description`` annotation carried in the JSON).

    Returns:
        A flat dict keyed by word. Labels whose value is not a list are
        skipped, as are entries that are not strings or are blank after
        stripping. Words are stored exactly as given (not stripped). When
        the same word appears under several labels, the last label wins.
    """
    inverted: Dict[str, str] = {}
    for label, words in grouped.items():
        if label not in drop_keys and isinstance(words, list):
            inverted.update(
                (word, label)
                for word in words
                if isinstance(word, str) and word.strip()
            )
    return inverted
|
|
|
|
|
|
def flat_arabic_verbs() -> Dict[str, str]:
    """Return the ``arabic_verbs`` section flattened to ``{arabic_word: english_label}``."""
    verbs_section = load()["arabic_verbs"]
    return _invert_grouped(verbs_section)
|
|
|
|
|
|
def flat_arabic_directions() -> Dict[str, str]:
    """Return the ``arabic_directions`` section flattened to ``{arabic_word: english_label}``."""
    directions_section = load()["arabic_directions"]
    return _invert_grouped(directions_section)
|
|
|
|
|
|
def flat_arabic_units() -> Dict[str, str]:
    """Return the ``arabic_units`` section flattened to ``{arabic_word: english_label}``."""
    units_section = load()["arabic_units"]
    return _invert_grouped(units_section)
|
|
|
|
|
|
def flat_arabic_duals() -> Dict[str, str]:
    """Return the ``arabic_duals`` section flattened to ``{arabic_word: english_label}``."""
    duals_section = load()["arabic_duals"]
    return _invert_grouped(duals_section)
|
|
|
|
|
|
def flat_arabic_conjunctions() -> Dict[str, str]:
    """Return the ``arabic_conjunctions`` section flattened to ``{arabic_word: english_label}``."""
    conjunctions_section = load()["arabic_conjunctions"]
    return _invert_grouped(conjunctions_section)
|
|
|
|
|
|
def flat_arabic_connectives() -> Dict[str, str]:
    """Return the ``arabic_connectives`` section flattened to ``{arabic_word: english_label}``."""
    connectives_section = load()["arabic_connectives"]
    return _invert_grouped(connectives_section)
|
|
|
|
|
|
def english_numbers_ones() -> Dict[str, int]:
    """Return ``english_numbers["ones"]`` as ``{word: int}``.

    The ``_description`` annotation key is left out; every other value is
    passed through ``int()``.
    """
    table: Dict[str, int] = {}
    for word, value in load()["english_numbers"]["ones"].items():
        if word == "_description":
            continue
        table[word] = int(value)
    return table
|
|
|
|
|
|
def english_numbers_tens() -> Dict[str, int]:
    """Return ``english_numbers["tens"]`` as ``{word: int}``.

    The ``_description`` annotation key is left out; every other value is
    passed through ``int()``.
    """
    table: Dict[str, int] = {}
    for word, value in load()["english_numbers"]["tens"].items():
        if word == "_description":
            continue
        table[word] = int(value)
    return table
|
|
|
|
|
|
def english_numbers_scale() -> Dict[str, int]:
    """Return ``english_numbers["scale"]`` as ``{word: int}``.

    The ``_description`` annotation key is left out; every other value is
    passed through ``int()``.
    """
    table: Dict[str, int] = {}
    for word, value in load()["english_numbers"]["scale"].items():
        if word == "_description":
            continue
        table[word] = int(value)
    return table
|
|
|
|
|
|
def english_numbers_glue() -> set:
    """Return the entries of ``english_numbers["glue"]`` as a set."""
    glue_entries = load()["english_numbers"]["glue"]
    return set(glue_entries)
|
|
|
|
|
|
def arabic_number_literals() -> List[tuple]:
    """Return the Arabic number literals as ``(arabic_word, integer_value)`` pairs.

    Reads ``arabic_numbers["literals"]`` (treated as empty when the key is
    absent) and keeps declaration order. The caller is expected to keep
    that ordering (longest-first) when applying the pairs.

    Entries that are not two-element lists of ``[str, int]`` are skipped.
    Booleans are skipped too: ``bool`` is a subclass of ``int``, so a stray
    JSON ``true``/``false`` would otherwise be returned as a numeric value.
    """
    pairs: List[tuple] = []
    for entry in load()["arabic_numbers"].get("literals", []):
        if not (isinstance(entry, list) and len(entry) == 2):
            continue
        ar, val = entry
        if (
            isinstance(ar, str)
            and isinstance(val, int)
            and not isinstance(val, bool)  # JSON true/false is not a number
        ):
            pairs.append((ar, val))
    return pairs
|
|
|
|
|
|
def motion_inverses() -> Dict[str, str]:
    """Return a copy of the ``motion_inverses`` section without its ``_description`` key."""
    pairs: Dict[str, str] = {}
    for name, inverse in load()["motion_inverses"].items():
        if name == "_description":
            continue
        pairs[name] = inverse
    return pairs
|
|
|
|
|
|
def sequence_never_record() -> set:
    """Return ``sequence_never_record["canonicals"]`` as a set (empty when the key is absent)."""
    section = load()["sequence_never_record"]
    canonicals = section.get("canonicals", [])
    return set(canonicals)
|
|
|
|
|
|
def english_fractions_additive() -> Dict[str, float]:
    """English fraction words that COMBINE with a preceding integer
    ('3 and a half' → 3 + 0.5).

    Built from ``english_fractions["additive"]``: the ``_description`` key
    is dropped and every remaining value is converted with ``float()``.
    """
    fractions: Dict[str, float] = {}
    for word, value in load()["english_fractions"]["additive"].items():
        if word == "_description":
            continue
        fractions[word] = float(value)
    return fractions
|
|
|
|
|
|
def english_fractions_leading() -> Dict[str, float]:
    """English fraction words that STAND ALONE before a unit
    ('half a meter' → 0.5 meter).

    Built from ``english_fractions["leading"]``: the ``_description`` key
    is dropped and every remaining value is converted with ``float()``.
    """
    fractions: Dict[str, float] = {}
    for word, value in load()["english_fractions"]["leading"].items():
        if word == "_description":
            continue
        fractions[word] = float(value)
    return fractions
|
|
|
|
|
|
def arabic_fractions_additive() -> Dict[str, float]:
    """Arabic fraction words that COMBINE with a preceding digit
    via و conjunction ('3 ونصف' → 3.5).

    Built from ``arabic_fractions["additive"]``: the ``_description`` key
    is dropped and every remaining value is converted with ``float()``.
    """
    fractions: Dict[str, float] = {}
    for word, value in load()["arabic_fractions"]["additive"].items():
        if word == "_description":
            continue
        fractions[word] = float(value)
    return fractions
|
|
|
|
|
|
def arabic_fractions_leading() -> Dict[str, float]:
    """Arabic fraction words STANDING ALONE before a unit
    ('نصف متر' → 0.5 meter).

    Built from ``arabic_fractions["leading"]``: the ``_description`` key
    is dropped and every remaining value is converted with ``float()``.
    """
    fractions: Dict[str, float] = {}
    for word, value in load()["arabic_fractions"]["leading"].items():
        if word == "_description":
            continue
        fractions[word] = float(value)
    return fractions
|
|
|
|
|
|
def arabic_unit_words() -> set:
    """Set of all Arabic unit words from arabic_units. Used by the
    fraction parser to detect 'N <unit> ونصف' patterns.

    Only list-valued entries of the section contribute; within them,
    non-string items and strings that are blank after stripping are
    ignored. Words are stored exactly as given (not stripped).
    """
    words = set()
    for group in load()["arabic_units"].values():
        if not isinstance(group, list):
            continue
        words.update(
            item
            for item in group
            if isinstance(item, str) and item.strip()
        )
    return words
|