# Marcus/Voice/_language_tables.py

"""
_language_tables.py — load Config/language_tables.json once and cache.

All Voice/* modules that need vocabulary data import from here instead
of hardcoding tables. The JSON file is the single source of truth for
Arabic verbs / directions / units / numbers / English number words /
motion inverses / sequence-never-record list.

Loud failure on missing/malformed file (matches Marcus's 'no silent
config degradation' policy from earlier rounds).

Public API:
    LANG = load()   ← dict with all top-level sections
    LANG["english_numbers"]["ones"]    → {word: int}
    LANG["arabic_verbs"]["walking"]    → list of Arabic root strings
    LANG["arabic_duals"]["2 steps"]    → list of dual-form strings
    LANG["motion_inverses"]            → flat dict of pairs
    LANG["sequence_never_record"]["canonicals"] → list of canonical names

Convenience flatteners (build the inverse maps consumers usually want):
    flat_arabic_verbs()       → {ar_root: en_gerund}
    flat_arabic_directions()  → {ar_word: en_direction}
    flat_arabic_units()       → {ar_unit: en_unit}
    flat_arabic_duals()       → {ar_dual_word: 'N units'}
    flat_arabic_conjunctions()→ {ar_conj: ' english_glue '}
    flat_arabic_connectives() → {ar_word: en_word}
"""
from __future__ import annotations

import json
import os
import sys
from typing import Dict, List

# Directory two levels above this file: the parent of the directory that
# contains this module.
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Put that directory at the front of sys.path unless it is already listed.
# NOTE(review): presumably so project-root imports resolve when this module
# is loaded directly — confirm against the callers.
if _PROJECT_DIR not in sys.path:
    sys.path.insert(0, _PROJECT_DIR)

# Absolute path of the JSON file that load() reads.
_CONFIG_PATH = os.path.join(_PROJECT_DIR, "Config", "language_tables.json")
# Module-level cache: empty until the first successful load() fills it.
_CACHE: dict = {}


def load() -> dict:
    """Load Config/language_tables.json once and return the cached tables.

    The first successful call parses the file at ``_CONFIG_PATH``, checks
    that the top-level value is a JSON object containing every required
    section, and stores it in the module-level ``_CACHE``. Later calls
    return that same dict without touching the disk.

    Returns:
        The shared cache dict holding all top-level sections. This is the
        cache object itself, not a copy.

    Raises:
        RuntimeError: if the file is missing, cannot be read or parsed, is
            not a JSON object at the top level, or lacks a required
            section. Voice processing is unusable without the tables, so
            loud failure surfaces the problem at startup rather than
            during a user demo.
    """
    if _CACHE:
        return _CACHE
    if not os.path.isfile(_CONFIG_PATH):
        raise RuntimeError(
            "Config/language_tables.json missing at {} — voice motion "
            "vocabulary cannot load without it. Restore from git or "
            "rebuild from the documented schema.".format(_CONFIG_PATH)
        )
    try:
        with open(_CONFIG_PATH, "r", encoding="utf-8") as f:
            # Falsy documents (null, {}, [], 0, "") collapse to an empty
            # dict so they are reported by the missing-sections check.
            data = json.load(f) or {}
    except Exception as e:
        # Read, decode and parse failures are all reported the same way;
        # the original exception stays attached as the cause.
        raise RuntimeError(
            "Config/language_tables.json malformed: {}".format(e),
        ) from e
    # A non-object top level (array, string, number) would otherwise slip
    # past the key check below or crash in _CACHE.update() with a
    # ValueError/TypeError instead of the RuntimeError callers expect.
    if not isinstance(data, dict):
        raise RuntimeError(
            "Config/language_tables.json malformed: top-level JSON value "
            "must be an object, got {}".format(type(data).__name__)
        )
    # Quick sanity — make sure the top-level keys we care about exist
    required = (
        "english_numbers", "arabic_numbers",
        "english_fractions", "arabic_fractions",
        "arabic_verbs", "arabic_directions", "arabic_units",
        "arabic_duals", "arabic_conjunctions", "arabic_connectives",
        "motion_inverses", "sequence_never_record",
    )
    missing = [k for k in required if k not in data]
    if missing:
        raise RuntimeError(
            "Config/language_tables.json missing top-level sections: {}"
            .format(missing)
        )
    _CACHE.update(data)
    return _CACHE


def _invert_grouped(grouped: dict, drop_keys: tuple = ("_description",)) -> Dict[str, str]:
    """Turn {english_label: [arabic_word, ...]} into {arabic_word: english_label}.

    Labels listed in drop_keys are ignored (this is how the JSON's
    '_description' annotation is skipped), as are labels whose value is
    not a list. Within a list, only non-blank strings become keys. When
    the same word appears under several labels, the label seen last wins.
    """
    flipped: Dict[str, str] = {}
    for label, words in grouped.items():
        if label in drop_keys or not isinstance(words, list):
            continue
        flipped.update(
            (word, label)
            for word in words
            if isinstance(word, str) and word.strip()
        )
    return flipped


def flat_arabic_verbs() -> Dict[str, str]:
    """Return the 'arabic_verbs' section flipped into {arabic_word: label}.

    Every string listed under a label becomes a key mapped to that label;
    the '_description' entry is skipped.
    """
    verbs_by_label = load()["arabic_verbs"]
    return _invert_grouped(verbs_by_label)


def flat_arabic_directions() -> Dict[str, str]:
    """Return the 'arabic_directions' section flipped into
    {arabic_word: label}, skipping the '_description' entry."""
    directions_by_label = load()["arabic_directions"]
    return _invert_grouped(directions_by_label)


def flat_arabic_units() -> Dict[str, str]:
    """Return the 'arabic_units' section flipped into
    {arabic_word: label}, skipping the '_description' entry."""
    units_by_label = load()["arabic_units"]
    return _invert_grouped(units_by_label)


def flat_arabic_duals() -> Dict[str, str]:
    """Return the 'arabic_duals' section flipped into
    {arabic_dual_word: label}, skipping the '_description' entry."""
    duals_by_label = load()["arabic_duals"]
    return _invert_grouped(duals_by_label)


def flat_arabic_conjunctions() -> Dict[str, str]:
    """Return the 'arabic_conjunctions' section flipped into
    {arabic_word: label}, skipping the '_description' entry."""
    conjunctions_by_label = load()["arabic_conjunctions"]
    return _invert_grouped(conjunctions_by_label)


def flat_arabic_connectives() -> Dict[str, str]:
    """Return the 'arabic_connectives' section flipped into
    {arabic_word: label}, skipping the '_description' entry."""
    connectives_by_label = load()["arabic_connectives"]
    return _invert_grouped(connectives_by_label)


def english_numbers_ones() -> Dict[str, int]:
    """Return english_numbers['ones'] as {word: int}.

    Values are passed through int(); the '_description' key is left out.
    """
    ones_table: Dict[str, int] = {}
    for word, value in load()["english_numbers"]["ones"].items():
        if word == "_description":
            continue
        ones_table[word] = int(value)
    return ones_table


def english_numbers_tens() -> Dict[str, int]:
    """Return english_numbers['tens'] as {word: int}.

    Values are passed through int(); the '_description' key is left out.
    """
    tens_table: Dict[str, int] = {}
    for word, value in load()["english_numbers"]["tens"].items():
        if word == "_description":
            continue
        tens_table[word] = int(value)
    return tens_table


def english_numbers_scale() -> Dict[str, int]:
    """Return english_numbers['scale'] as {word: int}.

    Values are passed through int(); the '_description' key is left out.
    """
    scale_table: Dict[str, int] = {}
    for word, value in load()["english_numbers"]["scale"].items():
        if word == "_description":
            continue
        scale_table[word] = int(value)
    return scale_table


def english_numbers_glue() -> set:
    """Return the entries of english_numbers['glue'] as a set."""
    glue_entries = load()["english_numbers"]["glue"]
    return set(glue_entries)


def arabic_number_literals() -> List[tuple]:
    """Return (arabic_word, integer_value) pairs in declaration order.

    Reads arabic_numbers['literals'] (an absent key yields an empty list)
    and keeps only entries that are two-element lists of a str followed by
    an int; anything else is skipped. The caller is expected to keep the
    ordering (longest-first) when applying the pairs.
    """
    literals = load()["arabic_numbers"].get("literals", [])
    return [
        (entry[0], entry[1])
        for entry in literals
        if isinstance(entry, list)
        and len(entry) == 2
        and isinstance(entry[0], str)
        and isinstance(entry[1], int)
    ]


def motion_inverses() -> Dict[str, str]:
    """Return a new dict with the 'motion_inverses' pairs, leaving out the
    '_description' key."""
    pairs: Dict[str, str] = {}
    for name, inverse in load()["motion_inverses"].items():
        if name == "_description":
            continue
        pairs[name] = inverse
    return pairs


def sequence_never_record() -> set:
    """Return sequence_never_record['canonicals'] as a set; an absent
    'canonicals' key yields an empty set."""
    section = load()["sequence_never_record"]
    canonicals = section.get("canonicals", [])
    return set(canonicals)


def english_fractions_additive() -> Dict[str, float]:
    """English fraction words that COMBINE with a preceding integer
    ('3 and a half' → 3 + 0.5).

    Built from english_fractions['additive'] with values passed through
    float(); the '_description' key is left out.
    """
    additive: Dict[str, float] = {}
    for word, value in load()["english_fractions"]["additive"].items():
        if word == "_description":
            continue
        additive[word] = float(value)
    return additive


def english_fractions_leading() -> Dict[str, float]:
    """English fraction words that STAND ALONE before a unit
    ('half a meter' → 0.5 meter).

    Built from english_fractions['leading'] with values passed through
    float(); the '_description' key is left out.
    """
    leading: Dict[str, float] = {}
    for word, value in load()["english_fractions"]["leading"].items():
        if word == "_description":
            continue
        leading[word] = float(value)
    return leading


def arabic_fractions_additive() -> Dict[str, float]:
    """Arabic fraction words that COMBINE with a preceding digit
    via the و conjunction ('3 ونصف' → 3.5).

    Built from arabic_fractions['additive'] with values passed through
    float(); the '_description' key is left out.
    """
    additive: Dict[str, float] = {}
    for word, value in load()["arabic_fractions"]["additive"].items():
        if word == "_description":
            continue
        additive[word] = float(value)
    return additive


def arabic_fractions_leading() -> Dict[str, float]:
    """Arabic fraction words STANDING ALONE before a unit
    ('نصف متر' → 0.5 meter).

    Built from arabic_fractions['leading'] with values passed through
    float(); the '_description' key is left out.
    """
    leading: Dict[str, float] = {}
    for word, value in load()["arabic_fractions"]["leading"].items():
        if word == "_description":
            continue
        leading[word] = float(value)
    return leading


def arabic_unit_words() -> set:
    """Set of all Arabic unit words from arabic_units. Used by the
    fraction parser to detect 'N <unit> ونصف' patterns.

    Only non-blank strings found inside list-valued entries are included.
    The '_description' annotation key is skipped, so the result is exactly
    the key set of flat_arabic_units().
    """
    # Reuse the shared inverter rather than repeating its filtering here;
    # iterating the resulting dict yields the Arabic words (its keys).
    return set(_invert_grouped(load()["arabic_units"]))