123 lines
3.8 KiB
Python
123 lines
3.8 KiB
Python
"""Arabic text normalization and voice-command phrase matching.

Ported from gemini_interact/sanad_text_utils.py — unified for Sanad.
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
# Arabic diacritics (tashkeel) — stripped for matching.
_DIACRITICS_RE = re.compile(r"[\u0617-\u061A\u064B-\u0652\u0670\u06D6-\u06ED]")
# Arabic question mark, comma and semicolon — treated as word separators.
_AR_PUNCT = re.compile(r"[؟،؛]")
# Everything except word characters, the U+0600–U+06FF block and whitespace.
_NON_WORD = re.compile(r"[^\w\u0600-\u06FF\s]", re.UNICODE)
_MULTI_WS = re.compile(r"\s+")
# Single-pass letter folding: none of the replacement characters is itself a
# key, so one translate() call is equivalent to chaining the replacements.
_LETTER_FOLD = str.maketrans(
    {
        "\u0623": "\u0627",  # أ → ا
        "\u0625": "\u0627",  # إ → ا
        "\u0622": "\u0627",  # آ → ا
        "\u0629": "\u0647",  # ة → ه
        "\u0649": "\u064A",  # ى → ي
        "\u0640": None,  # tatweel (kashida) is dropped
    }
)


def normalize_arabic(text: str) -> str:
    """Normalize Arabic + English text for matching.

    Steps, in order:

    1. Trim and lowercase.
    2. Replace Arabic punctuation and every other non-word character with
       a space.
    3. Fold hamza-carrying alif forms to bare alif, ta marbuta to ha and
       alif maqsoora to ya; drop tatweel.
    4. Strip diacritics.
    5. Collapse whitespace runs to a single space and trim.

    Args:
        text: Raw input text.

    Returns:
        The normalized string, with words separated by exactly one space.
        Empty when nothing matchable remains.
    """
    s = text.strip().lower()
    s = _AR_PUNCT.sub(" ", s)
    s = _NON_WORD.sub(" ", s)
    s = s.translate(_LETTER_FOLD)
    s = _DIACRITICS_RE.sub("", s)
    # Collapse whitespace last: deleting a standalone tatweel or diacritic
    # can leave two adjacent spaces that an earlier collapse would miss.
    return _MULTI_WS.sub(" ", s).strip()
|
||
|
||
|
||
def strip_diacritics(text: str) -> str:
    """Return *text* with every character matched by ``_DIACRITICS_RE`` removed.

    No other normalization is applied; case, punctuation and whitespace are
    left untouched.
    """
    without_marks = re.sub(_DIACRITICS_RE, "", text)
    return without_marks
|
||
|
||
|
||
# Header line, e.g. ``WAKE_PHRASES_shake_hand = {``.
_PHRASE_HEADER_RE = re.compile(r"WAKE_PHRASES_(\w+)\s*=\s*\{")
# Leading run of double-quoted phrases separated by commas/whitespace.
_PHRASE_RUN_RE = re.compile(r'(?:"[^"]+"[\s,]*)+')
# One double-quoted phrase inside such a run.
_QUOTED_PHRASE_RE = re.compile(r'"([^"]+)"')


def load_phrase_map(filepath: str | Path) -> dict[str, set[str]]:
    """Load a phrase file mapping command names to trigger phrases.

    Format (per command):
        WAKE_PHRASES_shake_hand = {
            "مصافحه", "handshake", "shake hands",
        }

    Parsing rules:

    * Blank lines and lines starting with ``#`` are skipped.
    * A header opens a new command; a header that appears before the
      previous command was closed implicitly ends that command.
    * Any number of double-quoted phrases may share a line, including the
      header line itself. Parsing of a line stops at the first token that
      is not a quoted phrase, so trailing comments are not scanned.
    * ``}`` at the start of a line, or directly after the phrases on a
      line, closes the current command.
    * Phrases outside any command are ignored.
    * Every phrase is passed through ``normalize_arabic``; phrases that
      normalize to an empty string are dropped, and commands left without
      phrases are omitted from the result.

    Args:
        filepath: Path of the phrase file.

    Returns:
        ``{"shake_hand": {"مصافحه", "handshake", ...}, ...}``. Empty when
        *filepath* does not refer to an existing regular file.
    """
    path = Path(filepath)
    # is_file() rather than exists(): a directory path is treated like a
    # missing file instead of raising from read_text().
    if not path.is_file():
        return {}

    # utf-8-sig decodes plain UTF-8 too, and drops a leading BOM that would
    # otherwise hide a header on the first line.
    content = path.read_text(encoding="utf-8-sig")
    result: dict[str, set[str]] = {}
    current_name: str | None = None
    current_phrases: set[str] = set()

    for raw_line in content.splitlines():
        rest = raw_line.strip()
        if not rest or rest.startswith("#"):
            continue

        header_match = _PHRASE_HEADER_RE.match(rest)
        if header_match:
            # A new header implicitly ends an unterminated previous command.
            if current_name and current_phrases:
                result[current_name] = current_phrases
            current_name = header_match.group(1)
            current_phrases = set()
            # Keep parsing: phrases may follow the brace on the same line.
            rest = rest[header_match.end():].lstrip()

        phrase_run = _PHRASE_RUN_RE.match(rest)
        if phrase_run:
            if current_name is not None:
                for quoted in _QUOTED_PHRASE_RE.findall(phrase_run.group(0)):
                    phrase = normalize_arabic(quoted)
                    if phrase:
                        current_phrases.add(phrase)
            rest = rest[phrase_run.end():]

        # Closing brace, alone on the line or right after the phrases.
        if rest.startswith("}"):
            if current_name and current_phrases:
                result[current_name] = current_phrases
            current_name = None
            current_phrases = set()

    # File ended without a closing brace for the last command.
    if current_name and current_phrases:
        result[current_name] = current_phrases

    return result
|
||
|
||
|
||
def match_phrase(text: str, phrase_sets: dict[str, set[str]]) -> str | None:
    """Pick the command whose trigger phrase is contained in *text*.

    *text* is normalized with ``normalize_arabic`` and split into a set of
    words. A phrase matches when each of its whitespace-separated words is
    present in that set, so matching is whole-word and ignores word order;
    a short phrase such as 'hi' cannot match inside 'this'. Phrases are
    compared exactly as stored in *phrase_sets* — they are not normalized
    here.

    When several phrases match, the one with the most characters wins; on
    equal length the match found first is kept.

    Returns:
        The winning command name, or ``None`` when *text* normalizes to
        nothing or no phrase matches.
    """
    normalized = normalize_arabic(text)
    words = set(normalized.split()) if normalized else set()
    if not words:
        return None

    winner: str | None = None
    winner_len = 0
    for name, candidates in phrase_sets.items():
        for candidate in candidates:
            required = candidate.split()
            if (
                required
                and len(candidate) > winner_len
                and words.issuperset(required)
            ):
                winner, winner_len = name, len(candidate)
    return winner
|
||
|