Sanad_lite/voice/text_utils.py

123 lines
3.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Arabic text normalization and voice-command phrase matching.
Ported from gemini_interact/sanad_text_utils.py — unified for Sanad.
"""
from __future__ import annotations
import re
from pathlib import Path
from typing import Any
# Arabic diacritics (tashkeel) — stripped for matching.
_DIACRITICS_RE = re.compile(r"[\u0617-\u061A\u064B-\u0652\u0670\u06D6-\u06ED]")
# Arabic question mark, comma and semicolon. They live inside U+0600–U+06FF,
# which _NON_WORD keeps, so they need a pattern of their own.
_AR_PUNCT = re.compile(r"[؟،؛]")
# Anything that is not a word character, an Arabic-block character or whitespace.
_NON_WORD = re.compile(r"[^\w\u0600-\u06FF\s]", re.UNICODE)
_MULTI_WS = re.compile(r"\s+")
# Single-pass letter folding applied by normalize_arabic().
_LETTER_FOLD = str.maketrans({
    "\u0623": "\u0627",  # أ → ا  (hamza above alif → bare alif)
    "\u0625": "\u0627",  # إ → ا  (hamza below alif → bare alif)
    "\u0622": "\u0627",  # آ → ا  (alif madda → bare alif)
    "\u0629": "\u0647",  # ة → ه  (ta marbuta → ha)
    "\u0649": "\u064A",  # ى → ي  (alif maqsoora → ya)
    "\u0640": None,      # tatweel (kashida) is dropped entirely
})


def normalize_arabic(text: str) -> str:
    """Normalize Arabic + English text for matching.

    Steps, in order:

    1. Trim and lowercase.
    2. Replace Arabic punctuation and every other non-word character
       with a space.
    3. Fold hamza-alif variants to bare alif, ta marbuta to ha and
       alif maqsoora to ya; drop tatweel.
    4. Strip diacritics.
    5. Collapse whitespace runs to a single space and trim.

    Whitespace is collapsed *last* because steps 3 and 4 delete
    characters: a standalone tatweel or diacritic between two words
    would otherwise leave a double space in the result.

    Args:
        text: Raw input text.

    Returns:
        The normalized string; empty if nothing matchable remains.
    """
    s = text.strip().lower()
    s = _AR_PUNCT.sub(" ", s)
    s = _NON_WORD.sub(" ", s)
    s = s.translate(_LETTER_FOLD)
    s = _DIACRITICS_RE.sub("", s)
    # Collapse only after every deletion so the output is canonical.
    s = _MULTI_WS.sub(" ", s)
    return s.strip()
def strip_diacritics(text: str) -> str:
    """Return *text* with every character matched by ``_DIACRITICS_RE`` removed.

    No other normalization is applied: case, punctuation and whitespace
    are left exactly as they were.
    """
    without_marks = _DIACRITICS_RE.sub("", text)
    return without_marks
# Header opening a command's phrase set:  WAKE_PHRASES_<name> = {
_PHRASE_HEADER_RE = re.compile(r"WAKE_PHRASES_(\w+)\s*=\s*\{")
# Tokens that matter inside a phrase block: a double-quoted phrase, the
# start of a trailing comment, or the closing brace.
_PHRASE_TOKEN_RE = re.compile(r'"(?P<phrase>[^"]*)"|(?P<comment>#)|(?P<close>\})')


def load_phrase_map(filepath: str | Path) -> dict[str, set[str]]:
    """Load a phrase file mapping command names to trigger phrases.

    Format (per command)::

        WAKE_PHRASES_shake_hand = {
            "مصافحه", "handshake", "shake hands",
        }

    Parsing rules:

    * Blank lines and lines starting with ``#`` are ignored.
    * Any number of double-quoted phrases may share a line, including
      the header line itself.
    * A ``#`` outside quotes starts a comment that runs to end of line.
    * A ``}`` outside quotes closes the current command; a new header
      also closes a command whose ``}`` is missing.
    * Each phrase is passed through ``normalize_arabic``; phrases that
      normalize to the empty string are skipped.
    * Commands that end up with no phrases are omitted from the result.
    * If a command name appears more than once, the last definition wins.

    Args:
        filepath: Path of the phrase file (read as UTF-8).

    Returns:
        ``{"shake_hand": {"مصافحه", "handshake", ...}, ...}``; an empty
        dict when the file does not exist.
    """
    path = Path(filepath)
    if not path.exists():
        return {}
    content = path.read_text(encoding="utf-8")

    result: dict[str, set[str]] = {}
    current_name: str | None = None
    current_phrases: set[str] = set()

    for raw_line in content.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue

        header_match = _PHRASE_HEADER_RE.match(line)
        if header_match:
            # A new header implicitly ends a block that was never closed.
            if current_name and current_phrases:
                result[current_name] = current_phrases
            current_name = header_match.group(1)
            current_phrases = set()
            # Keep scanning the rest of the line so one-line definitions
            # such as  NAME = {"a", "b"}  are parsed too.
            line = line[header_match.end():]

        if current_name is None:
            # Text outside any command block carries no phrases.
            continue

        for token in _PHRASE_TOKEN_RE.finditer(line):
            if token.group("comment"):
                break  # the remainder of the line is a comment
            if token.group("close"):
                if current_phrases:
                    result[current_name] = current_phrases
                current_name = None
                current_phrases = set()
                break  # nothing after the closing brace belongs to the block
            phrase = normalize_arabic(token.group("phrase"))
            if phrase:
                current_phrases.add(phrase)

    # The file ended while a block was still open.
    if current_name and current_phrases:
        result[current_name] = current_phrases
    return result
def match_phrase(text: str, phrase_sets: dict[str, set[str]]) -> str | None:
    """Pick the command whose trigger phrase best matches *text*.

    *text* is normalized with ``normalize_arabic`` and split into a set
    of words. A phrase matches when each of its whitespace-separated
    words is present in that set, so matching is on whole words only
    (a short phrase such as 'hi' does not match inside 'this').

    Among all matching phrases the one with the greatest character
    length wins; on a tie the phrase encountered first is kept.

    Returns:
        The winning command name, or ``None`` when *text* normalizes to
        nothing or no phrase matches.
    """
    words = set(normalize_arabic(text).split())
    if not words:
        return None

    winner: str | None = None
    winner_len = 0
    for name, candidates in phrase_sets.items():
        for candidate in candidates:
            parts = candidate.split()
            if not parts or not words.issuperset(parts):
                continue
            candidate_len = len(candidate)
            if candidate_len > winner_len:
                winner, winner_len = name, candidate_len
    return winner