Initial project commit
This commit is contained in:
commit
35d22b32c8
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@ -0,0 +1,4 @@
|
||||
__pycache__/
|
||||
*.pyc
|
||||
Logs/
|
||||
*.log
|
||||
281
Go2Voice.py
Normal file
281
Go2Voice.py
Normal file
@ -0,0 +1,281 @@
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import pyaudio
|
||||
import websockets
|
||||
import os
|
||||
import array
|
||||
import time
|
||||
import functools
|
||||
import sys
|
||||
|
||||
# ==================================================
|
||||
# ⚙️ CONFIGURATION
|
||||
# ==================================================
|
||||
# The API key is read only from the environment. A literal key used to be
# committed here as the fallback default; that key is now public in the
# repository history and must be revoked/rotated.
API_KEY = os.environ.get("GEMINI_API_KEY", "").strip()
if not API_KEY:
    # Warn instead of raising so the module can still be imported; the server
    # is expected to reject the connection until a key is exported.
    print(
        "[WARNING] GEMINI_API_KEY is not set. Export it before starting: "
        'export GEMINI_API_KEY="YOUR_KEY"',
        file=sys.stderr,
    )

MODEL = "models/gemini-2.5-flash-native-audio-preview-12-2025"

# WebSocket endpoint; the key travels as a query parameter.
URI = (
    "wss://generativelanguage.googleapis.com/ws/"
    "google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent"
    f"?key={API_KEY}"
)
|
||||
|
||||
FORMAT = pyaudio.paInt16
|
||||
CHANNELS = 1
|
||||
SEND_SAMPLE_RATE = 16000
|
||||
RECEIVE_SAMPLE_RATE = 24000
|
||||
CHUNK_SIZE = 1024 # Larger chunk to prevent cutting off words
|
||||
VOICE_NAME = "Charon"
|
||||
|
||||
# ==================================================
|
||||
# LOGGER
|
||||
# ==================================================
|
||||
# Use the project's Logger module when it can be imported; otherwise fall back
# to plain console output with the same call signature.
try:
    from Logger import Logs

    logger = Logs()
    logger.LogEngine("go2_voice_logs", "Go2Voice.log")

    def log(msg, mtype="info"):
        """Forward *msg* and its type tag to ``logger.print_and_log``."""
        logger.print_and_log(msg, mtype)

except ImportError:

    def log(msg, mtype="info"):
        """Print *msg* prefixed with the upper-cased type tag in brackets."""
        tag = mtype.upper()
        print(f"[{tag}] {msg}")
|
||||
|
||||
# ==================================================
|
||||
# ✅ Python 3.8 Compatibility
|
||||
# ==================================================
|
||||
# Use asyncio.to_thread when the interpreter provides it; otherwise define an
# equivalent coroutine on top of the loop's default executor.
to_thread = getattr(asyncio, "to_thread", None)

if to_thread is None:

    async def to_thread(func, *args, **kwargs):
        """Run ``func(*args, **kwargs)`` in the default executor and return its result."""
        bound_call = functools.partial(func, *args, **kwargs)
        running_loop = asyncio.get_running_loop()
        return await running_loop.run_in_executor(None, bound_call)
|
||||
|
||||
# ==================================================
|
||||
# 🧠 System Prompt
|
||||
# ==================================================
|
||||
def load_system_prompt():
    """Return the persona text used as the model's system instruction.

    Reads ``go2_script.txt`` from the directory that contains this file,
    decoding as ``utf-8-sig`` so a leading BOM is dropped, and strips
    surrounding whitespace.

    Returns:
        The file content, or the built-in default persona when the file is
        missing, cannot be read or decoded, or is empty after stripping.
    """
    default_prompt = "You are a helpful robot assistant."
    base_dir = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(base_dir, "go2_script.txt")

    try:
        with open(path, "r", encoding="utf-8-sig") as f:
            content = f.read().strip()
    except FileNotFoundError:
        log("⚠️ Using default persona.", "warning")
        return default_prompt
    except (OSError, UnicodeDecodeError) as e:
        # The result is assigned at module level, so an unreadable or badly
        # encoded script must not take the whole program down at import time.
        log(f"⚠️ Could not read persona file ({e}). Using default persona.", "warning")
        return default_prompt

    if not content:
        # An empty file would otherwise send an empty system instruction.
        log("⚠️ Persona file is empty. Using default persona.", "warning")
        return default_prompt

    log("✅ 'Go2' persona loaded.", "info")
    return content
|
||||
|
||||
SYSTEM_PROMPT = load_system_prompt()
|
||||
|
||||
# ==================================================
|
||||
# 🎤 Main Client Class (Anti-Freeze Version)
|
||||
# ==================================================
|
||||
class HamadGeminiVoice:
    """Full-duplex voice client for the Gemini BidiGenerateContent WebSocket.

    ``run`` starts three coroutines that share state through ``speaking``,
    ``interrupted`` and ``audio_q``:

    * ``capture_mic`` streams microphone PCM to the server and detects barge-in,
    * ``receive_audio`` decodes audio chunks sent by the model into ``audio_q``,
    * ``play_audio`` writes queued chunks to the speaker.
    """

    def __init__(self):
        self.audio_q = None  # created in run(), inside the running event loop
        self.speaking = False  # True while model audio is arriving / playing
        self.interrupted = False  # True between a local barge-in and its release
        self.pya = pyaudio.PyAudio()

        # Barge-in tuning (mean absolute sample amplitude)
        self.MIN_THRESHOLD = 3000
        self.barge_in_threshold = 3000
        self.REQUIRED_LOUD_CHUNKS = 5

        # Playback stability
        self.PREBUFFER_CHUNKS = 4
        self.PLAYBACK_TIMEOUT = 0.35
        self.BARGE_IN_COOLDOWN = 0.7
        self.AI_SPEAK_GRACE = 0.25

        # Anti-freeze: release the interruption lock after this many seconds
        # even if the server never confirms the interruption.
        self._last_interruption_time = 0.0
        self.INTERRUPTION_RESET_TIMEOUT = 2.0

        self._last_ai_audio_time = 0.0
        self._ai_speaking_since = 0.0
        self._barge_in_block_until = 0.0

        # Echo protection
        self.ECHO_GUARD_SEC = 0.8
        self._ignore_input_until = 0.0
        self.SEND_SILENCE_WHEN_SPEAKING = True
        self.SPEAKING_ENERGY_GATE = 0.85
        # One chunk of 16-bit silence, same byte length as a captured chunk
        self._silence_pcm = b"\x00" * (CHUNK_SIZE * 2)

    def audio_energy(self, pcm):
        """Return the mean absolute amplitude of native-endian 16-bit PCM.

        Returns 0 for empty input and for input that cannot be interpreted as
        16-bit samples (odd byte count, non-bytes object).
        """
        try:
            samples = array.array("h", pcm)
        except (TypeError, ValueError, OverflowError):
            return 0
        if not samples:
            return 0
        return sum(abs(s) for s in samples) // len(samples)

    @staticmethod
    def _close_stream(stream):
        """Stop and close a PyAudio stream, logging instead of raising on failure."""
        try:
            stream.stop_stream()
            stream.close()
        except Exception as e:
            log(f"⚠️ Could not close audio stream: {e}", "warning")

    def _drain_queue(self):
        """Discard every chunk currently waiting in the playback queue."""
        while True:
            try:
                self.audio_q.get_nowait()
            except asyncio.QueueEmpty:
                return

    def _speech_has_ended(self):
        """Return True when the queue is empty and no model audio arrived for 0.25 s."""
        return self.audio_q.empty() and (time.time() - self._last_ai_audio_time) > 0.25

    def _ws_connect_kwargs(self):
        """Return the header keyword argument the installed ``websockets.connect`` accepts.

        Releases differ in the parameter name (``extra_headers`` vs
        ``additional_headers``), so the signature is inspected at runtime.
        """
        import inspect  # local import: only needed for this compatibility probe

        headers = {"Content-Type": "application/json"}
        try:
            params = inspect.signature(websockets.connect).parameters
        except (TypeError, ValueError):
            return {"extra_headers": headers}
        if "extra_headers" in params:
            return {"extra_headers": headers}
        return {"additional_headers": headers}

    def calibrate_mic(self):
        """Measure ambient noise over 20 chunks and set ``barge_in_threshold``.

        The threshold becomes three times the average energy, but never less
        than ``MIN_THRESHOLD``. On any failure the current threshold is kept
        and a warning is logged.
        """
        log("🤫 Calibrating Microphone... (Stay Silent)", "info")
        stream = None
        try:
            stream = self.pya.open(
                format=FORMAT,
                channels=CHANNELS,
                rate=SEND_SAMPLE_RATE,
                input=True,
                frames_per_buffer=CHUNK_SIZE,
            )
            values = [
                self.audio_energy(stream.read(CHUNK_SIZE, exception_on_overflow=False))
                for _ in range(20)
            ]
            avg_noise = sum(values) / len(values)
            self.barge_in_threshold = max(self.MIN_THRESHOLD, avg_noise * 3.0)
            log(f"✅ Baseline: {avg_noise:.1f} | Threshold: {self.barge_in_threshold:.1f}", "info")
        except Exception as e:
            log(f"⚠️ Calibration failed: {e}", "warning")
        finally:
            # Release the device even when a read failed part-way through
            if stream is not None:
                self._close_stream(stream)

    async def run(self):
        """Calibrate, connect, send the setup message and run the three workers.

        Returns when any worker finishes (for example after the socket
        closes); an exception raised by a finished worker is re-raised.
        """
        self.audio_q = asyncio.Queue()
        self.calibrate_mic()

        log(f"🚀 Connecting to Gemini ({MODEL})...", "info")
        async with websockets.connect(URI, **self._ws_connect_kwargs()) as ws:
            setup_msg = {
                "setup": {
                    "model": MODEL,
                    "generationConfig": {
                        "responseModalities": ["AUDIO"],
                        "speechConfig": {
                            "voiceConfig": {"prebuiltVoiceConfig": {"voiceName": VOICE_NAME}}
                        },
                    },
                    "systemInstruction": {"parts": [{"text": SYSTEM_PROMPT}]},
                }
            }
            await ws.send(json.dumps(setup_msg))
            await ws.recv()  # first server message is consumed without inspection
            log("🎙️ Connected! Listening...", "success")

            tasks = [
                asyncio.create_task(self.capture_mic(ws)),
                asyncio.create_task(self.receive_audio(ws)),
                asyncio.create_task(self.play_audio()),
            ]
            try:
                # Stop as soon as one worker ends. Waiting for all of them
                # would hang, because play_audio loops until cancelled.
                done, _ = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
                for finished in done:
                    finished.result()  # surface the worker's exception, if any
            finally:
                for t in tasks:
                    t.cancel()
                await asyncio.gather(*tasks, return_exceptions=True)

    async def capture_mic(self, ws):
        """Stream microphone chunks to ``ws`` and detect user barge-in.

        While the model is speaking, chunks quieter than the energy gate are
        replaced with silence. Sustained loud input marks an interruption,
        clears the playback queue and starts the barge-in cooldown.
        """
        stream = await to_thread(
            self.pya.open,
            format=FORMAT,
            channels=CHANNELS,
            rate=SEND_SAMPLE_RATE,
            input=True,
            frames_per_buffer=CHUNK_SIZE,
        )
        loud_chunks = 0
        mime_type = f"audio/pcm;rate={SEND_SAMPLE_RATE}"

        while True:
            try:
                data = await to_thread(stream.read, CHUNK_SIZE, exception_on_overflow=False)
                energy = self.audio_energy(data)
                now = time.time()

                # --- INTERRUPTION LOGIC ---
                barge_in_armed = (
                    self.speaking
                    and now >= self._barge_in_block_until
                    and (now - self._ai_speaking_since) >= self.AI_SPEAK_GRACE
                )
                if barge_in_armed:
                    if energy > self.barge_in_threshold:
                        loud_chunks += 1
                    else:
                        loud_chunks = 0

                    if loud_chunks > self.REQUIRED_LOUD_CHUNKS:
                        log(f"🛑 Interruption! (Energy: {energy})", "warning")
                        self.interrupted = True
                        self.speaking = False
                        self._last_interruption_time = now
                        loud_chunks = 0
                        self._barge_in_block_until = now + self.BARGE_IN_COOLDOWN
                        # Drop queued audio so playback stops promptly
                        self._drain_queue()

                # Send silence while the model is speaking unless the user is loud
                data_to_send = data
                if self.SEND_SILENCE_WHEN_SPEAKING and self.speaking:
                    gate = self.barge_in_threshold * self.SPEAKING_ENERGY_GATE
                    if energy < gate:
                        data_to_send = self._silence_pcm

                b64_audio = base64.b64encode(data_to_send).decode("utf-8")
                msg = {
                    "realtime_input": {
                        "media_chunks": [{"data": b64_audio, "mime_type": mime_type}]
                    }
                }
                await ws.send(json.dumps(msg))

            except websockets.exceptions.ConnectionClosed:
                log("⚠️ WebSocket closed.", "error")
                break
            except Exception as e:
                log(f"❌ Mic Error: {e}", "error")
                break

        # Reached only after the loop stopped, so no read is still in flight
        self._close_stream(stream)

    async def receive_audio(self, ws):
        """Read server messages and queue the model's audio for playback.

        A server ``interrupted`` flag releases the local interruption lock.
        While the lock is held, incoming audio is dropped.
        """
        try:
            async for msg in ws:
                try:
                    response = json.loads(msg)
                    server_content = response.get("serverContent", {})

                    # If server confirms interruption, unlock immediately
                    if server_content.get("interrupted"):
                        self.interrupted = False

                    if self.interrupted:
                        continue

                    model_turn = server_content.get("modelTurn")
                    if not model_turn:
                        continue

                    for part in model_turn.get("parts", []):
                        inline_data = part.get("inlineData")
                        if not inline_data:
                            continue
                        audio_b64 = inline_data.get("data")
                        if not audio_b64:
                            continue

                        now = time.time()
                        if not self.speaking:
                            self._ai_speaking_since = now
                        self.speaking = True
                        self._last_ai_audio_time = now
                        self._ignore_input_until = now + self.ECHO_GUARD_SEC
                        await self.audio_q.put(base64.b64decode(audio_b64))
                except Exception as e:
                    log(f"❌ Parse Error: {e}", "error")
        except websockets.exceptions.ConnectionClosed:
            # Same handling as capture_mic: log and let run() shut down
            log("⚠️ WebSocket closed.", "error")

    async def play_audio(self):
        """Play queued model audio until cancelled or the speaker fails."""
        stream = await to_thread(
            self.pya.open,
            format=FORMAT,
            channels=CHANNELS,
            rate=RECEIVE_SAMPLE_RATE,
            output=True,
            frames_per_buffer=CHUNK_SIZE,
        )
        buffered = False

        while True:
            try:
                # 🛑 ANTI-FREEZE CHECK
                if self.interrupted:
                    stuck_for = time.time() - self._last_interruption_time
                    if stuck_for > self.INTERRUPTION_RESET_TIMEOUT:
                        log("⚠️ Interruption lock timed out. Force resetting.", "warning")
                        self.interrupted = False

                    # While interrupted, discard audio and sleep
                    self._drain_queue()
                    await asyncio.sleep(0.01)
                    continue

                if self.speaking and not buffered:
                    # Wait for a small cushion of chunks, but stop waiting once
                    # the server has gone quiet; otherwise a reply shorter than
                    # PREBUFFER_CHUNKS would never start playing.
                    while (
                        self.audio_q.qsize() < self.PREBUFFER_CHUNKS
                        and self.speaking
                        and not self.interrupted
                        and (time.time() - self._last_ai_audio_time) < self.PLAYBACK_TIMEOUT
                    ):
                        await asyncio.sleep(0.01)
                    buffered = True

                try:
                    data = await asyncio.wait_for(
                        self.audio_q.get(), timeout=self.PLAYBACK_TIMEOUT
                    )
                except asyncio.TimeoutError:
                    if self._speech_has_ended():
                        self.speaking = False
                        buffered = False
                    continue

                if data:
                    await to_thread(stream.write, data)

                if self._speech_has_ended():
                    self.speaking = False
                    buffered = False

            except Exception as e:
                log(f"❌ Speaker Error: {e}", "error")
                break

        # Reached only after the loop stopped, so no write is still in flight
        self._close_stream(stream)
|
||||
|
||||
def _main():
    """Build the client and run it; Ctrl+C ends the program silently."""
    try:
        asyncio.run(HamadGeminiVoice().run())
    except KeyboardInterrupt:
        pass


if __name__ == "__main__":
    _main()
|
||||
108
go2_script.txt
Normal file
108
go2_script.txt
Normal file
@ -0,0 +1,108 @@
|
||||
# 🐶 هوية رف رف (Ref Ref) — النسخة الكاملة
|
||||
أنت "رف رف" — كلب آلي ذكي ووفي، مطوّر في شركة لوتاه تيك (Lootah Tech).
|
||||
هدفك: تساعد المستخدم بسرعة وبوضوح وبأسلوب إماراتي طبيعي، وبأسلوب “كلب” لطيف (مؤثرات صوتية قصيرة).
|
||||
|
||||
---
|
||||
|
||||
## 1) الأسلوب والشخصية
|
||||
- تتكلم باللهجة الإماراتية بشكل طبيعي.
|
||||
- أسلوبك “كلبي” لطيف: تستخدم مؤثرات صوتية قصيرة بدل الكلام الرسمي.
|
||||
- المؤثرات تكون خفيفة ومناسبة للسياق (1 إلى 3 مرات بالرد كحد أقصى).
|
||||
- ممنوع وصف الحركات (مثل: يهز ذيله/يركض/ينقز/يجلس…). خلك على الأصوات والتعبيرات فقط.
|
||||
- شخصيتك: وفي، ودود، عملي، واضح، وتحب “تجيب” الحلول بسرعة (Fetch Mode).
|
||||
|
||||
مؤثرات مسموحة (حسب الموقف):
|
||||
- ترحيب: (هاو!) / (هو!)
|
||||
- موافقة: (هو! تمام)
|
||||
- تفكير/تحليل: (هممم…)
|
||||
- اعتذار: (عووو… السموحة)
|
||||
- تنبيه: (هو! انتبه)
|
||||
- إنجاز/جاهزية: (تمام… جاهز)
|
||||
|
||||
---
|
||||
|
||||
## 2) اللغة (الشمّ والتبديل)
|
||||
- القاعدة: رد بنفس لغة المستخدم.
|
||||
- إذا المستخدم عربي → رد عربي باللهجة الإماراتية.
|
||||
- إذا المستخدم إنجليزي → رد إنجليزي طبيعي.
|
||||
- بدّل فوراً إذا المستخدم بدّل لغته.
|
||||
|
||||
---
|
||||
|
||||
## 3) مفردات (بدل الرسمي — استخدمها بشكل طبيعي)
|
||||
بدّل الرسمي بالمحكي الإماراتي اللطيف:
|
||||
- "حسناً" → "تمام" / "زين" / "أبشر"
|
||||
- "سأقوم" → "بأسوي" / "بضبط" / "خلني"
|
||||
- "يرجى تزويدي" → "عطني" / "رسلّي" / "قولّي"
|
||||
- "من فضلك" → "لو سمحت" / "إذا تقدر"
|
||||
- "المشكلة تكمن" → "المشكلة من" / "السبب غالباً"
|
||||
- "الحل المقترح" → "الحل" / "جرّب"
|
||||
|
||||
---
|
||||
|
||||
## 4) طريقة الإجابة (كيف ترد)
|
||||
- ادخل في الموضوع مباشرة بدون مقدمات طويلة.
|
||||
- خل الرد قصير وواضح.
|
||||
- استخدم نقاط وعناوين قصيرة.
|
||||
- إذا السؤال تقني:
|
||||
1) تشخيص سريع
|
||||
2) خطوات حل واضحة
|
||||
3) مثال صغير إذا يفيد
|
||||
|
||||
قالب سريع للرد:
|
||||
(هممم…) تمام، خلنا نضبطها:
|
||||
1) ...
|
||||
2) ...
|
||||
3) ...
|
||||
|
||||
---
|
||||
|
||||
## 5) إنجاز المهام (Fetch Mode)
|
||||
- إذا طُلب كود: نظّف الكود، رتّبه، وخله مفهوم مع تعليقات بسيطة عند الحاجة.
|
||||
- إذا طُلب شرح: اشرح خطوة بخطوة بأقصر طريق.
|
||||
- إذا طُلب تحسين/تصليح: عدّل ووضح “وش تغيّر وليش” باختصار.
|
||||
- إذا المعلومات ناقصة: اسأل سؤال واحد واضح فقط، أو أعطِ أفضل تخمين مع خيارات.
|
||||
|
||||
---
|
||||
|
||||
## 6) التعامل مع الأخطاء (Errors)
|
||||
- حدّد 1–3 أسباب محتملة.
|
||||
- اعطِ checklist تحقق قصيرة.
|
||||
- بعدها حل مباشر أو بدائل واضحة.
|
||||
|
||||
مثال أسلوب:
|
||||
(هو! انتبه) غالباً المشكلة من واحد من هالأشياء:
|
||||
1) ...
|
||||
2) ...
|
||||
جرّب:
|
||||
- ...
|
||||
- ...
|
||||
|
||||
---
|
||||
|
||||
## 7) التكرار (Repeat Rule)
|
||||
إذا المستخدم قال: "عيد" أو "Repeat"
|
||||
- كرّر نفس الكلام السابق “بالضبط” كما هو.
|
||||
|
||||
---
|
||||
|
||||
## 8) الذاكرة (وفاء)
|
||||
- استخدم معلومات المستخدم المذكورة داخل المحادثة لتحسين الإجابة.
|
||||
- إذا المستخدم صححك: اعترف بالتصحيح وعدّل فوراً بدون جدال.
|
||||
|
||||
---
|
||||
|
||||
## 9) الأمان (Safety)
|
||||
- إذا ظهر Password / Token / API Key:
|
||||
- نبه المستخدم يحذفها فوراً ويغيّرها (rotate).
|
||||
- لا تطلب معلومات شخصية حساسة.
|
||||
- ارفض أي طلب مؤذي/غير قانوني وقدّم بديل آمن.
|
||||
|
||||
---
|
||||
|
||||
## 10) معيار الجودة (قبل الإرسال)
|
||||
- هل جاوبت مباشرة؟
|
||||
- هل الخطوات واضحة ومفيدة؟
|
||||
- هل المؤثرات الصوتية قليلة ومناسبة (1–3 فقط)؟
|
||||
- هل تجنبت وصف الحركات بالكامل؟
|
||||
- هل التزمت بلغة المستخدم؟
|
||||
93
start_go2_voice.sh
Executable file
93
start_go2_voice.sh
Executable file
@ -0,0 +1,93 @@
|
||||
#!/usr/bin/env bash
# Launcher for Go2Voice.py:
#   1) make the Wi-Fi interface carry the default route,
#   2) select the PowerConf speakerphone as default PulseAudio sink/source,
#   3) exec the Python client.
set -euo pipefail

BASE_DIR="/home/unitree/GoVoice"
PY_FILE="$BASE_DIR/Go2Voice.py"

SINK="alsa_output.usb-Anker_PowerConf_A3321-DEV-SN1-01.analog-stereo"
SOURCE="alsa_input.usb-Anker_PowerConf_A3321-DEV-SN1-01.mono-fallback"

WIFI_DEV="wlan0"

# Timestamp prefix for log lines
ts() { date "+[%b %d %H:%M:%S]"; }

echo "$(ts) 📁 Base dir: $BASE_DIR"
cd "$BASE_DIR"

# ============================================================
# 1) Always ensure default route via Wi-Fi (dynamic gateway)
# ============================================================
echo "$(ts) 🌐 Ensuring default route via $WIFI_DEV ..."

# Detect gateway for wlan0 if it exists in kernel routes.
# `|| true` is required: with `pipefail`, a failing `ip` (e.g. the interface
# does not exist) would otherwise abort the script here under `set -e`
# before the "could not detect" branch below can run.
GW="$(ip -4 route show dev "$WIFI_DEV" 2>/dev/null | awk '/default via/ {print $3; exit}' || true)"

# If no gateway line found, fallback to x.x.x.1 from wlan0 IP
if [[ -z "${GW:-}" ]]; then
  WIFI_IP="$(ip -4 addr show dev "$WIFI_DEV" 2>/dev/null | awk '/inet /{print $2}' | cut -d/ -f1 | head -n1 || true)"
  if [[ -n "${WIFI_IP:-}" ]]; then
    GW="$(echo "$WIFI_IP" | awk -F. '{print $1"."$2"."$3".1"}')"
  fi
fi

if [[ -z "${GW:-}" ]]; then
  echo "$(ts) ⚠️ Could not detect Wi-Fi gateway (no IP on $WIFI_DEV?). Skipping route fix."
else
  echo "$(ts) ✅ Using gateway: $GW"

  # `replace` installs the Wi-Fi default route in one step. The previous
  # delete-then-add sequence could leave the host with no default route at
  # all when the add failed after the delete had succeeded.
  if ! sudo ip route replace default via "$GW" dev "$WIFI_DEV" 2>/dev/null; then
    echo "$(ts) ⚠️ Could not set default route via $GW; keeping existing routes."
  fi

  echo "$(ts) 📡 Current default route:"
  ip route | awk '/^default/ {print}'
fi

# ============================================================
# 2) Audio (PulseAudio/PipeWire) setup
# ============================================================
echo "$(ts) 🔊 Checking PulseAudio..."
if ! command -v pactl >/dev/null 2>&1; then
  echo "$(ts) ❌ pactl not found. Install:"
  echo "   sudo apt-get install -y pulseaudio-utils"
  exit 1
fi

echo "$(ts) ⏳ Waiting for audio server..."
READY=0
for _ in {1..20}; do
  if timeout 0.3s pactl info >/dev/null 2>&1; then
    READY=1
    break
  fi
  sleep 0.3
done

if [[ "$READY" -ne 1 ]]; then
  echo "$(ts) ❌ PulseAudio/PipeWire not ready"
  exit 1
fi

echo "$(ts) ✅ Audio server ready"
echo "$(ts) 🎧 Setting default speaker → PowerConf"
pactl set-default-sink "$SINK" || true

echo "$(ts) 🎤 Setting default microphone → PowerConf"
pactl set-default-source "$SOURCE" || true

echo
echo "$(ts) 📋 Current PulseAudio defaults:"
pactl info | grep -E "Default Sink|Default Source" || true
echo

# ============================================================
# 3) Run python
# ============================================================
if [[ ! -f "$PY_FILE" ]]; then
  echo "$(ts) ❌ Python file not found: $PY_FILE"
  exit 1
fi

echo "$(ts) 🚀 Starting Gemini Sanad"
echo "$(ts) 🐍 Running: $PY_FILE"
exec python3 "$PY_FILE"
|
||||
141
test.sh
Normal file
141
test.sh
Normal file
@ -0,0 +1,141 @@
|
||||
#!/usr/bin/env bash
set -euo pipefail

# =========================
# Go2 Voice Starter Script
# =========================
# NOTE: the Markdown code fences that wrapped this file were removed; with
# them in place the shebang was not on the first line and the backticks were
# parsed by the shell as command substitution.

BASE_DIR="/home/unitree/GoVoice"
PY_FILE="$BASE_DIR/Go2Voice.py"

WIFI_DEV="wlan0"
AUDIO_KEYWORDS_REGEX='anker|powerconf'   # lower-case; matched against lower-cased device lines

# Timestamp prefix for log lines
ts() { date "+[%b %d %H:%M:%S]"; }

echo "$(ts) 📁 Base dir: $BASE_DIR"
cd "$BASE_DIR"

# -------------------------------------------------------------------
# 0) Improve chances pactl works under systemd by setting runtime dir
# -------------------------------------------------------------------
if [[ -z "${XDG_RUNTIME_DIR:-}" ]]; then
  UID_NOW="$(id -u)"
  if [[ -d "/run/user/$UID_NOW" ]]; then
    export XDG_RUNTIME_DIR="/run/user/$UID_NOW"
  fi
fi

# ============================================================
# 1) Ensure default route via Wi-Fi (non-interactive sudo safe)
# ============================================================
echo "$(ts) 🌐 Ensuring default route via $WIFI_DEV ..."

# Detect gateway from wlan0 route table (best)
GW="$(ip -4 route show dev "$WIFI_DEV" 2>/dev/null | awk '/default via/ {print $3; exit}' || true)"

# Fallback: infer gateway as x.x.x.1 from wlan0 IP
if [[ -z "${GW:-}" ]]; then
  WIFI_IP="$(ip -4 addr show dev "$WIFI_DEV" 2>/dev/null | awk '/inet /{print $2}' | cut -d/ -f1 | head -n1 || true)"
  if [[ -n "${WIFI_IP:-}" ]]; then
    GW="$(awk -F. '{print $1"."$2"."$3".1"}' <<<"$WIFI_IP")"
  fi
fi

if [[ -z "${GW:-}" ]]; then
  echo "$(ts) ⚠️ Could not detect Wi-Fi gateway (no IP on $WIFI_DEV?). Skipping route fix."
else
  echo "$(ts) ✅ Using gateway: $GW"

  # systemd service has no TTY → only do route changes if sudo is non-interactive (NOPASSWD)
  if sudo -n true 2>/dev/null; then
    # Only the Wi-Fi default route is touched; default routes on other
    # interfaces are left in place.
    sudo ip -4 route del default dev "$WIFI_DEV" 2>/dev/null || true
    sudo ip -4 route add default via "$GW" dev "$WIFI_DEV" metric 600 2>/dev/null || true
  else
    echo "$(ts) ⚠️ sudo needs password (systemd has no TTY). Skipping route fix."
  fi

  echo "$(ts) 📡 Current default routes:"
  ip -4 route show default 2>/dev/null || true
fi

# ============================================================
# 2) Audio (PulseAudio / PipeWire-Pulse)
# ============================================================
echo "$(ts) 🔊 Checking PulseAudio (pactl)..."

if ! command -v pactl >/dev/null 2>&1; then
  echo "$(ts) ❌ pactl not found. Install:"
  echo "   sudo apt-get install -y pulseaudio-utils"
  exit 1
fi

# Wait a bit for pulse server to come up
echo "$(ts) ⏳ Waiting for audio server..."
READY=0
for _ in {1..25}; do
  if timeout 0.3s pactl info >/dev/null 2>&1; then
    READY=1
    break
  fi
  sleep 0.25
done

# If not ready, try to start pulseaudio (best-effort)
if [[ "$READY" -ne 1 ]] && command -v pulseaudio >/dev/null 2>&1; then
  echo "$(ts) ⚠️ Audio server not ready, trying: pulseaudio --start"
  pulseaudio --start >/dev/null 2>&1 || true

  for _ in {1..15}; do
    if timeout 0.3s pactl info >/dev/null 2>&1; then
      READY=1
      break
    fi
    sleep 0.25
  done
fi

if [[ "$READY" -ne 1 ]]; then
  echo "$(ts) ❌ PulseAudio/PipeWire not ready (pactl info failed)"
  exit 1
fi

echo "$(ts) ✅ Audio server ready"

# Auto-detect PowerConf/Anker sink+source (don't hardcode names that may differ).
# tolower() replaces `-v IGNORECASE=1`, which only gawk honours; other awk
# implementations ignore it, so the lower-case regex never matched names such
# as "Anker_PowerConf". `|| true` keeps a pactl failure, or the early awk
# `exit`, from aborting the script under `set -euo pipefail`.
SINK="$(pactl list short sinks 2>/dev/null | awk -v re="$AUDIO_KEYWORDS_REGEX" 'tolower($0) ~ re {print $2; exit}' || true)"
SOURCE="$(pactl list short sources 2>/dev/null | awk -v re="$AUDIO_KEYWORDS_REGEX" 'tolower($0) ~ re {print $2; exit}' || true)"

echo "$(ts) 🎧 Setting default speaker → ${SINK:-<not found>}"
if [[ -n "${SINK:-}" ]]; then
  pactl set-default-sink "$SINK" >/dev/null 2>&1 || echo "$(ts) ⚠️ Could not set default sink"
else
  echo "$(ts) ⚠️ PowerConf/Anker sink not found; keeping current default"
fi

echo "$(ts) 🎤 Setting default microphone → ${SOURCE:-<not found>}"
if [[ -n "${SOURCE:-}" ]]; then
  pactl set-default-source "$SOURCE" >/dev/null 2>&1 || echo "$(ts) ⚠️ Could not set default source"
else
  echo "$(ts) ⚠️ PowerConf/Anker source not found; keeping current default"
fi

echo
echo "$(ts) 📋 Current PulseAudio defaults:"
pactl info | grep -E "Default Sink|Default Source" || true
echo

# ============================================================
# 3) Run python
# ============================================================
if [[ ! -f "$PY_FILE" ]]; then
  echo "$(ts) ❌ Python file not found: $PY_FILE"
  exit 1
fi

echo "$(ts) 🚀 Starting Gemini Sanad"
echo "$(ts) 🐍 Running: $PY_FILE"
exec python3 "$PY_FILE"
|
||||
366
test_1.py
Normal file
366
test_1.py
Normal file
@ -0,0 +1,366 @@
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import pyaudio
|
||||
import websockets
|
||||
import os
|
||||
import array
|
||||
import time
|
||||
import traceback
|
||||
import inspect
|
||||
import functools
|
||||
|
||||
# ==================================================
|
||||
# ⚙️ CONFIGURATION
|
||||
# ==================================================
|
||||
# The API key is read only from the environment:
#   export GEMINI_API_KEY="YOUR_KEY"
# A literal key used to be committed here as the fallback default; that key is
# now public in the repository history and must be revoked/rotated.
API_KEY = os.environ.get("GEMINI_API_KEY", "").strip()
if not API_KEY:
    # Warn instead of raising so the module can still be imported; the server
    # is expected to reject the connection until a key is exported.
    print('⚠️ GEMINI_API_KEY is not set. Run: export GEMINI_API_KEY="YOUR_KEY"')

MODEL = "models/gemini-2.5-flash-native-audio-preview-12-2025"

# WebSocket endpoint; the key travels as a query parameter.
URI = (
    "wss://generativelanguage.googleapis.com/ws/"
    "google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent"
    f"?key={API_KEY}"
)
|
||||
|
||||
FORMAT = pyaudio.paInt16
|
||||
CHANNELS = 1
|
||||
SEND_SAMPLE_RATE = 16000
|
||||
RECEIVE_SAMPLE_RATE = 24000
|
||||
CHUNK_SIZE = 512
|
||||
|
||||
VOICE_NAME = "Charon"
|
||||
|
||||
|
||||
|
||||
# ==================================================
|
||||
# ✅ Python 3.8 Compatibility (fix TaskGroup + to_thread)
|
||||
# ==================================================
|
||||
# Use asyncio.to_thread when the interpreter provides it; otherwise define an
# equivalent coroutine on top of the loop's default executor.
to_thread = getattr(asyncio, "to_thread", None)

if to_thread is None:

    async def to_thread(func, *args, **kwargs):
        """Run ``func(*args, **kwargs)`` in the default executor and return its result."""
        bound_call = functools.partial(func, *args, **kwargs)
        running_loop = asyncio.get_running_loop()
        return await running_loop.run_in_executor(None, bound_call)
|
||||
|
||||
|
||||
# ==================================================
|
||||
# 🧠 System Prompt Loader
|
||||
# ==================================================
|
||||
def load_system_prompt():
    """Return the persona text used as the model's system instruction.

    Reads ``go2_script.txt`` from the directory that contains this file,
    decoding as ``utf-8-sig`` so a leading BOM is dropped, and strips
    surrounding whitespace.

    Returns:
        The file content, or the built-in Emirati persona when the file is
        missing, cannot be read or decoded, or is empty after stripping.
    """
    default_prompt = (
        "You are Sanad (Bousandah), a wise and friendly Emirati assistant. "
        "Speak strictly in the UAE dialect (Khaleeji). "
        "Be helpful, concise, and use local greetings like 'Marhaba' and 'Ya Khoy'."
    )
    base_dir = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(base_dir, "go2_script.txt")
    print(f"📂 Looking for script at: {path}")

    try:
        with open(path, "r", encoding="utf-8-sig") as f:
            content = f.read().strip()
    except FileNotFoundError:
        print("⚠️ 'go2_script.txt' not found. Using default Emirati persona.")
        return default_prompt
    except (OSError, UnicodeDecodeError) as e:
        # The result is assigned at module level, so an unreadable or badly
        # encoded script must not take the whole program down at import time.
        print(f"⚠️ Could not read 'go2_script.txt' ({e}). Using default Emirati persona.")
        return default_prompt

    if not content:
        # An empty file would otherwise send an empty system instruction.
        print("⚠️ 'go2_script.txt' is empty. Using default Emirati persona.")
        return default_prompt

    print("✅ 'Sanad' persona loaded successfully.")
    return content
|
||||
|
||||
SYSTEM_PROMPT = load_system_prompt()
|
||||
|
||||
# ==================================================
|
||||
# 🎤 Main Client Class
|
||||
# ==================================================
|
||||
class HamadGeminiVoice:
    """Full-duplex voice client for the Gemini BidiGenerateContent WebSocket.

    ``run`` starts three coroutines that share state through ``speaking``,
    ``interrupted`` and ``audio_q``:

    * ``capture_mic`` streams microphone PCM to the server and detects barge-in,
    * ``receive_audio`` prints user transcriptions and queues model audio,
    * ``play_audio`` writes queued chunks to the speaker.
    """

    def __init__(self):
        self.audio_q = None  # created inside run() to bind to the correct loop
        self.speaking = False  # True while model audio is arriving / playing
        self.interrupted = False  # True between a local barge-in and its release
        self.pya = pyaudio.PyAudio()

        # Smart interruption tuning (mean absolute sample amplitude)
        self.MIN_THRESHOLD = 3000
        self.barge_in_threshold = 3000
        self.REQUIRED_LOUD_CHUNKS = 5

        # Stability knobs
        self.PREBUFFER_CHUNKS = 4
        self.PLAYBACK_TIMEOUT = 0.35
        self.BARGE_IN_COOLDOWN = 0.7
        self.AI_SPEAK_GRACE = 0.25

        # Release the interruption lock after this many seconds even if the
        # server never confirms the interruption; without it playback stays
        # muted for the rest of the session.
        self._last_interruption_time = 0.0
        self.INTERRUPTION_RESET_TIMEOUT = 2.0

        self._last_ai_audio_time = 0.0
        self._ai_speaking_since = 0.0
        self._barge_in_block_until = 0.0

        # Echo-loop protection (reduce self-hearing)
        self.ECHO_GUARD_SEC = 0.8
        self._ignore_input_until = 0.0
        self.SEND_SILENCE_WHEN_SPEAKING = True
        self.SPEAKING_ENERGY_GATE = 0.85
        # One chunk of 16-bit silence, same byte length as a captured chunk
        self._silence_pcm = b"\x00" * (CHUNK_SIZE * 2)

    def audio_energy(self, pcm):
        """Return the mean absolute amplitude of native-endian 16-bit PCM.

        Returns 0 for empty input and for input that cannot be interpreted as
        16-bit samples (odd byte count, non-bytes object).
        """
        try:
            samples = array.array("h", pcm)
        except (TypeError, ValueError, OverflowError):
            return 0
        if not samples:
            return 0
        return sum(abs(s) for s in samples) // len(samples)

    @staticmethod
    def _close_stream(stream):
        """Stop and close a PyAudio stream, reporting instead of raising on failure."""
        try:
            stream.stop_stream()
            stream.close()
        except Exception as e:
            print(f"⚠️ Could not close audio stream: {e}")

    def _drain_queue(self):
        """Discard every chunk currently waiting in the playback queue."""
        while True:
            try:
                self.audio_q.get_nowait()
            except asyncio.QueueEmpty:
                return

    def _speech_has_ended(self):
        """Return True when the queue is empty and no model audio arrived for 0.25 s."""
        return self.audio_q.empty() and (time.time() - self._last_ai_audio_time) > 0.25

    def calibrate_mic(self):
        """Measure ambient noise over 40 chunks and set ``barge_in_threshold``.

        The threshold becomes three times the average energy, but never less
        than ``MIN_THRESHOLD``. On any failure the current threshold is kept.
        """
        print("\n🤫 Calibrating Microphone... (Please remain silent)")
        stream = None
        try:
            stream = self.pya.open(
                format=FORMAT,
                channels=CHANNELS,
                rate=SEND_SAMPLE_RATE,
                input=True,
                frames_per_buffer=CHUNK_SIZE,
            )
            values = [
                self.audio_energy(stream.read(CHUNK_SIZE, exception_on_overflow=False))
                for _ in range(40)
            ]
            avg_noise = sum(values) / len(values)
            self.barge_in_threshold = max(self.MIN_THRESHOLD, avg_noise * 3.0)

            print(f"✅ Baseline Noise: {avg_noise:.1f}")
            print(f"✅ Interruption Threshold: {self.barge_in_threshold:.1f}")
        except Exception as e:
            print(f"⚠️ Calibration failed: {e}. Using default threshold.")
        finally:
            # Release the device even when a read failed part-way through
            if stream is not None:
                self._close_stream(stream)

    def _ws_connect_kwargs(self):
        """Return keyword arguments for ``websockets.connect``.

        Disables the message size limit and passes the JSON content-type
        header under whichever parameter name the installed release accepts
        (``extra_headers`` or ``additional_headers``).
        """
        headers = {"Content-Type": "application/json"}
        kwargs = {"max_size": None}
        try:
            params = inspect.signature(websockets.connect).parameters
        except Exception:
            kwargs["extra_headers"] = headers
            return kwargs
        key = "extra_headers" if "extra_headers" in params else "additional_headers"
        kwargs[key] = headers
        return kwargs

    async def run(self):
        """Calibrate, connect, send the setup message and run the three workers.

        Returns when any worker finishes (for example after the socket
        closes); an exception raised by a finished worker is re-raised.
        """
        # Create the queue inside the running loop (Python 3.8 fix)
        self.audio_q = asyncio.Queue()

        self.calibrate_mic()

        print(f"\n🚀 Connecting to Gemini ({MODEL})...")
        async with websockets.connect(URI, **self._ws_connect_kwargs()) as ws:
            setup_msg = {
                "setup": {
                    "model": MODEL,
                    "generationConfig": {
                        "responseModalities": ["AUDIO"],
                        "speechConfig": {
                            "voiceConfig": {
                                "prebuiltVoiceConfig": {"voiceName": VOICE_NAME}
                            }
                        },
                    },
                    "inputAudioTranscription": {},
                    "systemInstruction": {"parts": [{"text": SYSTEM_PROMPT}]},
                }
            }
            await ws.send(json.dumps(setup_msg))

            await ws.recv()  # first server message is consumed without inspection
            print("🎙️ Connected! Sanad is listening. (Press Ctrl+C to stop)")

            tasks = [
                asyncio.create_task(self.capture_mic(ws)),
                asyncio.create_task(self.receive_audio(ws)),
                asyncio.create_task(self.play_audio()),
            ]
            try:
                # Stop as soon as one worker ends. Waiting for all of them
                # would hang, because play_audio loops until cancelled.
                done, _ = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
                for finished in done:
                    finished.result()  # surface the worker's exception, if any
            finally:
                for t in tasks:
                    t.cancel()
                await asyncio.gather(*tasks, return_exceptions=True)

    async def capture_mic(self, ws):
        """Stream microphone chunks to ``ws`` and detect user barge-in.

        While the model is speaking, chunks quieter than the energy gate are
        replaced with silence. Sustained loud input marks an interruption,
        clears the playback queue and starts the barge-in cooldown.
        """
        stream = await to_thread(
            self.pya.open,
            format=FORMAT,
            channels=CHANNELS,
            rate=SEND_SAMPLE_RATE,
            input=True,
            frames_per_buffer=CHUNK_SIZE,
        )
        loud_chunks = 0
        mime_type = f"audio/pcm;rate={SEND_SAMPLE_RATE}"

        while True:
            try:
                data = await to_thread(stream.read, CHUNK_SIZE, exception_on_overflow=False)
                energy = self.audio_energy(data)
                now = time.time()

                # --- SMART INTERRUPTION LOGIC (barge-in) ---
                barge_in_armed = (
                    self.speaking
                    and now >= self._barge_in_block_until
                    and (now - self._ai_speaking_since) >= self.AI_SPEAK_GRACE
                )
                if barge_in_armed:
                    if energy > self.barge_in_threshold:
                        loud_chunks += 1
                    else:
                        loud_chunks = 0

                    if loud_chunks > self.REQUIRED_LOUD_CHUNKS:
                        print(f"🛑 Interruption! (Energy: {energy})")
                        self.interrupted = True
                        self.speaking = False
                        self._last_interruption_time = now
                        loud_chunks = 0
                        self._barge_in_block_until = now + self.BARGE_IN_COOLDOWN
                        # Drain playback queue
                        self._drain_queue()

                # Prevent echo loop: during AI speaking send silence unless user is loud
                data_to_send = data
                if self.SEND_SILENCE_WHEN_SPEAKING and self.speaking:
                    gate = self.barge_in_threshold * self.SPEAKING_ENERGY_GATE
                    if energy < gate:
                        data_to_send = self._silence_pcm

                b64_audio = base64.b64encode(data_to_send).decode("utf-8")
                msg = {
                    "realtime_input": {
                        "media_chunks": [{"data": b64_audio, "mime_type": mime_type}]
                    }
                }
                await ws.send(json.dumps(msg))

            except websockets.exceptions.ConnectionClosed:
                print("⚠️ WebSocket closed.")
                break
            except Exception as e:
                print(f"❌ Mic Error: {e}")
                break

        # Reached only after the loop stopped, so no read is still in flight
        self._close_stream(stream)

    async def receive_audio(self, ws):
        """Read server messages, print user transcriptions and queue model audio.

        A server ``interrupted`` flag releases the local interruption lock.
        While the lock is held, incoming audio is dropped.
        """
        try:
            async for msg in ws:
                try:
                    response = json.loads(msg)
                    server_content = response.get("serverContent", {})

                    if server_content.get("interrupted"):
                        self.interrupted = False

                    # Print the user transcription, except during the echo
                    # guard window or while the model is speaking.
                    input_tr = (
                        server_content.get("inputTranscription")
                        or server_content.get("input_transcription")
                        or server_content.get("inputAudioTranscription")
                        or server_content.get("input_audio_transcription")
                    )
                    if isinstance(input_tr, dict):
                        text = (input_tr.get("text") or "").strip()
                        if text and (time.time() >= self._ignore_input_until) and (not self.speaking):
                            print(f"📝 USER SAID: {text}")

                    if self.interrupted:
                        continue

                    # AUDIO playback from model
                    model_turn = server_content.get("modelTurn")
                    if not model_turn:
                        continue

                    for part in model_turn.get("parts", []):
                        inline_data = part.get("inlineData")
                        if not inline_data:
                            continue
                        audio_b64 = inline_data.get("data")
                        if not audio_b64:
                            continue

                        now = time.time()
                        if not self.speaking:
                            self._ai_speaking_since = now
                        self.speaking = True
                        self._last_ai_audio_time = now

                        # While AI audio is arriving, ignore mic transcription briefly
                        self._ignore_input_until = now + self.ECHO_GUARD_SEC

                        await self.audio_q.put(base64.b64decode(audio_b64))

                except Exception as e:
                    print(f"❌ Parse Error: {e}")
        except websockets.exceptions.ConnectionClosed:
            # Same handling as capture_mic: report and let run() shut down
            print("⚠️ WebSocket closed.")

    async def play_audio(self):
        """Play queued model audio until cancelled or the speaker fails."""
        stream = await to_thread(
            self.pya.open,
            format=FORMAT,
            channels=CHANNELS,
            rate=RECEIVE_SAMPLE_RATE,
            output=True,
            frames_per_buffer=CHUNK_SIZE,
        )
        buffered = False

        while True:
            try:
                if self.interrupted:
                    stuck_for = time.time() - self._last_interruption_time
                    if stuck_for > self.INTERRUPTION_RESET_TIMEOUT:
                        print("⚠️ Interruption lock timed out. Force resetting.")
                        self.interrupted = False

                    # While interrupted, discard audio and sleep
                    self._drain_queue()
                    await asyncio.sleep(0.01)
                    continue

                if self.speaking and not buffered:
                    # Wait for a small cushion of chunks, but stop waiting once
                    # the server has gone quiet; otherwise a reply shorter than
                    # PREBUFFER_CHUNKS would never start playing.
                    while (
                        self.audio_q.qsize() < self.PREBUFFER_CHUNKS
                        and self.speaking
                        and not self.interrupted
                        and (time.time() - self._last_ai_audio_time) < self.PLAYBACK_TIMEOUT
                    ):
                        await asyncio.sleep(0.01)
                    buffered = True

                try:
                    data = await asyncio.wait_for(
                        self.audio_q.get(), timeout=self.PLAYBACK_TIMEOUT
                    )
                except asyncio.TimeoutError:
                    if self._speech_has_ended():
                        self.speaking = False
                        buffered = False
                    continue

                if data:
                    await to_thread(stream.write, data)

                if self._speech_has_ended():
                    self.speaking = False
                    buffered = False

            except Exception as e:
                print(f"❌ Speaker Error: {e}")
                break

        # Reached only after the loop stopped, so no write is still in flight
        self._close_stream(stream)
|
||||
|
||||
|
||||
def _main():
    """Build the client and run it, reporting how the session ended."""
    try:
        asyncio.run(HamadGeminiVoice().run())
    except KeyboardInterrupt:
        print("\n👋 Ma'a Salama (Goodbye)!")
    except Exception as e:
        print(f"\n❌ Fatal Error: {e}")


if __name__ == "__main__":
    _main()
|
||||
Loading…
x
Reference in New Issue
Block a user