#!/usr/bin/env bash
#
# start_all.sh — ONE command to launch the entire Sanad humanoid stack:
#
# 1. web_nav3 nav stack → backend :8765 + rosbridge :9090 + map_relay
# + internal rosbridge watchdog
# (runs in its OWN g1_lidar conda env + CycloneDDS)
#
# 2. SanadV3 dashboard → Gemini voice + LED face + Nav tabs, :8001
# (runs in its OWN gemini_sdk conda env + Unitree SDK on eth0)
#
# The two stacks use incompatible Python envs + DDS configs, so they CANNOT
# share a process. This launcher starts each detached, supervises the pair
# (restarts one only if it truly dies), and Ctrl+C here stops EVERYTHING.
#
# Liveness is checked by PROCESS PATTERN (pgrep), NOT by a captured PID:
# `setsid ... &` returns the ephemeral setsid-wrapper pid which exits
# instantly, so a pid check would false-fire and restart-storm (which then
# makes each failed relaunch's cleanup trap nuke the working rosbridge).
#
# Usage: bash ~/Sanadv3/shell_scripts/start_all.sh
# Env knobs (optional): WEBNAV3_HOME, SANAD_HOME, PORT, LOG_DIR, NO_WEBNAV3=1
# Treat any reference to an unset variable as a fatal error.
set -u

# Configuration. Every value can be overridden from the environment; note that
# the dashboard port is read from PORT, not SANAD_PORT.
WEBNAV3_HOME="${WEBNAV3_HOME:-$HOME/web_nav3}"
SANAD_HOME="${SANAD_HOME:-$HOME/Sanadv3}"
SANAD_PORT="${PORT:-8001}"
LOG_DIR="${LOG_DIR:-$HOME/sanad_logs}"
NO_WEBNAV3="${NO_WEBNAV3:-0}"

# Fail fast when the log directory cannot be created: both stacks are launched
# with their output redirected into LOG_DIR, so without it every launch fails
# at the redirect and the supervisor would keep relaunching forever.
if ! mkdir -p -- "$LOG_DIR"; then
  printf 'start_all.sh: cannot create log directory: %s\n' "$LOG_DIR" >&2
  exit 1
fi
# Startup banner: what is being launched, from where, and where the logs go.
# The web_nav3 line carries a "(skipped)" tag when NO_WEBNAV3=1.
printf '%s\n' \
  "════════════════════════════════════════════════" \
  " Sanad humanoid — full stack launcher" \
  " web_nav3 : $WEBNAV3_HOME $(if [ "$NO_WEBNAV3" = 1 ]; then echo '(skipped)'; fi)" \
  " SanadV3 : $SANAD_HOME (port $SANAD_PORT)" \
  " logs : $LOG_DIR" \
  "════════════════════════════════════════════════"
# liveness by process pattern (robust — no pid tracking)
# NOTE: start_sanad.sh runs `cd $SANAD_HOME && exec python3 main.py --port N`,
# so the process cmdline is "python3 main.py --port <PORT>" (no Sanadv3/ in
# argv). Match on the PORT — this also keeps us from ever touching the live
# Sanad on :8000.
# pgrep/pkill -f interpret this as an extended regex matched anywhere in the
# command line. Escape the dot and require a space or end-of-line right after
# the port, otherwise a short port (e.g. 800) would also match — and let us
# kill — an instance running on a longer one (8000, 8001, …).
SANAD_PAT="main\.py --port ${SANAD_PORT}( |\$)"
# Liveness probe for web_nav3: returns pgrep's status, i.e. 0 while at least
# one process command line matches "web/backend.py". Produces no output.
webnav3_up() {
  pgrep -f "web/backend.py" &>/dev/null
}
# Count a SanadV3 as "up" if EITHER main.py is bound OR start_sanad.sh is still
# mid-boot (conda activate + up-to-20s DDS-iface wait + heavy lazy imports come
# BEFORE `exec python3 main.py`, so "main.py --port" is absent from argv during
# that whole window). Matching the booting shell too prevents the supervisor
# from pkill-ing a process that is simply still cold-booting (restart storm).
# Liveness probe for SanadV3: succeeds when either the running dashboard
# ($SANAD_PAT) or the still-booting launcher script shows up in pgrep -f.
# Patterns are tried in that order; the status of the last pgrep is returned.
sanad_up() {
  local probe rc=1
  for probe in "$SANAD_PAT" "shell_scripts/start_sanad.sh"; do
    pgrep -f "$probe" >/dev/null 2>&1
    rc=$?
    if [ "$rc" -eq 0 ]; then
      break
    fi
  done
  return "$rc"
}
# Launch the web_nav3 stack in the background via setsid, with stdin detached
# and stdout/stderr captured in $LOG_DIR/web_nav3.log.
# Globals: WEBNAV3_HOME, LOG_DIR (read)
# Outputs: one "launching" line on stdout
start_webnav3() {
  local log="$LOG_DIR/web_nav3.log"
  # The redirect below truncates the log. On a supervisor restart that would
  # erase the output explaining why the previous instance died, so keep one
  # earlier generation. Copy instead of move: the live file stays in place and
  # a running `tail -f` on it keeps following.
  if [ -s "$log" ]; then
    cp -f -- "$log" "$log.prev" 2>/dev/null || true
  fi
  setsid bash "$WEBNAV3_HOME/scripts/start_web.sh" \
    > "$log" 2>&1 < /dev/null &
  echo " web_nav3 launching → $log"
}
# Launch the SanadV3 dashboard in the background via setsid, with stdin
# detached and stdout/stderr captured in $LOG_DIR/sanadv3.log. SANAD_HOME and
# PORT are passed to the child through its environment.
# Globals: SANAD_HOME, SANAD_PORT, LOG_DIR (read)
# Outputs: one "launching" line on stdout
start_sanad() {
  local log="$LOG_DIR/sanadv3.log"
  # The redirect below truncates the log. On a supervisor restart that would
  # erase the output explaining why the previous instance died, so keep one
  # earlier generation. Copy instead of move: the live file stays in place and
  # a running `tail -f` on it keeps following.
  if [ -s "$log" ]; then
    cp -f -- "$log" "$log.prev" 2>/dev/null || true
  fi
  SANAD_HOME="$SANAD_HOME" PORT="$SANAD_PORT" \
    setsid bash "$SANAD_HOME/shell_scripts/start_sanad.sh" \
    > "$log" 2>&1 < /dev/null &
  echo " SanadV3 launching → $log"
}
# Force-kill (SIGKILL) the web_nav3 stack by command-line pattern: first the
# start_web.sh launcher, then the backend / rosbridge / map-relay processes.
# Always returns 0, whether or not anything matched.
kill_webnav3() {
  local victim
  for victim in \
    "scripts/start_web.sh" \
    "web/backend.py|rosbridge_websocket|start_rosbridge.sh|start_map_relay.sh|map_relay.py"
  do
    pkill -9 -f "$victim" 2>/dev/null || true
  done
}
# ---- teardown on Ctrl+C / TERM ----
# Installed BEFORE pre-flight/launch so an interrupt during the "clearing
# stale instances" sleep or right after a setsid child is spawned still
# triggers a full teardown (otherwise those detached children would be
# orphaned with no supervisor).
# Re-entrancy guard: a second signal arriving during teardown is a no-op.
_cleaning=0

# Signal handler: stop what this launcher manages, then exit 0.
# Globals: _cleaning (read/written), SANAD_PAT, NO_WEBNAV3 (read)
#
# With NO_WEBNAV3=1 the launcher never starts web_nav3 and pre-flight leaves it
# untouched, so teardown skips every web_nav3 step as well. Otherwise Ctrl+C
# would take down a nav stack that is being run independently.
cleanup() {
  [ "$_cleaning" = 1 ] && return
  _cleaning=1

  local with_nav=1
  if [ "${NO_WEBNAV3:-0}" = 1 ]; then
    with_nav=0
  fi

  echo
  echo "── stopping everything ──"

  # 1. ask the backend to stop the BRINGUP cleanly (it owns the pgid, so a
  #    killpg takes down rtabmap + all Nav2 nodes in one shot).
  if [ "$with_nav" = 1 ]; then
    curl -s -m 8 -X POST "http://127.0.0.1:8765/api/shutdown" >/dev/null 2>&1 || true
  fi

  # 2. graceful TERM: SanadV3 (clean loco stop) + start_web.sh (own trap).
  pkill -TERM -f "$SANAD_PAT" 2>/dev/null || true
  if [ "$with_nav" = 1 ]; then
    pkill -TERM -f "scripts/start_web.sh" 2>/dev/null || true
  fi
  sleep 3

  # 3. force-kill whatever ignored the TERM.
  if [ "$with_nav" = 1 ]; then
    kill_webnav3
  fi
  pkill -9 -f "$SANAD_PAT" 2>/dev/null || true
  pkill -9 -f "shell_scripts/start_sanad.sh" 2>/dev/null || true

  # 4. belt-and-suspenders — any orphaned bringup tree (if the backend was
  #    already gone in step 1, killpg never ran).
  if [ "$with_nav" = 1 ]; then
    pkill -9 -f "scripts/bringup.sh|run_on_jetson.sh|g1_nav2_slam" 2>/dev/null || true
    pkill -9 -f "rtabmap_slam/rtabmap|livox_pcd_bridge|dog_odom_to_tf|pointcloud_to_laserscan" 2>/dev/null || true
  fi

  echo " done."
  exit 0
}
trap cleanup INT TERM
# ---- pre-flight: clear any stale instances so we start clean ----
# web_nav3 is only touched when this launcher is going to manage it; a stale
# SanadV3 launcher script or dashboard on our port is always force-killed.
echo "── clearing any stale instances … ──"
if [ "$NO_WEBNAV3" != 1 ]; then
  kill_webnav3
fi
for _stale_pat in "shell_scripts/start_sanad.sh" "$SANAD_PAT"; do
  pkill -9 -f "$_stale_pat" 2>/dev/null || true
done
sleep 2
# ---- launch ----
# The nav stack goes first (unless skipped), then the dashboard.
case "$NO_WEBNAV3" in
  1) ;;
  *)
    echo "[1/2] starting web_nav3 nav stack …"
    start_webnav3
    ;;
esac
echo "[2/2] starting SanadV3 dashboard …"
start_sanad
# ---- wait for ports, then report ----
echo
echo "── waiting for services to bind (up to ~80s) … ──"
# Number of lines in the listening-TCP table (`ss -tnln`) containing $1.
_listeners() { ss -tnln 2>/dev/null | grep -c "$1"; }
# Poll up to 40 times, 2 s apart. Stop as soon as the dashboard port is bound
# and, unless web_nav3 is skipped, so are the backend (:8765) and rosbridge
# (:9090) ports. Falls through silently when the attempts run out.
_attempt=0
while [ "$_attempt" -lt 40 ]; do
  _attempt=$((_attempt + 1))
  n_sanad=$(_listeners ":$SANAD_PORT ")
  n_backend=$(_listeners ':8765 ')
  n_bridge=$(_listeners ':9090 ')
  if [ "$NO_WEBNAV3" = 1 ]; then
    if [ "$n_sanad" -ge 1 ]; then
      break
    fi
  elif [ "$n_sanad" -ge 1 ] && [ "$n_backend" -ge 1 ] && [ "$n_bridge" -ge 1 ]; then
    break
  fi
  sleep 2
done
# First address printed by `hostname -I`; only used to build the URLs shown in
# the status report. Left empty when the command prints nothing.
read -r IP _rest < <(hostname -I)
# Print "UP" when at least one line of the listening-TCP table (`ss -tnln`)
# contains the text given as $1, otherwise print "DOWN".
st() {
  local hits
  hits=$(ss -tnln 2>/dev/null | grep -c "$1")
  if [ "$hits" -ge 1 ]; then
    echo UP
  else
    echo DOWN
  fi
}
# Status report: per-port UP/DOWN, the URLs to open, and where the logs are.
# Every web_nav3 line is left out when NO_WEBNAV3=1.
if [ "$NO_WEBNAV3" != 1 ]; then
  _show_nav=true
else
  _show_nav=false
fi
echo
echo "════════════════════════════════════════════════"
echo " STATUS"
if $_show_nav; then
  echo " web_nav3 backend :8765 $(st ':8765 ')"
  echo " rosbridge :9090 $(st ':9090 ')"
fi
echo " SanadV3 dashboard :$SANAD_PORT $(st ":$SANAD_PORT ")"
echo
echo " OPEN: http://$IP:$SANAD_PORT (full dashboard)"
if $_show_nav; then
  echo " http://$IP:8765 (web_nav3 nav only)"
fi
echo
echo " logs: tail -f $LOG_DIR/sanadv3.log"
if $_show_nav; then
  echo " tail -f $LOG_DIR/web_nav3.log"
fi
echo " >>> Ctrl+C here stops EVERYTHING <<<"
echo "════════════════════════════════════════════════"
# ---- supervise: restart a child ONLY if it truly died (pgrep), with grace ----

# Interruptible sleep. Bash does not run a trap while it is waiting for a
# foreground command, so a plain `sleep 45` would delay the TERM handler by up
# to 45 s. The `wait` builtin returns as soon as a trapped signal arrives, so
# the handler runs immediately. Wait on the specific pid: a bare `wait` would
# also block on the detached children started by this script.
_nap() {
  sleep "$1" &
  wait "$!"
}

# Runs until a signal handler exits the script. Each pass waits 10 s, then
# relaunches whichever stack has no matching process left.
while true; do
  _nap 10
  if [ "$NO_WEBNAV3" != 1 ] && ! webnav3_up; then
    echo "[supervisor] $(date +%T) web_nav3 down — restarting"
    kill_webnav3 # clear any half-dead remnants first
    _nap 1
    start_webnav3
    _nap 15 # grace: let it boot before the next check
  fi
  if ! sanad_up; then
    echo "[supervisor] $(date +%T) SanadV3 down — restarting"
    pkill -9 -f "shell_scripts/start_sanad.sh" 2>/dev/null || true
    _nap 1
    start_sanad
    # grace MUST exceed worst-case cold boot: up to 20s DDS-iface wait +
    # conda activate + heavy lazy imports (torch/transformers/gemini) +
    # arm.init before uvicorn binds. 15s was shorter than that and caused a
    # restart storm (kill a still-booting instance, relaunch, repeat).
    _nap 45
  fi
done