165 lines
7.5 KiB
Bash
165 lines
7.5 KiB
Bash
#!/usr/bin/env bash
|
|
#
|
|
# start_all.sh — ONE command to launch the entire Sanad humanoid stack:
|
|
#
|
|
# 1. web_nav3 nav stack → backend :8765 + rosbridge :9090 + map_relay
|
|
# + internal rosbridge watchdog
|
|
# (runs in its OWN g1_lidar conda env + CycloneDDS)
|
|
#
|
|
# 2. SanadV3 dashboard → Gemini voice + LED face + Nav tabs, :8001
|
|
# (runs in its OWN gemini_sdk conda env + Unitree SDK on eth0)
|
|
#
|
|
# The two stacks use incompatible Python envs + DDS configs, so they CANNOT
|
|
# share a process. This launcher starts each detached, supervises the pair
|
|
# (restarts one only if it truly dies), and Ctrl+C here stops EVERYTHING.
|
|
#
|
|
# Liveness is checked by PROCESS PATTERN (pgrep), NOT by a captured PID:
|
|
# `setsid ... &` returns the ephemeral setsid-wrapper pid which exits
|
|
# instantly, so a pid check would false-fire and restart-storm (which then
|
|
# makes each failed relaunch's cleanup trap nuke the working rosbridge).
|
|
#
|
|
# Usage: bash ~/Sanadv3/shell_scripts/start_all.sh
|
|
# Env knobs (optional): WEBNAV3_HOME, SANAD_HOME, PORT, LOG_DIR, NO_WEBNAV3=1
|
|
|
|
# Treat expansion of an unset variable as an error. NOTE: `-e` is not enabled;
# several later commands (`grep -c`, `[ … ] && …`) routinely return non-zero.
set -u

# ---- configuration (every value can be overridden from the environment) ----
WEBNAV3_HOME="${WEBNAV3_HOME:-$HOME/web_nav3}"  # root of the web_nav3 checkout
SANAD_HOME="${SANAD_HOME:-$HOME/Sanadv3}"       # root of the SanadV3 checkout
SANAD_PORT="${PORT:-8001}"                      # dashboard port (env knob: PORT)
LOG_DIR="${LOG_DIR:-$HOME/sanad_logs}"          # both stacks log into this dir
NO_WEBNAV3="${NO_WEBNAV3:-0}"                   # 1 → do not launch web_nav3

# Both launch functions redirect their output into LOG_DIR. If the directory
# cannot be created those redirections fail, nothing is started, and the
# supervisor loop would keep relaunching forever — so fail fast instead.
if ! mkdir -p -- "$LOG_DIR"; then
  echo "start_all.sh: cannot create log directory: $LOG_DIR" >&2
  exit 1
fi
|
|
|
|
# Startup banner: prints the resolved configuration. The web_nav3 line gets a
# "(skipped)" suffix when NO_WEBNAV3=1 (and a bare trailing space otherwise).
print_banner() {
  local rule="════════════════════════════════════════════════"
  local webnav3_note=""
  if [ "$NO_WEBNAV3" = 1 ]; then
    webnav3_note='(skipped)'
  fi
  printf '%s\n' \
    "$rule" \
    " Sanad humanoid — full stack launcher" \
    " web_nav3 : $WEBNAV3_HOME $webnav3_note" \
    " SanadV3 : $SANAD_HOME (port $SANAD_PORT)" \
    " logs : $LOG_DIR" \
    "$rule"
}
print_banner
|
|
|
|
# liveness by process pattern (robust — no pid tracking)
|
|
# NOTE: start_sanad.sh runs `cd $SANAD_HOME && exec python3 main.py --port N`, so the
|
|
# process cmdline is "python3 main.py --port <PORT>" (no Sanadv3/ in argv). Match
|
|
# on the PORT — also keeps us from ever touching the live Sanad on :8000.
|
|
# Extended-regex pattern handed to `pgrep -f` / `pkill -f` to find the SanadV3
# process. The dot is escaped and the port must be followed by a space or the
# end of the command line, so e.g. port 800 can never match a process that was
# started with --port 8000 or --port 8001.
SANAD_PAT="main\\.py --port ${SANAD_PORT}( |\$)"
|
|
# Liveness probe for the web_nav3 stack.
# Returns: pgrep's exit status — 0 when some process command line matches
#          "web/backend.py", non-zero otherwise. Prints nothing.
webnav3_up() {
  local backend_pat="web/backend.py"
  pgrep -f "$backend_pat" &>/dev/null
}
|
|
# Count a SanadV3 as "up" if EITHER main.py is bound OR start_sanad.sh is still
|
|
# mid-boot (conda activate + up-to-20s DDS-iface wait + heavy lazy imports come
|
|
# BEFORE `exec python3 main.py`, so "main.py --port" is absent from argv during
|
|
# that whole window). Matching the booting shell too prevents the supervisor
|
|
# from pkill-ing a process that is simply still cold-booting (restart storm).
|
|
# Liveness probe for SanadV3.
# Globals: SANAD_PAT (read)
# Returns: 0 when a process matches SANAD_PAT, or — failing that — when the
#          start_sanad.sh wrapper is still running; otherwise the non-zero
#          status of the second pgrep. Prints nothing.
sanad_up() {
  if pgrep -f "$SANAD_PAT" &>/dev/null; then
    return 0
  fi
  # Fall back to the launcher script so a still-booting instance counts as up.
  pgrep -f "shell_scripts/start_sanad.sh" &>/dev/null
}
|
|
|
|
# Launch the web_nav3 start script in the background via setsid.
# Globals: WEBNAV3_HOME, LOG_DIR (read)
# Outputs: one "launching" line on stdout; the child's stdout+stderr are
#          written (truncating) to $LOG_DIR/web_nav3.log, stdin is /dev/null.
start_webnav3() {
  local launcher="$WEBNAV3_HOME/scripts/start_web.sh"
  local logfile="$LOG_DIR/web_nav3.log"
  setsid bash "$launcher" < /dev/null > "$logfile" 2>&1 &
  echo " web_nav3 launching → $logfile"
}
|
|
# Launch the SanadV3 start script in the background via setsid, passing
# SANAD_HOME and PORT to it through the environment.
# Globals: SANAD_HOME, SANAD_PORT, LOG_DIR (read)
# Outputs: one "launching" line on stdout; the child's stdout+stderr are
#          written (truncating) to $LOG_DIR/sanadv3.log, stdin is /dev/null.
start_sanad() {
  local launcher="$SANAD_HOME/shell_scripts/start_sanad.sh"
  local logfile="$LOG_DIR/sanadv3.log"
  SANAD_HOME="$SANAD_HOME" PORT="$SANAD_PORT" \
    setsid bash "$launcher" < /dev/null > "$logfile" 2>&1 &
  echo " SanadV3 launching → $logfile"
}
|
|
|
|
# SIGKILL every process belonging to the web_nav3 stack, matched by command
# line. The start_web.sh wrapper is killed first, then the backend, rosbridge
# and map-relay processes. Best effort: "nothing matched" is not an error and
# the function always returns 0.
kill_webnav3() {
  local pattern
  for pattern in \
    "scripts/start_web.sh" \
    "web/backend.py|rosbridge_websocket|start_rosbridge.sh|start_map_relay.sh|map_relay.py"
  do
    pkill -9 -f "$pattern" 2>/dev/null || true
  done
}
|
|
|
|
# ---- teardown on Ctrl+C / TERM ----
|
|
# Installed BEFORE pre-flight/launch so an interrupt during the "clearing
|
|
# stale instances" sleep or right after a setsid child is spawned still
|
|
# triggers a full teardown (otherwise those detached children would be
|
|
# orphaned with no supervisor).
|
|
# Set to 1 once teardown has begun, so a second signal cannot re-enter it.
_cleaning=0

#######################################
# Signal handler: stop everything this launcher manages, then exit 0.
# When NO_WEBNAV3=1 the web_nav3 stack is left untouched — the launcher never
# started it, and pre-flight/supervision do not touch it in that mode either.
# Globals:  _cleaning (read/written), NO_WEBNAV3, SANAD_PAT (read)
# Outputs:  progress lines on stdout
# Returns:  does not return on the first call (exit 0); returns immediately
#           when teardown is already in progress.
#######################################
cleanup() {
  if [ "$_cleaning" = 1 ]; then
    return
  fi
  _cleaning=1
  echo
  echo "── stopping everything ──"

  if [ "$NO_WEBNAV3" != 1 ]; then
    # 1. ask the backend to stop the BRINGUP cleanly (it owns the pgid, so a
    #    killpg takes down rtabmap + all Nav2 nodes in one shot). Capped at 8s.
    curl -s -m 8 -X POST "http://127.0.0.1:8765/api/shutdown" >/dev/null 2>&1 || true
  fi

  # 2. graceful TERM: SanadV3 (clean loco stop) + start_web.sh (own trap).
  pkill -TERM -f "$SANAD_PAT" 2>/dev/null || true
  if [ "$NO_WEBNAV3" != 1 ]; then
    pkill -TERM -f "scripts/start_web.sh" 2>/dev/null || true
  fi
  sleep 3

  # 3. force-kill whatever is left of the web_nav3 stack + SanadV3.
  if [ "$NO_WEBNAV3" != 1 ]; then
    kill_webnav3
  fi
  pkill -9 -f "$SANAD_PAT" 2>/dev/null || true
  pkill -9 -f "shell_scripts/start_sanad.sh" 2>/dev/null || true

  # 4. belt-and-suspenders — any orphaned bringup tree (if the backend was
  #    already gone in step 1, killpg never ran).
  if [ "$NO_WEBNAV3" != 1 ]; then
    pkill -9 -f "scripts/bringup.sh|run_on_jetson.sh|g1_nav2_slam" 2>/dev/null || true
    pkill -9 -f "rtabmap_slam/rtabmap|livox_pcd_bridge|dog_odom_to_tf|pointcloud_to_laserscan" 2>/dev/null || true
  fi

  echo " done."
  exit 0
}
trap cleanup INT TERM
|
|
|
|
# ---- pre-flight: clear any stale instances so we start clean ----
|
|
# Pre-flight: SIGKILL leftovers from a previous run so both stacks start clean.
# The web_nav3 stack is only touched when this launcher is going to manage it.
echo "── clearing any stale instances … ──"
if [ "$NO_WEBNAV3" != 1 ]; then
  kill_webnav3
fi
# Best effort: "no such process" is expected on a clean machine.
pkill -9 -f "shell_scripts/start_sanad.sh" 2>/dev/null || :
pkill -9 -f "$SANAD_PAT" 2>/dev/null || :
# Short pause before relaunching.
sleep 2
|
|
|
|
# ---- launch ----
|
|
# Launch phase: web_nav3 first (unless skipped), then the SanadV3 dashboard.
if [ "$NO_WEBNAV3" != 1 ]; then
  echo "[1/2] starting web_nav3 nav stack …"
  start_webnav3
fi
echo "[2/2] starting SanadV3 dashboard …"
start_sanad
|
|
|
|
# ---- wait for ports, then report ----
|
|
echo
echo "── waiting for services to bind (up to ~80s) … ──"

# Ports that must be listening before the status report is printed: always the
# dashboard, plus the web_nav3 backend and rosbridge when that stack is managed.
_wait_ports=("$SANAD_PORT")
if [ "$NO_WEBNAV3" != 1 ]; then
  _wait_ports+=(8765 9090)
fi

# Poll up to 40 times, 2s apart. One `ss` snapshot per poll is shared by all
# port checks. Falling out of the loop without success is not an error — the
# status report simply shows the missing service as DOWN.
for (( _try = 1; _try <= 40; _try++ )); do
  _listening=$(ss -tnln 2>/dev/null)
  _all_up=1
  for _port in "${_wait_ports[@]}"; do
    # ":<port> " — the leading colon and trailing space pin the exact port.
    if ! grep -q ":$_port " <<<"$_listening"; then
      _all_up=0
      break
    fi
  done
  if [ "$_all_up" = 1 ]; then
    break
  fi
  sleep 2
done
|
|
|
|
# First address reported by `hostname -I`, used to build the URLs printed in
# the status report. Falls back to loopback when the command is unavailable or
# reports no address, so the report never prints a host-less "http://:PORT".
IP=$(hostname -I 2>/dev/null | awk '{print $1}')
IP=${IP:-127.0.0.1}
|
|
# Report whether a listening TCP socket matches a pattern.
# Arguments: $1 - grep pattern applied to `ss -tnln` output (e.g. ':8765 ')
# Outputs:   "UP" if at least one line matches, otherwise "DOWN"
st() {
  local hits
  hits=$(ss -tnln 2>/dev/null | grep -c "$1")
  if [ "$hits" -ge 1 ]; then
    echo UP
  else
    echo DOWN
  fi
}
|
|
# ---- status report ----
# Prints UP/DOWN per service (via st), the URLs to open, and where the logs
# are. Lines about web_nav3 are omitted when NO_WEBNAV3=1.
printf '\n'
printf '%s\n' "════════════════════════════════════════════════"
printf '%s\n' " STATUS"
if [ "$NO_WEBNAV3" != 1 ]; then
  printf '%s\n' " web_nav3 backend :8765 $(st ':8765 ')"
  printf '%s\n' " rosbridge :9090 $(st ':9090 ')"
fi
printf '%s\n' " SanadV3 dashboard :$SANAD_PORT $(st ":$SANAD_PORT ")"
printf '\n'
printf '%s\n' " OPEN: http://$IP:$SANAD_PORT (full dashboard)"
if [ "$NO_WEBNAV3" != 1 ]; then
  printf '%s\n' " http://$IP:8765 (web_nav3 nav only)"
fi
printf '\n'
printf '%s\n' " logs: tail -f $LOG_DIR/sanadv3.log"
if [ "$NO_WEBNAV3" != 1 ]; then
  printf '%s\n' " tail -f $LOG_DIR/web_nav3.log"
fi
printf '%s\n' " >>> Ctrl+C here stops EVERYTHING <<<"
printf '%s\n' "════════════════════════════════════════════════"
|
|
|
|
# ---- supervise: restart a child ONLY if it truly died (pgrep), with grace ----
|
|
# Supervisor: every 10s check both stacks by process pattern and relaunch one
# only when its probe reports it gone. Runs until the INT/TERM trap exits.
while :; do
  sleep 10

  # web_nav3 is supervised only when this launcher manages it.
  if [ "$NO_WEBNAV3" != 1 ]; then
    if ! webnav3_up; then
      echo "[supervisor] $(date +%T) web_nav3 down — restarting"
      kill_webnav3   # clear any half-dead remnants first
      sleep 1
      start_webnav3
      sleep 15       # grace: let it boot before the next check
    fi
  fi

  if sanad_up; then
    continue
  fi

  echo "[supervisor] $(date +%T) SanadV3 down — restarting"
  pkill -9 -f "shell_scripts/start_sanad.sh" 2>/dev/null || true
  sleep 1
  start_sanad
  # grace MUST exceed worst-case cold boot: up to 20s DDS-iface wait +
  # conda activate + heavy lazy imports (torch/transformers/gemini) +
  # arm.init before uvicorn binds. 15s was shorter than that and caused a
  # restart storm (kill a still-booting instance, relaunch, repeat).
  sleep 45
done
|