# File-viewer header captured with the source: 324 lines · 11 KiB · Python

"""WebSocket → PTY bridge for the dashboard's Terminal tab.
Spawns a shell (bash by default) inside a pseudo-terminal on the robot and
relays stdin/stdout to a browser xterm.js instance over WebSocket. From the
operator's seat this is functionally identical to an in-browser
`ssh unitree@<robot>` — except no SSH handshake is needed because the
dashboard process already runs as unitree on the robot. The Terminal tab
connects to ws://<dashboard>/ws/terminal and you land in unitree's shell
directly.
PROTOCOL — text frames only. Control frames vs. keystrokes are disambiguated
by the leading byte:

    client → server:
        "\\x1f" + json-encoded control object    (init / resize)
            e.g. "\\x1f{\\"type\\":\\"init\\",\\"cols\\":80,\\"rows\\":24}"
        <any other text>                          keystrokes — written to PTY
    server → client:
        <text>                                    PTY stdout/stderr chunks

The \\x1f prefix (ASCII Unit Separator) is the disambiguator. If we just
JSON-sniffed every message, a user pasting `{"type":"resize",...}` into
their shell would silently resize the PTY instead of pasting the text.
SECURITY NOTE: anyone who can reach the dashboard URL gets shell access
as the unitree user. The dashboard already exposes equally-powerful
endpoints (E-STOP, motion replay, audio mute, etc.) so this isn't a new
threat class — but it IS a single-bullet kill switch for the robot. Bind
the dashboard to a trusted network only.
"""
from __future__ import annotations

import asyncio
import codecs
import fcntl
import json
import os
import pty
import select
import shutil
import signal
import struct
import termios
import threading

from fastapi import APIRouter, WebSocket, WebSocketDisconnect

from Project.Sanad.core.logger import get_logger

# Logger obtained from the project's get_logger helper under the name
# "terminal_ws".
log = get_logger("terminal_ws")
# Router that carries the /ws/terminal endpoint defined in this module.
# (Where it gets mounted is not visible from this file.)
router = APIRouter()
# Magic prefix that distinguishes control messages from raw keystrokes.
# ASCII 0x1F (Unit Separator) — not produced by normal keyboard input,
# so user-pasted JSON can never spoof a control frame.
_CTRL_PREFIX = "\x1f"
# Concurrent-session cap so a runaway tab can't spawn 50 bashes on the robot.
_MAX_SESSIONS = 4
# PIDs of the shell children belonging to currently open sessions: a PID is
# added right after a successful fork and discarded during session teardown.
_active: set[int] = set()
# Guards every read/write of _active.
_active_lock = threading.Lock()
# Bounded queue depth between the PTY reader thread and the WS sender.
# A chatty shell command (e.g. `yes`, `cat /dev/urandom`) at gigabytes/sec
# would otherwise pile up an unbounded backlog of pending output. Past the
# cap we drop chunks and surface a single drop notice — ANSI may corrupt
# briefly but the session stays alive.
_SEND_QUEUE_MAX = 64
def _resolve_shell() -> list[str]:
    """Return the argv used to launch the interactive session shell.

    Preference order: the ``SHELL`` environment variable (only when it
    resolves to an executable), then ``/bin/bash``, then ``/bin/sh``. The
    chosen shell is always started with ``-i``.
    """
    preferred = os.environ.get("SHELL", "")
    if preferred and shutil.which(preferred):
        shell = preferred
    elif shutil.which("/bin/bash"):
        shell = "/bin/bash"
    else:
        # Last resort; not probed, mirroring the fallback of last resort.
        shell = "/bin/sh"
    return [shell, "-i"]
def _set_pty_size(fd: int, cols: int, rows: int) -> None:
    """Apply a new window size to the PTY behind *fd*.

    Full-screen programs (htop, less, vim) rely on the size to lay
    themselves out. This is best-effort: any failure is logged at debug
    level and otherwise ignored.
    """
    x_pixels = y_pixels = 0  # pixel dimensions are unused, kept 0
    try:
        # struct winsize field order: rows, cols, xpixel, ypixel.
        winsize = struct.pack("HHHH", rows, cols, x_pixels, y_pixels)
        fcntl.ioctl(fd, termios.TIOCSWINSZ, winsize)
    except Exception as exc:
        log.debug("TIOCSWINSZ failed (cols=%s rows=%s): %s", cols, rows, exc)
async def _reap_child(pid: int) -> None:
    """Terminate and reap the session's child process.

    Escalation ladder: SIGHUP, poll for exit for up to ~1.5s, SIGKILL, poll
    for up to ~1.0s, then log a warning and give up. Polling after SIGHUP
    (rather than checking once, immediately) is what gives a well-behaved
    shell the chance to exit before it is force-killed.
    """
    loop = asyncio.get_running_loop()

    async def _exited_within(timeout_s: float, interval_s: float = 0.1) -> bool:
        """Poll waitpid(WNOHANG) until the child is reaped or time runs out."""
        deadline = loop.time() + timeout_s
        while loop.time() < deadline:
            try:
                reaped_pid, _status = os.waitpid(pid, os.WNOHANG)
            except ChildProcessError:
                # Nothing left to wait for — someone already reaped it.
                return True
            except OSError:
                return False
            if reaped_pid:
                return True
            await asyncio.sleep(interval_s)
        return False

    def _deliver(sig: int, failure_fmt: str) -> bool:
        """Send *sig*; return False when there is no point in waiting."""
        try:
            os.kill(pid, sig)
        except ProcessLookupError:
            return False
        except OSError as exc:
            log.debug(failure_fmt, pid, exc)
            return False
        return True

    # Stage 1: polite hang-up.
    if not _deliver(signal.SIGHUP, "SIGHUP pid=%d: %s"):
        return
    if await _exited_within(1.5):
        return

    # Stage 2: force.
    if not _deliver(signal.SIGKILL, "SIGKILL pid=%d: %s"):
        return
    if await _exited_within(1.0):
        return
    log.warning("terminal child pid=%d failed to exit after SIGKILL", pid)
def _clamp_dim(value, default: int) -> int:
    """Coerce a client-supplied terminal dimension to a usable value.

    Control frames come from the browser and are untrusted: the field may be
    missing, non-numeric, zero, negative, or larger than the unsigned 16-bit
    field TIOCSWINSZ carries. Unusable values fall back to *default*;
    oversized ones are capped at 65535.
    """
    try:
        dim = int(value)
    except (TypeError, ValueError, OverflowError):
        return default
    if dim <= 0:
        return default
    return min(dim, 0xFFFF)


@router.websocket("/ws/terminal")
async def terminal_ws(ws: WebSocket) -> None:
    """Bridge a browser xterm.js to a shell PTY on the robot.

    Lifecycle:
      1. Accept the socket and refuse it (close code 1008) when
         ``_MAX_SESSIONS`` shells are already running.
      2. ``pty.fork()`` a shell; the parent keeps the PTY master fd.
      3. A daemon thread drains the master fd into a bounded queue; a sender
         task forwards the queue to the WebSocket, dropping (and reporting)
         chunks when the client cannot keep up.
      4. The main loop reads text frames: frames starting with
         ``_CTRL_PREFIX`` are JSON control messages (init / resize) and are
         never forwarded; everything else is written to the PTY.
      5. Teardown stops the helpers, reaps the child, closes the fd and
         releases the session slot.
    """
    await ws.accept()

    # Concurrent-session guard. Only the membership check happens under the
    # lock: _active_lock is a threading.Lock, and awaiting while holding it
    # would let a second connection block the event-loop thread on acquire
    # while this coroutine can never resume to release it.
    with _active_lock:
        at_capacity = len(_active) >= _MAX_SESSIONS
    if at_capacity:
        await ws.send_text(
            f"\r\n[terminal] Refused — already have {_MAX_SESSIONS} "
            f"open sessions. Close another tab and reconnect.\r\n"
        )
        await ws.close(code=1008)
        return

    # Fork + exec the shell. Parent gets the master fd; child becomes the
    # shell with stdin/stdout/stderr wired to the slave end. There is no
    # await between the capacity check above and _active.add below, so the
    # cap cannot be overshot by handlers running on this event loop.
    cmd = _resolve_shell()
    try:
        pid, fd = pty.fork()
    except OSError as exc:
        log.error("pty.fork failed: %s", exc)
        await ws.send_text(f"\r\n[terminal] pty.fork failed: {exc}\r\n")
        await ws.close(code=1011)
        return

    if pid == 0:
        # CHILD — set env so the shell is interactive and looks right, then
        # exec. Whatever happens, the child must never fall through into the
        # server's code, hence the unconditional _exit in `finally`.
        try:
            os.environ.setdefault("TERM", "xterm-256color")
            os.environ.setdefault("LANG", "en_US.UTF-8")
            os.execvp(cmd[0], cmd)
        except OSError as exc:
            # exec failed — printing to fd 2 reaches the parent via the
            # PTY so the browser sees the error before we _exit.
            try:
                os.write(2, f"[terminal] exec failed: {exc}\n".encode())
            except OSError:
                pass
        finally:
            os._exit(127)

    # PARENT
    with _active_lock:
        _active.add(pid)
    log.info("terminal session started pid=%d cmd=%s", pid, cmd[0])

    loop = asyncio.get_running_loop()
    closed = asyncio.Event()          # session-over flag, loop thread only
    stop_reader = threading.Event()   # thread-safe stop flag for the reader

    # Bounded queue + dedicated sender task = backpressure. If the queue
    # fills up we drop the chunk and bump `dropped` so we can surface a
    # short notice in the stream.
    send_q: asyncio.Queue[str] = asyncio.Queue(maxsize=_SEND_QUEUE_MAX)
    dropped = 0

    # Incremental decoder: a multi-byte UTF-8 character can straddle two
    # 4096-byte reads; decoding each chunk on its own would turn both halves
    # into U+FFFD.
    decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")

    def _enqueue(text: str) -> None:
        """Queue one output chunk (runs on the loop); drop + count when full."""
        nonlocal dropped
        try:
            send_q.put_nowait(text)
        except asyncio.QueueFull:
            dropped += 1

    def _reader_thread() -> None:
        """Drain PTY master fd → queue from a daemon thread.

        select() uses a 0.1s timeout so the stop flag is noticed promptly.
        """
        try:
            while not stop_reader.is_set():
                try:
                    ready, _, _ = select.select([fd], [], [], 0.1)
                except (OSError, ValueError):
                    break
                if not ready:
                    continue
                try:
                    data = os.read(fd, 4096)
                except OSError:
                    break
                if not data:  # EOF — child exited / PTY closed
                    break
                text = decoder.decode(data)
                if not text:
                    # Chunk ended mid-character; wait for the remaining bytes.
                    continue
                try:
                    loop.call_soon_threadsafe(_enqueue, text)
                except RuntimeError:
                    # loop closed — bail
                    break
        finally:
            try:
                loop.call_soon_threadsafe(closed.set)
            except RuntimeError:
                pass  # loop already closed; nobody is left to notify

    async def _sender_task() -> None:
        """Drain send_q → WebSocket; a send failure ends the session."""
        nonlocal dropped
        while not closed.is_set():
            try:
                text = await asyncio.wait_for(send_q.get(), timeout=0.5)
            except asyncio.TimeoutError:
                continue
            try:
                await ws.send_text(text)
            except Exception as exc:
                log.info("terminal ws.send failed (likely client gone): %s", exc)
                closed.set()
                return
            # If we dropped chunks since the last successful send, tell
            # the user once so the ANSI corruption isn't mysterious.
            if dropped:
                d = dropped
                dropped = 0
                try:
                    await ws.send_text(
                        f"\r\n\x1b[2m[term: dropped {d} chunk(s) — slow client]"
                        f"\x1b[0m\r\n",
                    )
                except Exception:
                    closed.set()
                    return

    reader = threading.Thread(target=_reader_thread, daemon=True,
                              name=f"terminal-rx-{pid}")
    reader.start()
    sender = asyncio.create_task(_sender_task())

    # Initial sizing — xterm.js will send a {type:"init",...} control
    # frame right after onopen with the actual window size.
    _set_pty_size(fd, 80, 24)

    try:
        while not closed.is_set():
            try:
                msg = await asyncio.wait_for(ws.receive_text(), timeout=0.5)
            except asyncio.TimeoutError:
                continue
            except WebSocketDisconnect:
                break
            if not msg:
                continue

            # Control frame? Must start with the magic prefix. User-typed
            # / pasted text can never spoof this — \x1f isn't producible
            # by normal keyboard input.
            if msg[:1] == _CTRL_PREFIX:
                try:
                    ctrl = json.loads(msg[1:])
                except (ValueError, RecursionError):
                    # JSONDecodeError is a ValueError; malformed or absurdly
                    # nested control payloads are simply ignored.
                    ctrl = None
                if isinstance(ctrl, dict) and ctrl.get("type") in ("init", "resize"):
                    _set_pty_size(
                        fd,
                        _clamp_dim(ctrl.get("cols"), 80),
                        _clamp_dim(ctrl.get("rows"), 24),
                    )
                # Either way, control frames are NEVER forwarded to PTY.
                continue

            # Plain keystrokes — write to PTY master.
            try:
                os.write(fd, msg.encode("utf-8", errors="replace"))
            except OSError as exc:
                log.info("terminal pty write failed (child likely exited): %s", exc)
                break
    finally:
        closed.set()
        stop_reader.set()
        sender.cancel()
        try:
            # Await the cancelled task so it is fully finished (and any
            # exception it raised is retrieved) before we move on.
            await asyncio.gather(sender, return_exceptions=True)
            try:
                await _reap_child(pid)
            except Exception as exc:
                log.debug("reap_child pid=%d: %s", pid, exc)
            # Let the reader leave select()/read() before the fd is closed;
            # otherwise a recycled fd number could be read by the stale
            # thread. Joined off-loop so the event loop is never blocked.
            await loop.run_in_executor(None, reader.join, 1.0)
        finally:
            # Runs even if teardown itself is cancelled, so neither the fd
            # nor the session slot can leak.
            try:
                os.close(fd)
            except OSError:
                pass
            with _active_lock:
                _active.discard(pid)
            log.info("terminal session ended pid=%d", pid)
        try:
            await ws.close()
        except Exception:
            # Best-effort: the socket is usually already closed by now.
            pass