From 7121b82a1ecac882a7f446e6ac8ad7d903c2050d Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Mon, 20 Apr 2026 18:46:16 -0700 Subject: [PATCH 1/5] refactor(driver-vm): drop log-grep readiness; always run gvproxy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The VM driver no longer owns the Ready transition — the gateway-side SupervisorSessionObserver now promotes sandboxes to Ready when their supervisor session connects. Remove guest_ssh_ready() (a brittle grep over the serial console) and the ready_condition() helper. monitor_sandbox still watches the launcher child process and emits Error conditions on ProcessExited / ProcessPollFailed. Also always start gvproxy, not just when port_map is non-empty. With the supervisor-initiated relay migration in #867, the SSH port forward was dropped; that left port_map empty in the default path, which in turn skipped gvproxy startup, which left the guest with no eth0 and no route to the host gateway. The guest supervisor's outbound ConnectSupervisor stream needs gvproxy to reach host.containers.internal (rewritten to 192.168.127.1 inside the guest), so gvproxy is structurally required for any sandbox that talks to the gateway. Inline the gvproxy setup into an unconditional block that returns (guard, api_sock, forwarded_port_map), dropping the mutable plumbing the prior conditional form needed. Remove the now-dead VmContext::set_port_map wrapper; mark its libkrun FFI binding #[allow(dead_code)] so a future reintroduction doesn't need to touch the symbol table. 
--- crates/openshell-driver-vm/src/driver.rs | 71 ++++------------------- crates/openshell-driver-vm/src/ffi.rs | 5 ++ crates/openshell-driver-vm/src/runtime.rs | 35 +++++------ 3 files changed, 31 insertions(+), 80 deletions(-) diff --git a/crates/openshell-driver-vm/src/driver.rs b/crates/openshell-driver-vm/src/driver.rs index 8237ba03c..c3a975d10 100644 --- a/crates/openshell-driver-vm/src/driver.rs +++ b/crates/openshell-driver-vm/src/driver.rs @@ -403,16 +403,23 @@ impl VmDriver { snapshots } + /// Watch the launcher child process and surface errors as driver + /// conditions. + /// + /// The driver no longer owns the `Ready` transition — the gateway + /// promotes a sandbox to `Ready` the moment its supervisor session + /// lands (see `openshell-server/src/compute/mod.rs`). This loop only + /// handles the sad paths: the child process failing to start, exiting + /// abnormally, or becoming unpollable. Those still surface as driver + /// `Error` conditions so the gateway can reason about a dead VM. 
async fn monitor_sandbox(&self, sandbox_id: String) { - let mut ready_emitted = false; - loop { - let (process, state_dir) = { + let process = { let registry = self.registry.lock().await; let Some(record) = registry.get(&sandbox_id) else { return; }; - (record.process.clone(), record.state_dir.clone()) + record.process.clone() }; let exit_status = { @@ -469,16 +476,6 @@ impl VmDriver { return; } - if !ready_emitted && guest_ssh_ready(&state_dir).await { - if let Some(snapshot) = self - .set_snapshot_condition(&sandbox_id, ready_condition(), false) - .await - { - self.publish_snapshot(snapshot); - } - ready_emitted = true; - } - tokio::time::sleep(Duration::from_millis(250)).await; } } @@ -843,16 +840,6 @@ async fn terminate_vm_process(child: &mut Child) -> Result<(), std::io::Error> { } } -async fn guest_ssh_ready(state_dir: &Path) -> bool { - let console_log = state_dir.join("rootfs-console.log"); - let Ok(contents) = tokio::fs::read_to_string(console_log).await else { - return false; - }; - - contents.contains("SSH server is ready to accept connections") - || contents.contains("SSH server listening") -} - fn sandbox_snapshot(sandbox: &Sandbox, condition: SandboxCondition, deleting: bool) -> Sandbox { Sandbox { id: sandbox.id.clone(), @@ -895,16 +882,6 @@ fn provisioning_condition() -> SandboxCondition { } } -fn ready_condition() -> SandboxCondition { - SandboxCondition { - r#type: "Ready".to_string(), - status: "True".to_string(), - reason: "Listening".to_string(), - message: "Supervisor is listening for SSH connections".to_string(), - last_transition_time: String::new(), - } -} - fn deleting_condition() -> SandboxCondition { SandboxCondition { r#type: "Ready".to_string(), @@ -1214,32 +1191,6 @@ mod tests { let _ = std::fs::remove_dir_all(base); } - #[tokio::test] - async fn guest_ssh_ready_detects_guest_console_marker() { - let base = unique_temp_dir(); - std::fs::create_dir_all(&base).unwrap(); - std::fs::write( - base.join("rootfs-console.log"), - "...\nINFO 
openshell_sandbox: SSH server is ready to accept connections\n", - ) - .unwrap(); - - assert!(guest_ssh_ready(&base).await); - - let _ = std::fs::remove_dir_all(base); - } - - #[tokio::test] - async fn guest_ssh_ready_is_false_without_marker() { - let base = unique_temp_dir(); - std::fs::create_dir_all(&base).unwrap(); - std::fs::write(base.join("rootfs-console.log"), "sandbox booting\n").unwrap(); - - assert!(!guest_ssh_ready(&base).await); - - let _ = std::fs::remove_dir_all(base); - } - fn unique_temp_dir() -> PathBuf { static COUNTER: AtomicU64 = AtomicU64::new(0); let nanos = SystemTime::now() diff --git a/crates/openshell-driver-vm/src/ffi.rs b/crates/openshell-driver-vm/src/ffi.rs index 750788ac1..0391e048c 100644 --- a/crates/openshell-driver-vm/src/ffi.rs +++ b/crates/openshell-driver-vm/src/ffi.rs @@ -68,6 +68,11 @@ pub struct LibKrun { pub krun_set_root: KrunSetRoot, pub krun_set_workdir: KrunSetWorkdir, pub krun_set_exec: KrunSetExec, + /// Kept loaded for future use (e.g. exposing sandbox-requested ports + /// from the guest via libkrun's built-in port mapper instead of + /// gvproxy). Currently unused since all port forwarding is done by + /// gvproxy over its API socket. 
+ #[allow(dead_code)] pub krun_set_port_map: KrunSetPortMap, pub krun_set_console_output: KrunSetConsoleOutput, pub krun_start_enter: KrunStartEnter, diff --git a/crates/openshell-driver-vm/src/runtime.rs b/crates/openshell-driver-vm/src/runtime.rs index 9888feb18..b814a61d0 100644 --- a/crates/openshell-driver-vm/src/runtime.rs +++ b/crates/openshell-driver-vm/src/runtime.rs @@ -64,10 +64,15 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { vm.set_root(&config.rootfs)?; vm.set_workdir(&config.workdir)?; - let mut forwarded_port_map = config.port_map.clone(); - let mut gvproxy_guard = None; - let mut gvproxy_api_sock = None; - if !config.port_map.is_empty() { + // The guest supervisor opens an outbound `ConnectSupervisor` gRPC + // stream to the host gateway on startup and keeps it alive for the + // sandbox lifetime. Without gvproxy wired up, the VM has no eth0, + // loses DHCP, and cannot reach the gateway — so we always start + // gvproxy even when the caller doesn't request any explicit port + // forwards. (Prior to the supervisor-initiated relay migration the + // driver forwarded a host-side SSH port and gvproxy ran incidentally + // as a byproduct; that implicit setup is gone now.) 
+ let (gvproxy_guard, gvproxy_api_sock, forwarded_port_map) = { let gvproxy_binary = runtime_dir.join("gvproxy"); if !gvproxy_binary.is_file() { return Err(format!( @@ -91,7 +96,6 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { .map_err(|e| format!("create gvproxy log {}: {e}", gvproxy_log.display()))?; let gvproxy_ports = plan_gvproxy_ports(&config.port_map)?; - forwarded_port_map = gvproxy_ports.forwarded_ports; #[cfg(target_os = "linux")] let (gvproxy_net_flag, gvproxy_net_url) = @@ -142,13 +146,13 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { vm.add_net_unixgram(&net_sock, &mac, COMPAT_NET_FEATURES, NET_FLAG_VFKIT)?; } - gvproxy_guard = Some(GvproxyGuard::new(child)); - gvproxy_api_sock = Some(api_sock); - } + ( + Some(GvproxyGuard::new(child)), + Some(api_sock), + gvproxy_ports.forwarded_ports, + ) + }; - if !config.port_map.is_empty() && gvproxy_api_sock.is_none() { - vm.set_port_map(&config.port_map)?; - } vm.set_console_output(&config.console_output)?; let env = if config.env.is_empty() { @@ -399,15 +403,6 @@ impl VmContext { ) } - fn set_port_map(&self, port_map: &[String]) -> Result<(), String> { - let port_strs: Vec<&str> = port_map.iter().map(String::as_str).collect(); - let (_owners, ptrs) = c_string_array(&port_strs)?; - check( - unsafe { (self.krun.krun_set_port_map)(self.ctx_id, ptrs.as_ptr()) }, - "krun_set_port_map", - ) - } - fn set_console_output(&self, path: &Path) -> Result<(), String> { let console_c = path_to_cstring(path)?; check( From 593a86cfec415cdede2cfd6f49c07b63a7566ab7 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Mon, 20 Apr 2026 18:46:38 -0700 Subject: [PATCH 2/5] e2e(vm): run smoke against openshell-gateway with the VM compute driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrite e2e/rust/e2e-vm.sh for the split-binary flow (openshell-gateway + openshell-driver-vm) now that the former openshell-vm K8s-in-a-VM binary is gone. 
The new flow: 1. Stage the embedded VM runtime (libkrun + gvproxy + base rootfs) via mise run vm:setup and mise run vm:rootfs -- --base, both idempotent and run only when artifacts are missing. 2. Build openshell-gateway, openshell-driver-vm, and the openshell CLI from the current workspace with cargo. 3. On macOS, codesign the driver with the Hypervisor.framework entitlement so libkrun can start the microVM. 4. Start the gateway with --drivers vm --disable-tls --disable-gateway-auth --db-url sqlite::memory:, pinning --driver-dir target/debug so the gateway picks up the freshly built driver rather than ~/.local/libexec/openshell from a prior install-vm.sh run. 5. Wait for 'Server listening', run the cluster-agnostic Rust smoke test against OPENSHELL_GATEWAY_ENDPOINT=http://127.0.0.1:, then SIGTERM the gateway. State paths root under /tmp rather than target/ because the VM driver's compute-driver.sock lives under --vm-driver-state-dir; with AF_UNIX SUN_LEN = 104 bytes on macOS (108 on Linux), worktree paths under target/ routinely blow the limit. On failure, the trap preserves the per-run state dir plus dumps the gateway log and every sandbox's rootfs-console.log inline so CI artifacts capture post-mortem data. Drop the former --vm-port / --vm-name reuse path entirely — the new gateway is cheap to start (a few seconds, no k3s bootstrap) and that reuse flow mapped to openshell-vm's StatefulSet rollout, which no longer exists. Drop the build:docker:gateway and vm:build task dependencies from tasks/test.toml's e2e:vm for the same reason. --- e2e/rust/e2e-vm.sh | 420 ++++++++++++++++++++++----------------------- tasks/test.toml | 3 +- 2 files changed, 202 insertions(+), 221 deletions(-) diff --git a/e2e/rust/e2e-vm.sh b/e2e/rust/e2e-vm.sh index 5fd055036..5990d8db6 100755 --- a/e2e/rust/e2e-vm.sh +++ b/e2e/rust/e2e-vm.sh @@ -2,245 +2,227 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -# Run the Rust e2e smoke test against an openshell-vm gateway. +# Run the Rust e2e smoke test against an openshell-gateway running the +# standalone VM compute driver (`openshell-driver-vm`). # -# Usage: -# mise run e2e:vm # start new named VM on random port -# mise run e2e:vm -- --vm-port=30051 # reuse existing VM on port 30051 -# mise run e2e:vm -- --vm-port=30051 --vm-name=my-vm # reuse existing named VM and run exec check -# -# Options: -# --vm-port=PORT Skip VM startup and test against this port. -# --vm-name=NAME VM instance name. Auto-generated for fresh VMs. +# Architecture (post supervisor-initiated relay, PR #867): +# * The gateway never dials the sandbox. Instead, the in-guest +# supervisor opens an outbound `ConnectSupervisor` gRPC stream to +# the gateway on startup and keeps it alive for the sandbox +# lifetime. SSH (`/connect/ssh`) and `ExecSandbox` traffic ride the +# same TCP+TLS+HTTP/2 connection as multiplexed HTTP/2 streams. +# * There is no host-side SSH port forward. gvproxy still provides +# guest egress so the supervisor can reach the gateway, but it no +# longer forwards any TCP port back to the guest. +# * Readiness is authoritative on the gateway: a sandbox's phase +# flips to `Ready` the moment `ConnectSupervisor` registers, and +# back to `Provisioning` when the session drops. The VM driver +# only reports `Error` conditions for dead launcher processes. # -# When --vm-port is omitted: -# 1. Picks a random free host port -# 2. Starts the VM with --name --port :30051 -# 3. Waits for the VM to fully bootstrap (mTLS certs + gRPC health) -# 4. Verifies `openshell-vm exec` works -# 5. Runs the Rust smoke test -# 6. Tears down the VM +# Usage: +# mise run e2e:vm # -# When --vm-port is given the script assumes the VM is already running -# on that port and runs the smoke test. The VM exec check runs only when -# --vm-name is provided (so the script can target the correct instance). 
+# What the script does: +# 1. Ensures the VM runtime (libkrun + gvproxy + rootfs) is staged. +# 2. Builds `openshell-gateway`, `openshell-driver-vm`, and the +# `openshell` CLI with the embedded runtime. +# 3. On macOS, codesigns the VM driver (libkrun needs the +# `com.apple.security.hypervisor` entitlement). +# 4. Starts the gateway with `--drivers vm --disable-tls +# --disable-gateway-auth --db-url sqlite::memory:` on a random +# free port, waits for `Server listening`, then runs the +# cluster-agnostic Rust smoke test. +# 5. Tears the gateway down and (on failure) preserves the gateway +# log and every VM serial console log for post-mortem. # -# Prerequisites (when starting a new VM): `mise run vm:build` must already -# be done (the e2e:vm mise task handles this via depends). +# Prerequisites (handled automatically by this script if missing): +# - `mise run vm:setup` — downloads / builds the libkrun runtime. +# - `mise run vm:rootfs -- --base` — builds the sandbox rootfs tarball. set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd)" -RUNTIME_DIR="${ROOT}/target/debug/openshell-vm.runtime" -GATEWAY_BIN="${ROOT}/target/debug/openshell-vm" -VM_GATEWAY_IMAGE="${IMAGE_REPO_BASE:-openshell}/gateway:${IMAGE_TAG:-dev}" -VM_GATEWAY_TAR_REL="var/lib/rancher/k3s/agent/images/openshell-server.tar.zst" -GUEST_PORT=30051 -TIMEOUT=180 - -named_vm_rootfs() { - local vm_version - - vm_version=$("${GATEWAY_BIN}" --version | awk '{print $2}') - printf '%s\n' "${XDG_DATA_HOME:-${HOME}/.local/share}/openshell/openshell-vm/${vm_version}/instances/${VM_NAME}/rootfs" -} - -vm_exec() { - local rootfs_args=() - if [ -n "${VM_ROOTFS_DIR:-}" ]; then - rootfs_args=(--rootfs "${VM_ROOTFS_DIR}") - fi - "${GATEWAY_BIN}" "${rootfs_args[@]}" --name "${VM_NAME}" exec -- "$@" -} +COMPRESSED_DIR="${ROOT}/target/vm-runtime-compressed" +GATEWAY_BIN="${ROOT}/target/debug/openshell-gateway" +DRIVER_BIN="${ROOT}/target/debug/openshell-driver-vm" + +# The VM driver places `compute-driver.sock` under --vm-driver-state-dir. +# AF_UNIX SUN_LEN is 104 bytes on macOS (108 on Linux), so paths anchored +# in the workspace's `target/` blow the limit on typical developer +# machines — e.g. a ~100-char `~/.superset/worktrees/.../target/...` +# prefix plus the `compute-driver.sock` leaf leaves no room. macOS' +# per-user `$TMPDIR` (`/var/folders/xx/.../T/`) can be 50+ chars too, +# so root state under `/tmp` unconditionally to keep UDS paths short. +STATE_DIR_ROOT="/tmp" + +# Smoke test timeouts. First boot extracts the embedded libkrun runtime +# (~60–90MB of zstd per architecture) and the sandbox rootfs (~200MB). +# The guest then runs k3s-free sandbox supervisor startup; a cold +# microVM is typically ready within ~15s. +GATEWAY_READY_TIMEOUT=60 +SANDBOX_PROVISION_TIMEOUT=180 + +# ── Build prerequisites ────────────────────────────────────────────── + +if [ ! 
-f "${COMPRESSED_DIR}/rootfs.tar.zst" ]; then + echo "==> Building base VM rootfs tarball (mise run vm:rootfs -- --base)" + mise run vm:rootfs -- --base +fi -prepare_named_vm_rootfs() { - if [ -z "${VM_NAME}" ]; then - return 0 - fi +if [ ! -f "${COMPRESSED_DIR}/rootfs.tar.zst" ] \ + || ! find "${COMPRESSED_DIR}" -maxdepth 1 -name 'libkrun*.zst' | grep -q .; then + echo "==> Preparing embedded VM runtime (mise run vm:setup)" + mise run vm:setup +fi - echo "Preparing named VM rootfs '${VM_NAME}'..." - VM_ROOTFS_DIR="$("${ROOT}/tasks/scripts/vm/ensure-vm-rootfs.sh" --name "${VM_NAME}" \ - | tail -n 1 | sed 's/^using openshell-vm rootfs at //')" - "${ROOT}/tasks/scripts/vm/sync-vm-rootfs.sh" --name "${VM_NAME}" -} +export OPENSHELL_VM_RUNTIME_COMPRESSED_DIR="${OPENSHELL_VM_RUNTIME_COMPRESSED_DIR:-${COMPRESSED_DIR}}" + +echo "==> Building openshell-gateway, openshell-driver-vm, openshell (CLI)" +cargo build \ + -p openshell-server \ + -p openshell-driver-vm \ + -p openshell-cli \ + --features openshell-core/dev-settings + +if [ "$(uname -s)" = "Darwin" ]; then + echo "==> Codesigning openshell-driver-vm (Hypervisor entitlement)" + codesign \ + --entitlements "${ROOT}/crates/openshell-driver-vm/entitlements.plist" \ + --force \ + -s - \ + "${DRIVER_BIN}" +fi -refresh_vm_gateway() { - if [ -z "${VM_NAME}" ]; then - return 0 +# ── Pick a random free host port for the gateway ───────────────────── + +HOST_PORT="$(python3 -c 'import socket +s = socket.socket() +s.bind(("", 0)) +print(s.getsockname()[1]) +s.close()')" + +# Per-run state dir so concurrent e2e runs don't collide on the UDS or +# sandbox state. The VM driver creates `/compute-driver.sock` +# and `/sandboxes//rootfs/` under here. Keep the +# basename short — see the SUN_LEN comment above. 
+RUN_STATE_DIR="${STATE_DIR_ROOT}/os-vm-e2e-${HOST_PORT}-$$" +mkdir -p "${RUN_STATE_DIR}" + +GATEWAY_LOG="$(mktemp /tmp/openshell-gateway-e2e.XXXXXX)" + +# ── Cleanup (trap) ─────────────────────────────────────────────────── + +cleanup() { + local exit_code=$? + + if [ -n "${GATEWAY_PID:-}" ] && kill -0 "${GATEWAY_PID}" 2>/dev/null; then + echo "Stopping openshell-gateway (pid ${GATEWAY_PID})..." + # SIGTERM first; gateway drops ManagedDriverProcess which SIGKILLs + # the driver and removes the UDS. Wait briefly, then force-kill. + kill -TERM "${GATEWAY_PID}" 2>/dev/null || true + for _ in 1 2 3 4 5 6 7 8 9 10; do + kill -0 "${GATEWAY_PID}" 2>/dev/null || break + sleep 0.5 + done + kill -KILL "${GATEWAY_PID}" 2>/dev/null || true + wait "${GATEWAY_PID}" 2>/dev/null || true fi - echo "Refreshing VM gateway StatefulSet image to ${VM_GATEWAY_IMAGE}..." - # Re-import the host-synced :dev image into the VM's containerd, then - # force a rollout when the StatefulSet already points at the same tag. - vm_exec sh -lc "set -eu; \ - image_tar='/${VM_GATEWAY_TAR_REL}'; \ - k3s ctr -n k8s.io images import \"\${image_tar}\" >/dev/null; \ - current_image=\$(kubectl -n openshell get statefulset/openshell -o jsonpath='{.spec.template.spec.containers[?(@.name==\"openshell\")].image}'); \ - if [ \"\${current_image}\" = \"${VM_GATEWAY_IMAGE}\" ]; then \ - kubectl -n openshell rollout restart statefulset/openshell >/dev/null; \ - else \ - kubectl -n openshell set image statefulset/openshell openshell=${VM_GATEWAY_IMAGE} >/dev/null; \ - fi; \ - kubectl -n openshell rollout status statefulset/openshell --timeout=300s" - echo "Gateway rollout complete." -} - -wait_for_gateway_health() { - local elapsed=0 timeout=60 consecutive_ok=0 - - echo "Waiting for refreshed gateway health..." 
- while [ "${elapsed}" -lt "${timeout}" ]; do - if "${ROOT}/target/debug/openshell" status >/dev/null 2>&1; then - consecutive_ok=$((consecutive_ok + 1)) - if [ "${consecutive_ok}" -ge 3 ]; then - echo "Gateway health confirmed after refresh." - return 0 - fi - else - consecutive_ok=0 - fi - - sleep 2 - elapsed=$((elapsed + 2)) - done - - echo "ERROR: refreshed gateway did not become healthy after ${timeout}s" - return 1 -} - -# ── Parse arguments ────────────────────────────────────────────────── -VM_PORT="" -VM_NAME="" -VM_ROOTFS_DIR="" -for arg in "$@"; do - case "$arg" in - --vm-port=*) VM_PORT="${arg#--vm-port=}" ;; - --vm-name=*) VM_NAME="${arg#--vm-name=}" ;; - *) echo "Unknown argument: $arg"; exit 1 ;; - esac -done + # On failure, keep the VM console log for debugging. We deliberately + # print it instead of leaving it on disk because the state dir gets + # wiped on success. + if [ "${exit_code}" -ne 0 ]; then + echo "=== gateway log (preserved for debugging) ===" + cat "${GATEWAY_LOG}" 2>/dev/null || true + echo "=== end gateway log ===" + + local console + while IFS= read -r -d '' console; do + echo "=== VM console log: ${console} ===" + cat "${console}" 2>/dev/null || true + echo "=== end VM console log ===" + done < <(find "${RUN_STATE_DIR}/sandboxes" -name 'rootfs-console.log' -print0 2>/dev/null) + fi -# ── Determine mode ─────────────────────────────────────────────────── -if [ -n "${VM_PORT}" ]; then - # Point at an already-running VM. - HOST_PORT="${VM_PORT}" - echo "Using existing VM on port ${HOST_PORT}." - if [ -n "${VM_NAME}" ]; then - prepare_named_vm_rootfs + rm -f "${GATEWAY_LOG}" 2>/dev/null || true + # Only wipe the per-run state dir on success. On failure, leave it for + # post-mortem (serial console logs, gvproxy logs, rootfs dumps). + if [ "${exit_code}" -eq 0 ]; then + rm -rf "${RUN_STATE_DIR}" 2>/dev/null || true + else + echo "NOTE: preserving ${RUN_STATE_DIR} for debugging" fi -else - # Pick a random free port and start a new VM. 
- HOST_PORT=$(python3 -c 'import socket; s=socket.socket(); s.bind(("",0)); print(s.getsockname()[1]); s.close()') - if [ -z "${VM_NAME}" ]; then - VM_NAME="e2e-${HOST_PORT}-$$" +} +trap cleanup EXIT + +# ── Launch the gateway + VM driver ─────────────────────────────────── + +SSH_HANDSHAKE_SECRET="$(openssl rand -hex 32)" + +echo "==> Starting openshell-gateway on 127.0.0.1:${HOST_PORT} (state: ${RUN_STATE_DIR})" + +# Pin --driver-dir to the workspace `target/debug/` so we always pick up +# the driver we just cargo-built. Without this, the gateway's +# `resolve_compute_driver_bin` fallback prefers +# `~/.local/libexec/openshell/openshell-driver-vm` when present +# (install-vm.sh installs there), which silently shadows development +# builds — a subtle source of stale-binary bugs in e2e runs. +"${GATEWAY_BIN}" \ + --drivers vm \ + --disable-tls \ + --disable-gateway-auth \ + --db-url 'sqlite::memory:' \ + --port "${HOST_PORT}" \ + --grpc-endpoint "http://127.0.0.1:${HOST_PORT}" \ + --ssh-handshake-secret "${SSH_HANDSHAKE_SECRET}" \ + --driver-dir "${ROOT}/target/debug" \ + --vm-driver-state-dir "${RUN_STATE_DIR}" \ + >"${GATEWAY_LOG}" 2>&1 & +GATEWAY_PID=$! + +# ── Wait for gateway readiness ─────────────────────────────────────── +# +# The gateway logs `INFO openshell_server: Server listening +# address=0.0.0.0:` after its tonic listener is up. That is the +# only signal the smoke test needs — the VM driver is spawned eagerly +# but sandboxes are created on demand, so "Server listening" is the +# right gate here. + +echo "==> Waiting for gateway readiness (timeout ${GATEWAY_READY_TIMEOUT}s)" +elapsed=0 +while ! grep -q 'Server listening' "${GATEWAY_LOG}" 2>/dev/null; do + if ! kill -0 "${GATEWAY_PID}" 2>/dev/null; then + echo "ERROR: openshell-gateway exited before becoming ready" + exit 1 fi - - cleanup() { - local exit_code=$? - if [ -n "${VM_PID:-}" ] && kill -0 "$VM_PID" 2>/dev/null; then - echo "Stopping openshell-vm (pid ${VM_PID})..." 
- kill "$VM_PID" 2>/dev/null || true - wait "$VM_PID" 2>/dev/null || true - fi - # On failure, preserve the VM console log for post-mortem debugging. - if [ "$exit_code" -ne 0 ] && [ -n "${VM_NAME:-}" ]; then - local console_log - console_log="$(named_vm_rootfs)-console.log" - if [ -f "$console_log" ]; then - echo "=== VM console log (preserved for debugging) ===" - cat "$console_log" - echo "=== end VM console log ===" - fi - fi - rm -f "${VM_LOG:-}" 2>/dev/null || true - if [ -n "${VM_NAME:-}" ]; then - rm -rf "$(dirname "$(named_vm_rootfs)")" 2>/dev/null || true - fi - } - trap cleanup EXIT - - prepare_named_vm_rootfs - - echo "Starting openshell-vm '${VM_NAME}' on port ${HOST_PORT}..." - if [ "$(uname -s)" = "Darwin" ]; then - export DYLD_FALLBACK_LIBRARY_PATH="${RUNTIME_DIR}${DYLD_FALLBACK_LIBRARY_PATH:+:${DYLD_FALLBACK_LIBRARY_PATH}}" + if [ "${elapsed}" -ge "${GATEWAY_READY_TIMEOUT}" ]; then + echo "ERROR: openshell-gateway did not become ready after ${GATEWAY_READY_TIMEOUT}s" + exit 1 fi + sleep 1 + elapsed=$((elapsed + 1)) +done - VM_LOG=$(mktemp /tmp/openshell-vm-e2e.XXXXXX) - rootfs_args=() - if [ -n "${VM_ROOTFS_DIR}" ]; then - rootfs_args=(--rootfs "${VM_ROOTFS_DIR}") - fi - "${GATEWAY_BIN}" "${rootfs_args[@]}" --name "${VM_NAME}" --port "${HOST_PORT}:${GUEST_PORT}" 2>"${VM_LOG}" & - VM_PID=$! - - # ── Wait for full bootstrap (mTLS certs + gRPC health) ───────────── - # The VM prints "Ready [Xs total]" to stderr after bootstrap_gateway() - # stores mTLS certs and wait_for_gateway_ready() confirms the gRPC - # service is responding. Waiting only for TCP port reachability (nc -z) - # is insufficient because port forwarding is established before the - # mTLS certs are written, causing `openshell status` to fail. - echo "Waiting for VM bootstrap to complete (timeout ${TIMEOUT}s)..." - elapsed=0 - while ! grep -q "^Ready " "${VM_LOG}" 2>/dev/null; do - if ! 
kill -0 "$VM_PID" 2>/dev/null; then - echo "ERROR: openshell-vm exited before becoming ready" - echo "VM log:" - cat "${VM_LOG}" - exit 1 - fi - if [ "$elapsed" -ge "$TIMEOUT" ]; then - echo "ERROR: openshell-vm did not become ready after ${TIMEOUT}s" - echo "VM log:" - cat "${VM_LOG}" - exit 1 - fi - sleep 2 - elapsed=$((elapsed + 2)) - done - echo "Gateway is ready (${elapsed}s)." - echo "VM log:" - cat "${VM_LOG}" -fi +echo "==> Gateway ready after ${elapsed}s" -# ── Exec into the VM (when instance name is known) ─────────────────── -if [ -n "${VM_NAME}" ]; then - echo "Verifying openshell-vm exec for '${VM_NAME}'..." - exec_elapsed=0 - exec_timeout=60 - until vm_exec /bin/true; do - if [ "$exec_elapsed" -ge "$exec_timeout" ]; then - echo "ERROR: openshell-vm exec did not become ready after ${exec_timeout}s" - exit 1 - fi - sleep 2 - exec_elapsed=$((exec_elapsed + 2)) - done - echo "VM exec succeeded." -else - echo "Skipping openshell-vm exec check (provide --vm-name for existing VMs)." -fi +# ── Run the smoke test ─────────────────────────────────────────────── +# +# The CLI takes OPENSHELL_GATEWAY_ENDPOINT directly; no gateway +# metadata lookup needed when TLS is disabled. -refresh_vm_gateway +export OPENSHELL_GATEWAY_ENDPOINT="http://127.0.0.1:${HOST_PORT}" -# ── Run the smoke test ─────────────────────────────────────────────── -# The openshell CLI reads OPENSHELL_GATEWAY_ENDPOINT to connect to the -# gateway directly, and OPENSHELL_GATEWAY to resolve mTLS certs from -# ~/.config/openshell/gateways//mtls/. -# In the VM, the overlayfs snapshotter re-extracts all image layers on -# every boot. The 1GB sandbox base image extraction can take >300s -# under contention, so allow 600s for sandbox provisioning. 
-export OPENSHELL_PROVISION_TIMEOUT=600 -export OPENSHELL_GATEWAY_ENDPOINT="https://127.0.0.1:${HOST_PORT}" -if [ -n "${VM_NAME}" ]; then - export OPENSHELL_GATEWAY="openshell-vm-${VM_NAME}" -else - export OPENSHELL_GATEWAY="openshell-vm" -fi +# The VM driver creates each sandbox VM from scratch — the embedded +# rootfs is extracted per sandbox, and the guest's sandbox supervisor +# then initializes policy, netns, Landlock, and sshd. On a cold host +# this is ~15s; allow 180s for slower CI runners. +export OPENSHELL_PROVISION_TIMEOUT="${SANDBOX_PROVISION_TIMEOUT}" -echo "Running e2e smoke test (gateway: ${OPENSHELL_GATEWAY}, endpoint: ${OPENSHELL_GATEWAY_ENDPOINT})..." -cargo build -p openshell-cli --features openshell-core/dev-settings -wait_for_gateway_health -cargo test --manifest-path e2e/rust/Cargo.toml --features e2e --test smoke -- --nocapture +echo "==> Running e2e smoke test (endpoint: ${OPENSHELL_GATEWAY_ENDPOINT})" +cargo test \ + --manifest-path "${ROOT}/e2e/rust/Cargo.toml" \ + --features e2e \ + --test smoke \ + -- --nocapture -echo "Smoke test passed." +echo "==> Smoke test passed." 
diff --git a/tasks/test.toml b/tasks/test.toml index f24ea6f2b..cf45d2b6b 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -49,6 +49,5 @@ env = { UV_NO_SYNC = "1", PYTHONPATH = "python" } run = "uv run pytest -o python_files='test_*.py' -m gpu -n ${E2E_PARALLEL:-1} e2e/python" ["e2e:vm"] -description = "Boot openshell-vm and run smoke e2e (macOS ARM64; pass -- --vm-port=N [--vm-name=NAME] to reuse)" -depends = ["build:docker:gateway", "vm:build"] +description = "Start openshell-gateway with the VM compute driver and run the cluster-agnostic smoke e2e" run = "e2e/rust/e2e-vm.sh" From 3af6d1a7263daf270a17ac67bcf8f1ef12012f7d Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Mon, 20 Apr 2026 22:40:50 -0700 Subject: [PATCH 3/5] refactor(driver-vm): drop port-forwarding plumbing from gvproxy setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the SSH port forward removed in #867 and no other host→guest port mappings in play, everything that configured gvproxy's port-forwarder is dead weight. gvproxy stays because the VM still needs its virtual NIC, DHCP server, and default router for guest egress, and because the sandbox supervisor's per-sandbox netns (veth + iptables, see openshell-sandbox/src/sandbox/linux/netns.rs) needs a real kernel network stack inside the guest to branch off of — libkrun's built-in TSI socket impersonation would not satisfy those primitives. What we stop doing: * Dropping the `-listen` API socket. No one calls `/services/forwarder/expose` on it any more. * Passing `-ssh-port -1`. gvproxy's default 2222 SSH forward binds a host-side TCP listener that would race concurrent sandboxes and surface a misleading 'sshd is reachable' endpoint. `-1` is gvproxy's documented switch for 'no SSH forward'; see getForwardsMap in containers/gvisor-tap-vsock cmd/gvproxy/main.go. * Removing VmLaunchConfig::port_map and the CLI --vm-port flag. * Removing krun_set_port_map from the libkrun FFI bindings. 
* Removing helpers that only made sense when we had a port map to manage: plan_gvproxy_ports, parse_port_mapping, expose_port_map, gvproxy_expose, pick_gvproxy_ssh_port, kill_stale_gvproxy_by_port, kill_stale_gvproxy_by_port_map, kill_gvproxy_pid, is_process_named, and the GUEST_SSH_PORT constant. * Removing the four port-mapping unit tests. Verified: after `sandbox create -- echo hi`, `lsof` shows gvproxy opens zero TCP listeners; only its qemu/vfkit unixgram data socket remains. E2E smoke still passes in ~10s. --- crates/openshell-driver-vm/src/ffi.rs | 8 - crates/openshell-driver-vm/src/lib.rs | 2 - crates/openshell-driver-vm/src/main.rs | 4 - crates/openshell-driver-vm/src/runtime.rs | 337 +++------------------- 4 files changed, 42 insertions(+), 309 deletions(-) diff --git a/crates/openshell-driver-vm/src/ffi.rs b/crates/openshell-driver-vm/src/ffi.rs index 0391e048c..a81b150af 100644 --- a/crates/openshell-driver-vm/src/ffi.rs +++ b/crates/openshell-driver-vm/src/ffi.rs @@ -37,7 +37,6 @@ type KrunSetExec = unsafe extern "C" fn( argv: *const *const c_char, envp: *const *const c_char, ) -> i32; -type KrunSetPortMap = unsafe extern "C" fn(ctx_id: u32, port_map: *const *const c_char) -> i32; type KrunSetConsoleOutput = unsafe extern "C" fn(ctx_id: u32, filepath: *const c_char) -> i32; type KrunStartEnter = unsafe extern "C" fn(ctx_id: u32) -> i32; type KrunDisableImplicitVsock = unsafe extern "C" fn(ctx_id: u32) -> i32; @@ -68,12 +67,6 @@ pub struct LibKrun { pub krun_set_root: KrunSetRoot, pub krun_set_workdir: KrunSetWorkdir, pub krun_set_exec: KrunSetExec, - /// Kept loaded for future use (e.g. exposing sandbox-requested ports - /// from the guest via libkrun's built-in port mapper instead of - /// gvproxy). Currently unused since all port forwarding is done by - /// gvproxy over its API socket. 
- #[allow(dead_code)] - pub krun_set_port_map: KrunSetPortMap, pub krun_set_console_output: KrunSetConsoleOutput, pub krun_start_enter: KrunStartEnter, pub krun_disable_implicit_vsock: KrunDisableImplicitVsock, @@ -126,7 +119,6 @@ impl LibKrun { krun_set_root: load_symbol(library, b"krun_set_root\0", &libkrun_path)?, krun_set_workdir: load_symbol(library, b"krun_set_workdir\0", &libkrun_path)?, krun_set_exec: load_symbol(library, b"krun_set_exec\0", &libkrun_path)?, - krun_set_port_map: load_symbol(library, b"krun_set_port_map\0", &libkrun_path)?, krun_set_console_output: load_symbol( library, b"krun_set_console_output\0", diff --git a/crates/openshell-driver-vm/src/lib.rs b/crates/openshell-driver-vm/src/lib.rs index 1c424deeb..c57bc66cd 100644 --- a/crates/openshell-driver-vm/src/lib.rs +++ b/crates/openshell-driver-vm/src/lib.rs @@ -7,7 +7,5 @@ mod ffi; mod rootfs; mod runtime; -pub const GUEST_SSH_PORT: u16 = 2222; - pub use driver::{VmDriver, VmDriverConfig}; pub use runtime::{VM_RUNTIME_DIR_ENV, VmLaunchConfig, configured_runtime_dir, run_vm}; diff --git a/crates/openshell-driver-vm/src/main.rs b/crates/openshell-driver-vm/src/main.rs index 3a7976273..35e6b618d 100644 --- a/crates/openshell-driver-vm/src/main.rs +++ b/crates/openshell-driver-vm/src/main.rs @@ -34,9 +34,6 @@ struct Args { #[arg(long, hide = true)] vm_env: Vec, - #[arg(long, hide = true)] - vm_port: Vec, - #[arg(long, hide = true)] vm_console_output: Option, @@ -183,7 +180,6 @@ fn build_vm_launch_config(args: &Args) -> std::result::Result, pub env: Vec, pub workdir: String, - pub port_map: Vec, pub log_level: u32, pub console_output: PathBuf, } -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -struct PortMapping { - host_port: u16, - guest_port: u16, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -struct GvproxyPortPlan { - ssh_port: u16, - forwarded_ports: Vec, -} - pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { if !config.rootfs.is_dir() { return Err(format!( @@ -64,15 +49,37 @@ 
pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { vm.set_root(&config.rootfs)?; vm.set_workdir(&config.workdir)?; - // The guest supervisor opens an outbound `ConnectSupervisor` gRPC - // stream to the host gateway on startup and keeps it alive for the - // sandbox lifetime. Without gvproxy wired up, the VM has no eth0, - // loses DHCP, and cannot reach the gateway — so we always start - // gvproxy even when the caller doesn't request any explicit port - // forwards. (Prior to the supervisor-initiated relay migration the - // driver forwarded a host-side SSH port and gvproxy ran incidentally - // as a byproduct; that implicit setup is gone now.) - let (gvproxy_guard, gvproxy_api_sock, forwarded_port_map) = { + // Run gvproxy strictly as the guest's virtual NIC / DHCP / router. + // + // After the supervisor-initiated relay migration (#867), the driver + // no longer forwards any host-side ports into the guest — all ingress + // traffic for SSH and exec rides the outbound `ConnectSupervisor` + // gRPC stream the guest opens to the gateway. What gvproxy still + // provides here is the TCP/IP *plane* the guest kernel needs: + // + // * a virtio-net backend attached to libkrun via a Unix + // SOCK_STREAM (Linux) or SOCK_DGRAM (macOS vfkit), which + // surfaces as `eth0` inside the guest; + // * the DHCP server + default router the guest's udhcpc client + // talks to on boot (IPs 192.168.127.1 / .2, defaults for + // gvisor-tap-vsock); + // * name resolution for `host.containers.internal` / + // `host.docker.internal`, which is how the guest's + // `rewrite_openshell_endpoint_if_needed` probe reaches the host + // gateway when the bare loopback address doesn't resolve from + // inside the VM. 
+ // + // That network plane is also what the sandbox supervisor's + // per-sandbox netns (veth pair + iptables, see + // `openshell-sandbox/src/sandbox/linux/netns.rs`) branches off of; + // libkrun's built-in TSI socket impersonation would not satisfy + // those kernel-level primitives. + // + // The `-listen` API socket and `-ssh-port` forwarder are both + // deliberately omitted: nothing in the driver enqueues port + // forwards on the API any more, and the host-side SSH listener is + // dead plumbing. + let gvproxy_guard = { let gvproxy_binary = runtime_dir.join("gvproxy"); if !gvproxy_binary.is_file() { return Err(format!( @@ -81,13 +88,9 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { )); } - kill_stale_gvproxy_by_port_map(&config.port_map); - let sock_base = gvproxy_socket_base(&config.rootfs)?; let net_sock = sock_base.with_extension("v"); - let api_sock = sock_base.with_extension("a"); let _ = std::fs::remove_file(&net_sock); - let _ = std::fs::remove_file(&api_sock); let _ = std::fs::remove_file(sock_base.with_extension("v-krun.sock")); let run_dir = config.rootfs.parent().unwrap_or(&config.rootfs); @@ -95,8 +98,6 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { let gvproxy_log_file = std::fs::File::create(&gvproxy_log) .map_err(|e| format!("create gvproxy log {}: {e}", gvproxy_log.display()))?; - let gvproxy_ports = plan_gvproxy_ports(&config.port_map)?; - #[cfg(target_os = "linux")] let (gvproxy_net_flag, gvproxy_net_url) = ("-listen-qemu", format!("unix://{}", net_sock.display())); @@ -106,13 +107,19 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { format!("unixgram://{}", net_sock.display()), ); + // `-ssh-port -1` tells gvproxy to skip its default SSH forward + // (127.0.0.1:2222 → guest:22). 
We don't use it — all gateway + // ingress rides the supervisor-initiated relay — and leaving + // the default on would bind a host-side TCP listener per + // sandbox, racing concurrent sandboxes for port 2222 and + // surfacing a misleading "sshd is reachable" endpoint. See + // https://github.com/containers/gvisor-tap-vsock `cmd/gvproxy/main.go` + // (`getForwardsMap` returns an empty map when `sshPort == -1`). let child = StdCommand::new(&gvproxy_binary) .arg(gvproxy_net_flag) .arg(&gvproxy_net_url) - .arg("-listen") - .arg(format!("unix://{}", api_sock.display())) .arg("-ssh-port") - .arg(gvproxy_ports.ssh_port.to_string()) + .arg("-1") .stdin(Stdio::null()) .stdout(Stdio::null()) .stderr(gvproxy_log_file) @@ -146,11 +153,7 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { vm.add_net_unixgram(&net_sock, &mac, COMPAT_NET_FEATURES, NET_FLAG_VFKIT)?; } - ( - Some(GvproxyGuard::new(child)), - Some(api_sock), - gvproxy_ports.forwarded_ports, - ) + Some(GvproxyGuard::new(child)) }; vm.set_console_output(&config.console_output)?; @@ -177,21 +180,6 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { _ => { install_signal_forwarding(pid); - let port_forward_result = if let Some(api_sock) = gvproxy_api_sock.as_ref() { - expose_port_map(api_sock, &forwarded_port_map) - } else { - Ok(()) - }; - - if let Err(err) = port_forward_result { - unsafe { - libc::kill(pid, libc::SIGTERM); - } - let _ = wait_for_child(pid); - cleanup_gvproxy(gvproxy_guard); - return Err(err); - } - let status = wait_for_child(pid)?; CHILD_PID.store(0, Ordering::Relaxed); cleanup_gvproxy(gvproxy_guard); @@ -471,126 +459,6 @@ impl Drop for GvproxyGuard { } } -fn expose_port_map(api_sock: &Path, port_map: &[String]) -> Result<(), String> { - wait_for_path(api_sock, Duration::from_secs(2), "gvproxy API socket")?; - let guest_ip = "192.168.127.2"; - - for pm in port_map { - let mapping = parse_port_mapping(pm)?; - - let expose_body = format!( - 
r#"{{"local":":{}","remote":"{guest_ip}:{}","protocol":"tcp"}}"#, - mapping.host_port, mapping.guest_port - ); - - let deadline = Instant::now() + Duration::from_secs(10); - let mut retry_interval = Duration::from_millis(100); - loop { - match gvproxy_expose(api_sock, &expose_body) { - Ok(()) => break, - Err(err) if Instant::now() < deadline => { - std::thread::sleep(retry_interval); - retry_interval = (retry_interval * 2).min(Duration::from_secs(1)); - if retry_interval == Duration::from_secs(1) { - eprintln!("retrying gvproxy port expose {pm}: {err}"); - } - } - Err(err) => { - return Err(format!( - "failed to forward port {} via gvproxy: {err}", - mapping.host_port - )); - } - } - } - } - - Ok(()) -} - -fn gvproxy_expose(api_sock: &Path, body: &str) -> Result<(), String> { - let mut stream = - UnixStream::connect(api_sock).map_err(|e| format!("connect to gvproxy API socket: {e}"))?; - - let request = format!( - "POST /services/forwarder/expose HTTP/1.1\r\n\ - Host: localhost\r\n\ - Content-Type: application/json\r\n\ - Content-Length: {}\r\n\ - Connection: close\r\n\ - \r\n\ - {}", - body.len(), - body, - ); - - stream - .write_all(request.as_bytes()) - .map_err(|e| format!("write to gvproxy API: {e}"))?; - - let mut buf = [0u8; 1024]; - let n = stream - .read(&mut buf) - .map_err(|e| format!("read from gvproxy API: {e}"))?; - let response = String::from_utf8_lossy(&buf[..n]); - let status = response - .lines() - .next() - .and_then(|line| line.split_whitespace().nth(1)) - .unwrap_or("0"); - - match status { - "200" | "204" => Ok(()), - _ => Err(format!( - "gvproxy API: {}", - response.lines().next().unwrap_or("") - )), - } -} - -fn plan_gvproxy_ports(port_map: &[String]) -> Result { - let mut ssh_port = None; - let mut forwarded_ports = Vec::with_capacity(port_map.len()); - - for pm in port_map { - let mapping = parse_port_mapping(pm)?; - if ssh_port.is_none() && mapping.guest_port == GUEST_SSH_PORT && mapping.host_port >= 1024 { - ssh_port = 
Some(mapping.host_port); - continue; - } - forwarded_ports.push(pm.clone()); - } - - Ok(GvproxyPortPlan { - ssh_port: match ssh_port { - Some(port) => port, - None => pick_gvproxy_ssh_port()?, - }, - forwarded_ports, - }) -} - -fn parse_port_mapping(pm: &str) -> Result { - let parts: Vec<&str> = pm.split(':').collect(); - let (host, guest) = match parts.as_slice() { - [host, guest] => (*host, *guest), - [port] => (*port, *port), - _ => return Err(format!("invalid port mapping '{pm}'")), - }; - - let host_port = host - .parse::() - .map_err(|_| format!("invalid port mapping '{pm}'"))?; - let guest_port = guest - .parse::() - .map_err(|_| format!("invalid port mapping '{pm}'"))?; - - Ok(PortMapping { - host_port, - guest_port, - }) -} - fn wait_for_path(path: &Path, timeout: Duration, label: &str) -> Result<(), String> { let deadline = Instant::now() + timeout; let mut interval = Duration::from_millis(5); @@ -669,92 +537,6 @@ fn gvproxy_socket_base(rootfs: &Path) -> Result { Ok(secure_socket_base("osd-gv")?.join(hash_path_id(rootfs))) } -fn pick_gvproxy_ssh_port() -> Result { - let listener = std::net::TcpListener::bind(("127.0.0.1", 0)) - .map_err(|e| format!("allocate gvproxy ssh port on localhost: {e}"))?; - let port = listener - .local_addr() - .map_err(|e| format!("read gvproxy ssh port: {e}"))? 
- .port(); - drop(listener); - Ok(port) -} - -fn kill_stale_gvproxy_by_port_map(port_map: &[String]) { - for pm in port_map { - if let Some(host_port) = pm - .split(':') - .next() - .and_then(|port| port.parse::().ok()) - { - kill_stale_gvproxy_by_port(host_port); - } - } -} - -fn kill_stale_gvproxy_by_port(port: u16) { - let output = StdCommand::new("lsof") - .args(["-ti", &format!(":{port}")]) - .output(); - - let pids = match output { - Ok(output) if output.status.success() => { - String::from_utf8_lossy(&output.stdout).to_string() - } - _ => return, - }; - - for line in pids.lines() { - if let Ok(pid) = line.trim().parse::() - && is_process_named(pid as libc::pid_t, "gvproxy") - { - kill_gvproxy_pid(pid); - } - } -} - -fn kill_gvproxy_pid(pid: u32) { - let pid = pid as libc::pid_t; - if unsafe { libc::kill(pid, 0) } != 0 { - return; - } - if !is_process_named(pid, "gvproxy") { - return; - } - unsafe { - libc::kill(pid, libc::SIGTERM); - } - std::thread::sleep(Duration::from_millis(200)); -} - -#[cfg(target_os = "macos")] -fn is_process_named(pid: libc::pid_t, expected: &str) -> bool { - StdCommand::new("ps") - .args(["-p", &pid.to_string(), "-o", "comm="]) - .output() - .ok() - .and_then(|output| { - if output.status.success() { - String::from_utf8(output.stdout).ok() - } else { - None - } - }) - .is_some_and(|name| name.trim().contains(expected)) -} - -#[cfg(target_os = "linux")] -fn is_process_named(pid: libc::pid_t, expected: &str) -> bool { - std::fs::read_to_string(format!("/proc/{pid}/comm")) - .map(|name| name.trim().contains(expected)) - .unwrap_or(false) -} - -#[cfg(not(any(target_os = "macos", target_os = "linux")))] -fn is_process_named(_pid: libc::pid_t, _expected: &str) -> bool { - false -} - fn install_signal_forwarding(pid: i32) { unsafe { libc::signal( @@ -835,38 +617,3 @@ fn check_kvm_access() -> Result<(), String> { format!("cannot open /dev/kvm: {e}\nKVM access is required to run microVMs on Linux.") }) } - -#[cfg(test)] -mod tests { - use 
super::*; - - #[test] - fn plan_gvproxy_ports_reuses_sandbox_ssh_mapping() { - let plan = plan_gvproxy_ports(&["64739:2222".to_string()]).expect("plan should succeed"); - - assert_eq!(plan.ssh_port, 64739); - assert!(plan.forwarded_ports.is_empty()); - } - - #[test] - fn plan_gvproxy_ports_keeps_non_ssh_mappings_for_forwarder() { - let plan = plan_gvproxy_ports(&["64739:8080".to_string()]).expect("plan should succeed"); - - assert_ne!(plan.ssh_port, 64739); - assert_eq!(plan.forwarded_ports, vec!["64739:8080".to_string()]); - } - - #[test] - fn plan_gvproxy_ports_ignores_privileged_host_ports_for_direct_ssh() { - let plan = plan_gvproxy_ports(&["22:2222".to_string()]).expect("plan should succeed"); - - assert_ne!(plan.ssh_port, 22); - assert_eq!(plan.forwarded_ports, vec!["22:2222".to_string()]); - } - - #[test] - fn parse_port_mapping_rejects_invalid_entries() { - let err = parse_port_mapping("bad:mapping").expect_err("invalid mapping should fail"); - assert!(err.contains("invalid port mapping")); - } -} From d83dc04d2b200aea16299132580b3cffe8ebdc76 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Tue, 21 Apr 2026 19:59:47 -0700 Subject: [PATCH 4/5] wip --- architecture/gateway.md | 2 +- crates/openshell-driver-vm/README.md | 2 +- .../scripts/openshell-vm-sandbox-init.sh | 23 +- crates/openshell-driver-vm/src/driver.rs | 40 ++- crates/openshell-driver-vm/src/runtime.rs | 7 +- crates/openshell-server/src/compute/mod.rs | 247 +++++++++++++++++- crates/openshell-server/src/lib.rs | 11 +- .../src/supervisor_session.rs | 29 ++ crates/openshell-vm/scripts/build-rootfs.sh | 63 +++++ 9 files changed, 409 insertions(+), 15 deletions(-) diff --git a/architecture/gateway.md b/architecture/gateway.md index 5dd2419af..9e9da6785 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -605,7 +605,7 @@ The gateway reaches the sandbox exclusively through the supervisor-initiated `Co - **Create**: The VM driver process allocates a sandbox-specific rootfs from its 
own embedded `rootfs.tar.zst`, injects an explicitly configured guest mTLS bundle when the gateway callback endpoint is `https://`, then re-execs itself in a hidden helper mode that loads libkrun directly and boots the supervisor. - **Networking**: The helper starts an embedded `gvproxy`, wires it into libkrun as virtio-net, and gives the guest outbound connectivity. No inbound TCP listener is needed — the supervisor reaches the gateway over its outbound `ConnectSupervisor` stream. -- **Gateway callback**: The guest init script configures `eth0` for gvproxy networking, prefers the configured `OPENSHELL_GRPC_ENDPOINT`, and falls back to host aliases or the gvproxy gateway IP (`192.168.127.1`) when local hostname resolution is unavailable on macOS. +- **Gateway callback**: The guest init script configures `eth0` for gvproxy networking, seeds `/etc/hosts` so `host.openshell.internal` resolves to the gvproxy gateway IP (`192.168.127.1`), preserves gvproxy's legacy `host.containers.internal` / `host.docker.internal` DNS answers, prefers the configured `OPENSHELL_GRPC_ENDPOINT`, and falls back to those aliases or the raw gateway IP when local hostname resolution is unavailable on macOS. - **Guest boot**: The sandbox guest runs a minimal init script that starts `openshell-sandbox` directly as PID 1 inside the VM. - **Watch stream**: Emits provisioning, ready, error, deleting, deleted, and platform-event updates so the gateway store remains the durable source of truth. diff --git a/crates/openshell-driver-vm/README.md b/crates/openshell-driver-vm/README.md index a95462695..03cb92195 100644 --- a/crates/openshell-driver-vm/README.md +++ b/crates/openshell-driver-vm/README.md @@ -97,7 +97,7 @@ The gateway resolves `openshell-driver-vm` in this order: `--driver-dir`, conven | Flag | Env var | Default | Purpose | |---|---|---|---| | `--drivers vm` | `OPENSHELL_DRIVERS` | `kubernetes` | Select the VM compute driver. 
| -| `--grpc-endpoint URL` | `OPENSHELL_GRPC_ENDPOINT` | — | Required. URL the sandbox guest calls back to. Use a host alias that resolves to the gateway's host from inside the VM (gvproxy answers `host.containers.internal` and `host.openshell.internal` to `192.168.127.1`). | +| `--grpc-endpoint URL` | `OPENSHELL_GRPC_ENDPOINT` | — | Required. URL the sandbox guest calls back to. Use a host alias that resolves to the gateway's host from inside the VM (`host.containers.internal` comes from gvproxy DNS; the guest init script also seeds `host.openshell.internal` to `192.168.127.1`). | | `--vm-driver-state-dir DIR` | `OPENSHELL_VM_DRIVER_STATE_DIR` | `target/openshell-vm-driver` | Per-sandbox rootfs, console logs, and the `compute-driver.sock` UDS. | | `--driver-dir DIR` | `OPENSHELL_DRIVER_DIR` | unset | Override the directory searched for `openshell-driver-vm`. | | `--vm-driver-vcpus N` | `OPENSHELL_VM_DRIVER_VCPUS` | `2` | vCPUs per sandbox. | diff --git a/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh b/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh index 70dda5acb..e449003f9 100644 --- a/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh +++ b/crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh @@ -9,6 +9,7 @@ set -euo pipefail BOOT_START=$(date +%s%3N 2>/dev/null || date +%s) +GVPROXY_GATEWAY_IP="192.168.127.1" ts() { local now @@ -72,6 +73,20 @@ tcp_probe() { fi } +ensure_host_gateway_aliases() { + local hosts_tmp="/tmp/openshell-hosts.$$" + + if [ -f /etc/hosts ]; then + grep -vE '(^|[[:space:]])host\.openshell\.internal([[:space:]]|$)' /etc/hosts > "$hosts_tmp" || true + else + : > "$hosts_tmp" + fi + + printf '%s host.openshell.internal\n' "$GVPROXY_GATEWAY_IP" >> "$hosts_tmp" + cat "$hosts_tmp" > /etc/hosts + rm -f "$hosts_tmp" +} + rewrite_openshell_endpoint_if_needed() { local endpoint="${OPENSHELL_ENDPOINT:-}" [ -n "$endpoint" ] || return 0 @@ -92,7 +107,7 @@ rewrite_openshell_endpoint_if_needed() 
{ return 0 fi - for candidate in host.containers.internal host.docker.internal 192.168.127.1; do + for candidate in host.openshell.internal host.containers.internal host.docker.internal "$GVPROXY_GATEWAY_IP"; do if [ "$candidate" = "$host" ]; then continue fi @@ -163,18 +178,20 @@ DHCP_SCRIPT if ! udhcpc -i eth0 -f -q -n -T 1 -t 3 -A 1 -s "$UDHCPC_SCRIPT" 2>&1; then ts "WARNING: DHCP failed, falling back to static config" ip addr add 192.168.127.2/24 dev eth0 2>/dev/null || true - ip route add default via 192.168.127.1 2>/dev/null || true + ip route add default via "$GVPROXY_GATEWAY_IP" 2>/dev/null || true fi else ts "no DHCP client, using static config" ip addr add 192.168.127.2/24 dev eth0 2>/dev/null || true - ip route add default via 192.168.127.1 2>/dev/null || true + ip route add default via "$GVPROXY_GATEWAY_IP" 2>/dev/null || true fi if [ ! -s /etc/resolv.conf ]; then echo "nameserver 8.8.8.8" > /etc/resolv.conf echo "nameserver 8.8.4.4" >> /etc/resolv.conf fi + + ensure_host_gateway_aliases else ts "WARNING: eth0 not found; supervisor will start without guest egress" fi diff --git a/crates/openshell-driver-vm/src/driver.rs b/crates/openshell-driver-vm/src/driver.rs index c3a975d10..f2b50d0e1 100644 --- a/crates/openshell-driver-vm/src/driver.rs +++ b/crates/openshell-driver-vm/src/driver.rs @@ -33,6 +33,8 @@ const DRIVER_NAME: &str = "openshell-driver-vm"; const WATCH_BUFFER: usize = 256; const DEFAULT_VCPUS: u8 = 2; const DEFAULT_MEM_MIB: u32 = 2048; +const GVPROXY_GATEWAY_IP: &str = "192.168.127.1"; +const OPENSHELL_HOST_GATEWAY_ALIAS: &str = "host.openshell.internal"; const GUEST_SSH_SOCKET_PATH: &str = "/run/openshell/ssh.sock"; const GUEST_TLS_DIR: &str = "/opt/openshell/tls"; const GUEST_TLS_CA_PATH: &str = "/opt/openshell/tls/ca.crt"; @@ -147,7 +149,7 @@ fn validate_openshell_endpoint(endpoint: &str) -> Result<(), String> { if invalid_from_vm { return Err(format!( - "openshell endpoint '{endpoint}' is not reachable from sandbox VMs; use a concrete 
host such as 127.0.0.1, host.containers.internal, or another routable address" + "openshell endpoint '{endpoint}' is not reachable from sandbox VMs; use a concrete host such as 127.0.0.1, {OPENSHELL_HOST_GATEWAY_ALIAS}, or another routable address" )); } @@ -723,7 +725,7 @@ fn guest_visible_openshell_endpoint(endpoint: &str) -> String { None => false, }; - if should_rewrite && url.set_host(Some("192.168.127.1")).is_ok() { + if should_rewrite && url.set_host(Some(GVPROXY_GATEWAY_IP)).is_ok() { return url.to_string(); } @@ -1007,19 +1009,47 @@ mod tests { let env = build_guest_environment(&sandbox, &config); assert!(env.contains(&"HOME=/root".to_string())); - assert!(env.contains(&"OPENSHELL_ENDPOINT=http://192.168.127.1:8080/".to_string())); + assert!(env.contains(&format!( + "OPENSHELL_ENDPOINT=http://{GVPROXY_GATEWAY_IP}:8080/" + ))); assert!(env.contains(&"OPENSHELL_SANDBOX_ID=sandbox-123".to_string())); assert!(env.contains(&format!( "OPENSHELL_SSH_SOCKET_PATH={GUEST_SSH_SOCKET_PATH}" ))); } + #[test] + fn guest_visible_openshell_endpoint_rewrites_loopback_hosts_to_gvproxy_gateway() { + assert_eq!( + guest_visible_openshell_endpoint("http://127.0.0.1:8080"), + format!("http://{GVPROXY_GATEWAY_IP}:8080/") + ); + assert_eq!( + guest_visible_openshell_endpoint("http://localhost:8080"), + format!("http://{GVPROXY_GATEWAY_IP}:8080/") + ); + assert_eq!( + guest_visible_openshell_endpoint("https://[::1]:8443"), + format!("https://{GVPROXY_GATEWAY_IP}:8443/") + ); + } + #[test] fn guest_visible_openshell_endpoint_preserves_non_loopback_hosts() { + assert_eq!( + guest_visible_openshell_endpoint(&format!( + "http://{OPENSHELL_HOST_GATEWAY_ALIAS}:8080" + )), + format!("http://{OPENSHELL_HOST_GATEWAY_ALIAS}:8080") + ); assert_eq!( guest_visible_openshell_endpoint("http://host.containers.internal:8080"), "http://host.containers.internal:8080" ); + assert_eq!( + guest_visible_openshell_endpoint(&format!("http://{GVPROXY_GATEWAY_IP}:8080")), + 
format!("http://{GVPROXY_GATEWAY_IP}:8080") + ); assert_eq!( guest_visible_openshell_endpoint("https://gateway.internal:8443"), "https://gateway.internal:8443" @@ -1134,9 +1164,9 @@ mod tests { fn validate_openshell_endpoint_accepts_host_gateway() { validate_openshell_endpoint("http://host.containers.internal:8080") .expect("guest-reachable host alias should be accepted"); - validate_openshell_endpoint("http://192.168.127.1:8080") + validate_openshell_endpoint(&format!("http://{GVPROXY_GATEWAY_IP}:8080")) .expect("gateway IP should be accepted"); - validate_openshell_endpoint("http://host.openshell.internal:8080") + validate_openshell_endpoint(&format!("http://{OPENSHELL_HOST_GATEWAY_ALIAS}:8080")) .expect("openshell host alias should be accepted"); validate_openshell_endpoint("https://gateway.internal:8443") .expect("dns endpoint should be accepted"); diff --git a/crates/openshell-driver-vm/src/runtime.rs b/crates/openshell-driver-vm/src/runtime.rs index 318f710e6..ae6e4c183 100644 --- a/crates/openshell-driver-vm/src/runtime.rs +++ b/crates/openshell-driver-vm/src/runtime.rs @@ -63,8 +63,11 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { // * the DHCP server + default router the guest's udhcpc client // talks to on boot (IPs 192.168.127.1 / .2, defaults for // gvisor-tap-vsock); - // * name resolution for `host.containers.internal` / - // `host.docker.internal`, which is how the guest's + // * the host-facing gateway identity the guest uses for callbacks: + // the init script seeds `/etc/hosts` with + // `host.openshell.internal` pointing at 192.168.127.1 while + // leaving gvproxy's legacy `host.containers.internal` / + // `host.docker.internal` DNS answers intact, which is how the guest's // `rewrite_openshell_endpoint_if_needed` probe reaches the host // gateway when the bare loopback address doesn't resolve from // inside the VM. 
diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index 95ffbfaa4..35c72f80c 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -11,6 +11,7 @@ use crate::grpc::policy::{SANDBOX_SETTINGS_OBJECT_TYPE, sandbox_settings_id}; use crate::persistence::{ObjectId, ObjectName, ObjectRecord, ObjectType, Store}; use crate::sandbox_index::SandboxIndex; use crate::sandbox_watch::SandboxWatchBus; +use crate::supervisor_session::SupervisorSessionRegistry; use crate::tracing_bus::TracingLogBus; use futures::{Stream, StreamExt}; use openshell_core::proto::compute::v1::{ @@ -188,6 +189,7 @@ pub struct ComputeRuntime { sandbox_index: SandboxIndex, sandbox_watch_bus: SandboxWatchBus, tracing_log_bus: TracingLogBus, + supervisor_sessions: Arc, sync_lock: Arc>, } @@ -205,6 +207,7 @@ impl ComputeRuntime { sandbox_index: SandboxIndex, sandbox_watch_bus: SandboxWatchBus, tracing_log_bus: TracingLogBus, + supervisor_sessions: Arc, ) -> Result { let default_image = driver .get_capabilities(Request::new(GetCapabilitiesRequest {})) @@ -220,6 +223,7 @@ impl ComputeRuntime { sandbox_index, sandbox_watch_bus, tracing_log_bus, + supervisor_sessions, sync_lock: Arc::new(Mutex::new(())), }) } @@ -230,6 +234,7 @@ impl ComputeRuntime { sandbox_index: SandboxIndex, sandbox_watch_bus: SandboxWatchBus, tracing_log_bus: TracingLogBus, + supervisor_sessions: Arc, ) -> Result { let driver = KubernetesComputeDriver::new(config) .await @@ -242,6 +247,7 @@ impl ComputeRuntime { sandbox_index, sandbox_watch_bus, tracing_log_bus, + supervisor_sessions, ) .await } @@ -253,6 +259,7 @@ impl ComputeRuntime { sandbox_index: SandboxIndex, sandbox_watch_bus: SandboxWatchBus, tracing_log_bus: TracingLogBus, + supervisor_sessions: Arc, ) -> Result { let driver: SharedComputeDriver = Arc::new(RemoteComputeDriver::new(channel)); Self::from_driver( @@ -262,6 +269,7 @@ impl ComputeRuntime { sandbox_index, 
sandbox_watch_bus, tracing_log_bus, + supervisor_sessions, ) .await } @@ -563,7 +571,8 @@ impl ComputeRuntime { existing.as_ref().and_then(|sandbox| sandbox.spec.as_ref()), ); - let phase = derive_phase(incoming.status.as_ref()); + let session_connected = self.supervisor_sessions.has_session(&incoming.id); + let mut phase = derive_phase(incoming.status.as_ref()); let mut sandbox = existing.unwrap_or_else(|| Sandbox { id: incoming.id.clone(), name: incoming.name.clone(), @@ -574,6 +583,12 @@ impl ComputeRuntime { ..Default::default() }); + if session_connected && matches!(phase, SandboxPhase::Provisioning | SandboxPhase::Unknown) + { + ensure_supervisor_ready_status(&mut status, &sandbox.name); + phase = SandboxPhase::Ready; + } + let old_phase = SandboxPhase::try_from(sandbox.phase).unwrap_or(SandboxPhase::Unknown); if old_phase != phase { info!( @@ -622,6 +637,55 @@ impl ComputeRuntime { Ok(()) } + pub async fn supervisor_session_connected(&self, sandbox_id: &str) -> Result<(), String> { + self.set_supervisor_session_state(sandbox_id, true).await + } + + pub async fn supervisor_session_disconnected(&self, sandbox_id: &str) -> Result<(), String> { + self.set_supervisor_session_state(sandbox_id, false).await + } + + async fn set_supervisor_session_state( + &self, + sandbox_id: &str, + connected: bool, + ) -> Result<(), String> { + let _guard = self.sync_lock.lock().await; + let Some(record) = self + .store + .get(Sandbox::object_type(), sandbox_id) + .await + .map_err(|e| e.to_string())? 
+ else { + return Ok(()); + }; + + let mut sandbox = decode_sandbox_record(&record)?; + let current_phase = SandboxPhase::try_from(sandbox.phase).unwrap_or(SandboxPhase::Unknown); + + if current_phase == SandboxPhase::Deleting || current_phase == SandboxPhase::Error { + return Ok(()); + } + + if connected { + ensure_supervisor_ready_status(&mut sandbox.status, &sandbox.name); + sandbox.phase = SandboxPhase::Ready as i32; + } else if current_phase == SandboxPhase::Ready { + ensure_supervisor_not_ready_status(&mut sandbox.status, &sandbox.name); + sandbox.phase = SandboxPhase::Provisioning as i32; + } else { + return Ok(()); + } + + self.sandbox_index.update_from_sandbox(&sandbox); + self.store + .put_message(&sandbox) + .await + .map_err(|e| e.to_string())?; + self.sandbox_watch_bus.notify(sandbox_id); + Ok(()) + } + async fn apply_deleted(&self, sandbox_id: &str) -> Result<(), String> { let _guard = self.sync_lock.lock().await; self.apply_deleted_locked(sandbox_id).await @@ -963,6 +1027,58 @@ fn public_status_from_driver(status: &DriverSandboxStatus) -> SandboxStatus { } } +fn ensure_supervisor_ready_status(status: &mut Option, sandbox_name: &str) { + upsert_ready_condition( + status, + sandbox_name, + SandboxCondition { + r#type: "Ready".to_string(), + status: "True".to_string(), + reason: "DependenciesReady".to_string(), + message: "Supervisor session connected".to_string(), + last_transition_time: String::new(), + }, + ); +} + +fn ensure_supervisor_not_ready_status(status: &mut Option, sandbox_name: &str) { + upsert_ready_condition( + status, + sandbox_name, + SandboxCondition { + r#type: "Ready".to_string(), + status: "False".to_string(), + reason: "DependenciesNotReady".to_string(), + message: "Supervisor session disconnected".to_string(), + last_transition_time: String::new(), + }, + ); +} + +fn upsert_ready_condition( + status: &mut Option, + sandbox_name: &str, + condition: SandboxCondition, +) { + let status = status.get_or_insert_with(|| SandboxStatus { + 
sandbox_name: sandbox_name.to_string(), + agent_pod: String::new(), + agent_fd: String::new(), + sandbox_fd: String::new(), + conditions: Vec::new(), + }); + + if let Some(existing) = status + .conditions + .iter_mut() + .find(|existing| existing.r#type == "Ready") + { + *existing = condition; + } else { + status.conditions.push(condition); + } +} + fn public_condition_from_driver(condition: &DriverCondition) -> SandboxCondition { SandboxCondition { r#type: condition.r#type.clone(), @@ -1044,6 +1160,7 @@ mod tests { GetSandboxResponse, StopSandboxRequest, StopSandboxResponse, ValidateSandboxCreateResponse, }; use std::sync::Arc; + use tokio::sync::{mpsc, oneshot}; #[derive(Debug, Default)] struct TestDriver { @@ -1159,10 +1276,22 @@ mod tests { sandbox_index: SandboxIndex::new(), sandbox_watch_bus: SandboxWatchBus::new(), tracing_log_bus: TracingLogBus::new(), + supervisor_sessions: Arc::new(SupervisorSessionRegistry::new()), sync_lock: Arc::new(Mutex::new(())), } } + fn register_test_supervisor_session(runtime: &ComputeRuntime, sandbox_id: &str) { + let (tx, _rx) = mpsc::channel(1); + let (shutdown_tx, _shutdown_rx) = oneshot::channel(); + runtime.supervisor_sessions.register( + sandbox_id.to_string(), + "session-1".to_string(), + tx, + shutdown_tx, + ); + } + fn sandbox_record(id: &str, name: &str, phase: SandboxPhase) -> Sandbox { Sandbox { id: id.to_string(), @@ -1417,6 +1546,122 @@ mod tests { ); } + #[tokio::test] + async fn apply_sandbox_update_promotes_connected_supervisor_session_to_ready() { + let runtime = test_runtime(Arc::new(TestDriver::default())).await; + let sandbox = sandbox_record("sb-1", "sandbox-a", SandboxPhase::Provisioning); + runtime.store.put_message(&sandbox).await.unwrap(); + + register_test_supervisor_session(&runtime, "sb-1"); + + runtime + .apply_sandbox_update(DriverSandbox { + id: "sb-1".to_string(), + name: "sandbox-a".to_string(), + namespace: "default".to_string(), + spec: None, + status: 
Some(make_driver_status(make_driver_condition( + "Starting", + "VM is starting", + ))), + }) + .await + .unwrap(); + + let stored = runtime + .store + .get_message::("sb-1") + .await + .unwrap() + .unwrap(); + assert_eq!( + SandboxPhase::try_from(stored.phase).unwrap(), + SandboxPhase::Ready + ); + let ready = stored + .status + .as_ref() + .and_then(|status| { + status + .conditions + .iter() + .find(|condition| condition.r#type == "Ready") + }) + .unwrap(); + assert_eq!(ready.status, "True"); + assert_eq!(ready.reason, "DependenciesReady"); + assert_eq!(ready.message, "Supervisor session connected"); + } + + #[tokio::test] + async fn supervisor_session_connected_promotes_store_state_without_driver_refresh() { + let runtime = test_runtime(Arc::new(TestDriver::default())).await; + let sandbox = sandbox_record("sb-1", "sandbox-a", SandboxPhase::Provisioning); + runtime.store.put_message(&sandbox).await.unwrap(); + + runtime.supervisor_session_connected("sb-1").await.unwrap(); + + let stored = runtime + .store + .get_message::("sb-1") + .await + .unwrap() + .unwrap(); + assert_eq!( + SandboxPhase::try_from(stored.phase).unwrap(), + SandboxPhase::Ready + ); + } + + #[tokio::test] + async fn supervisor_session_disconnected_demotes_ready_sandbox() { + let runtime = test_runtime(Arc::new(TestDriver::default())).await; + let mut sandbox = sandbox_record("sb-1", "sandbox-a", SandboxPhase::Ready); + sandbox.status = Some(SandboxStatus { + sandbox_name: "sandbox-a".to_string(), + agent_pod: String::new(), + agent_fd: String::new(), + sandbox_fd: String::new(), + conditions: vec![SandboxCondition { + r#type: "Ready".to_string(), + status: "True".to_string(), + reason: "DependenciesReady".to_string(), + message: "Supervisor session connected".to_string(), + last_transition_time: String::new(), + }], + }); + runtime.store.put_message(&sandbox).await.unwrap(); + + runtime + .supervisor_session_disconnected("sb-1") + .await + .unwrap(); + + let stored = runtime + .store + 
.get_message::("sb-1") + .await + .unwrap() + .unwrap(); + assert_eq!( + SandboxPhase::try_from(stored.phase).unwrap(), + SandboxPhase::Provisioning + ); + let ready = stored + .status + .as_ref() + .and_then(|status| { + status + .conditions + .iter() + .find(|condition| condition.r#type == "Ready") + }) + .unwrap(); + assert_eq!(ready.status, "False"); + assert_eq!(ready.reason, "DependenciesNotReady"); + assert_eq!(ready.message, "Supervisor session disconnected"); + } + #[tokio::test] async fn reconcile_store_with_backend_applies_driver_snapshot() { let runtime = test_runtime(Arc::new(TestDriver { diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index 9501ea3b2..a40794037 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -88,7 +88,7 @@ pub struct ServerState { pub settings_mutex: tokio::sync::Mutex<()>, /// Registry of active supervisor sessions and pending relay channels. - pub supervisor_sessions: supervisor_session::SupervisorSessionRegistry, + pub supervisor_sessions: Arc, } fn is_benign_tls_handshake_failure(error: &std::io::Error) -> bool { @@ -108,6 +108,7 @@ impl ServerState { sandbox_index: SandboxIndex, sandbox_watch_bus: SandboxWatchBus, tracing_log_bus: TracingLogBus, + supervisor_sessions: Arc, ) -> Self { Self { config, @@ -119,7 +120,7 @@ impl ServerState { ssh_connections_by_token: Mutex::new(HashMap::new()), ssh_connections_by_sandbox: Mutex::new(HashMap::new()), settings_mutex: tokio::sync::Mutex::new(()), - supervisor_sessions: supervisor_session::SupervisorSessionRegistry::new(), + supervisor_sessions, } } } @@ -150,6 +151,7 @@ pub async fn run_server( let sandbox_index = SandboxIndex::new(); let sandbox_watch_bus = SandboxWatchBus::new(); + let supervisor_sessions = Arc::new(supervisor_session::SupervisorSessionRegistry::new()); let compute = build_compute_runtime( &config, &vm_config, @@ -157,6 +159,7 @@ pub async fn run_server( sandbox_index.clone(), 
sandbox_watch_bus.clone(), tracing_log_bus.clone(), + supervisor_sessions.clone(), ) .await?; let state = Arc::new(ServerState::new( @@ -166,6 +169,7 @@ pub async fn run_server( sandbox_index, sandbox_watch_bus, tracing_log_bus, + supervisor_sessions, )); state.compute.spawn_watchers(); @@ -261,6 +265,7 @@ async fn build_compute_runtime( sandbox_index: SandboxIndex, sandbox_watch_bus: SandboxWatchBus, tracing_log_bus: TracingLogBus, + supervisor_sessions: Arc, ) -> Result { let driver = configured_compute_driver(config)?; info!(driver = %driver, "Using compute driver"); @@ -288,6 +293,7 @@ async fn build_compute_runtime( sandbox_index, sandbox_watch_bus, tracing_log_bus, + supervisor_sessions.clone(), ) .await .map_err(|e| Error::execution(format!("failed to create compute runtime: {e}"))), @@ -300,6 +306,7 @@ async fn build_compute_runtime( sandbox_index, sandbox_watch_bus, tracing_log_bus, + supervisor_sessions, ) .await .map_err(|e| Error::execution(format!("failed to create compute runtime: {e}"))) diff --git a/crates/openshell-server/src/supervisor_session.rs b/crates/openshell-server/src/supervisor_session.rs index f81ee9e3c..d130bf71d 100644 --- a/crates/openshell-server/src/supervisor_session.rs +++ b/crates/openshell-server/src/supervisor_session.rs @@ -180,6 +180,10 @@ impl SupervisorSessionRegistry { .map(|s| s.tx.clone()) } + pub fn has_session(&self, sandbox_id: &str) -> bool { + self.sessions.lock().unwrap().contains_key(sandbox_id) + } + fn pending_channel_ids(&self, sandbox_id: &str) -> Vec { self.pending_relays .lock() @@ -547,6 +551,19 @@ pub async fn handle_connect_supervisor( .await; } + if let Err(err) = state + .compute + .supervisor_session_connected(&sandbox_id) + .await + { + warn!( + sandbox_id = %sandbox_id, + session_id = %session_id, + error = %err, + "supervisor session: failed to mark sandbox ready" + ); + } + // Step 4: Spawn the session loop that reads inbound messages. 
let state_clone = Arc::clone(state); let sandbox_id_clone = sandbox_id.clone(); @@ -565,6 +582,18 @@ pub async fn handle_connect_supervisor( .remove_if_current(&sandbox_id_clone, &session_id); if still_ours { info!(sandbox_id = %sandbox_id_clone, session_id = %session_id, "supervisor session: ended"); + if let Err(err) = state_clone + .compute + .supervisor_session_disconnected(&sandbox_id_clone) + .await + { + warn!( + sandbox_id = %sandbox_id_clone, + session_id = %session_id, + error = %err, + "supervisor session: failed to mark sandbox disconnected" + ); + } } else { info!(sandbox_id = %sandbox_id_clone, session_id = %session_id, "supervisor session: ended (already superseded)"); } diff --git a/crates/openshell-vm/scripts/build-rootfs.sh b/crates/openshell-vm/scripts/build-rootfs.sh index d43046d4f..566b32141 100755 --- a/crates/openshell-vm/scripts/build-rootfs.sh +++ b/crates/openshell-vm/scripts/build-rootfs.sh @@ -119,6 +119,65 @@ verify_checksum() { fi } +ensure_build_nofile_limit() { + local desired="${OPENSHELL_VM_BUILD_NOFILE_LIMIT:-8192}" + local minimum=1024 + local current="" + local hard="" + local target="" + + [ "$(uname -s)" = "Darwin" ] || return 0 + command -v cargo-zigbuild >/dev/null 2>&1 || return 0 + + current="$(ulimit -n 2>/dev/null || echo "")" + case "${current}" in + ''|*[!0-9]*) + return 0 + ;; + esac + + if [ "${current}" -ge "${desired}" ]; then + return 0 + fi + + hard="$(ulimit -Hn 2>/dev/null || echo "")" + target="${desired}" + case "${hard}" in + ''|unlimited|infinity) + ;; + *[!0-9]*) + ;; + *) + if [ "${hard}" -lt "${target}" ]; then + target="${hard}" + fi + ;; + esac + + if [ "${target}" -gt "${current}" ] && ulimit -n "${target}" 2>/dev/null; then + echo "==> Raised open file limit for cargo-zigbuild: ${current} -> $(ulimit -n)" + fi + + current="$(ulimit -n 2>/dev/null || echo "${current}")" + case "${current}" in + ''|*[!0-9]*) + return 0 + ;; + esac + + if [ "${current}" -lt "${desired}" ]; then + echo "WARNING: Open 
file limit is ${current}; cargo-zigbuild is more reliable at ${desired}+ on macOS." + fi + + if [ "${current}" -lt "${minimum}" ]; then + echo "ERROR: Open file limit (${current}) is too low for cargo-zigbuild on macOS." + echo " Zig 0.14+ can fail with ProcessFdQuotaExceeded while linking large binaries." + echo " Run: ulimit -n ${desired}" + echo " Then re-run this script." + exit 1 + fi +} + if [ "$BASE_ONLY" = true ]; then echo "==> Building base openshell-vm rootfs" echo " Guest arch: ${GUEST_ARCH}" @@ -135,6 +194,10 @@ else fi echo "" +# cargo-zigbuild on macOS can exhaust the default per-process file descriptor +# limit while linking larger targets with Zig 0.14+. +ensure_build_nofile_limit + # ── Check for running VM ──────────────────────────────────────────────── # If an openshell-vm is using this rootfs via virtio-fs, wiping the rootfs # corrupts the VM's filesystem (e.g. /var disappears) causing cascading From 9c30881021378e82a3cfffb010a5e7d72237e582 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Tue, 21 Apr 2026 23:01:06 -0700 Subject: [PATCH 5/5] cleanup --- Cargo.lock | 15 + architecture/custom-vm-runtime.md | 354 ++++++++++---------- crates/openshell-driver-vm/Cargo.toml | 8 + crates/openshell-driver-vm/Makefile | 7 - crates/openshell-driver-vm/README.md | 44 ++- crates/openshell-driver-vm/src/driver.rs | 14 +- crates/openshell-driver-vm/src/lib.rs | 1 + crates/openshell-driver-vm/src/main.rs | 42 ++- crates/openshell-driver-vm/src/procguard.rs | 196 +++++++++++ crates/openshell-driver-vm/src/runtime.rs | 119 ++++++- crates/openshell-driver-vm/start.sh | 26 +- tasks/scripts/vm/smoke-orphan-cleanup.sh | 204 +++++++++++ tasks/scripts/vm/vm-setup.sh | 2 +- tasks/vm.toml | 21 +- 14 files changed, 835 insertions(+), 218 deletions(-) delete mode 100644 crates/openshell-driver-vm/Makefile create mode 100644 crates/openshell-driver-vm/src/procguard.rs create mode 100755 tasks/scripts/vm/smoke-orphan-cleanup.sh diff --git a/Cargo.lock b/Cargo.lock index 
d5de42fb3..2d0bc6ce2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3101,6 +3101,7 @@ dependencies = [ "miette", "nix", "openshell-core", + "polling", "prost-types", "tar", "tokio", @@ -3672,6 +3673,20 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" +[[package]] +name = "polling" +version = "3.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d0e4f59085d47d8241c88ead0f274e8a0cb551f3625263c05eb8dd897c34218" +dependencies = [ + "cfg-if", + "concurrent-queue", + "hermit-abi", + "pin-project-lite", + "rustix 1.1.4", + "windows-sys 0.61.2", +] + [[package]] name = "poly1305" version = "0.8.0" diff --git a/architecture/custom-vm-runtime.md b/architecture/custom-vm-runtime.md index 548b86d17..4cafe424f 100644 --- a/architecture/custom-vm-runtime.md +++ b/architecture/custom-vm-runtime.md @@ -1,140 +1,161 @@ # Custom libkrunfw VM Runtime -> Status: Experimental and work in progress (WIP). VM support is under active development and may change. +> Status: Experimental and work in progress (WIP). The VM compute driver is +> under active development and may change. ## Overview -The OpenShell gateway VM uses [libkrun](https://github.com/containers/libkrun) to boot a -lightweight microVM with Apple Hypervisor.framework (macOS) or KVM (Linux). The kernel -is embedded inside `libkrunfw`, a companion library that packages a pre-built Linux kernel. +The OpenShell gateway uses [libkrun](https://github.com/containers/libkrun) via the +`openshell-driver-vm` compute driver to boot a lightweight microVM per sandbox. +Each VM runs on Apple Hypervisor.framework (macOS) or KVM (Linux), with the guest +kernel embedded inside `libkrunfw`. -The stock `libkrunfw` from Homebrew ships a minimal kernel without bridge, netfilter, or -conntrack support. This is insufficient for Kubernetes pod networking. 
+The stock `libkrunfw` from Homebrew ships a minimal kernel without bridge, +netfilter, or conntrack support. That is insufficient for the sandbox supervisor's +per-sandbox network namespace primitives (veth pair + iptables, see +`crates/openshell-sandbox/src/sandbox/linux/netns.rs`). The custom libkrunfw +runtime adds bridge, iptables/nftables, and conntrack support to the guest +kernel. -The custom libkrunfw runtime adds bridge CNI, iptables/nftables, and conntrack support to -the VM kernel, enabling standard Kubernetes networking. +The driver is spawned by `openshell-gateway` as a subprocess, talks to it over a +Unix domain socket (`compute-driver.sock`) with the +`openshell.compute.v1.ComputeDriver` gRPC surface, and manages per-sandbox +microVMs. The runtime (libkrun + libkrunfw + gvproxy) and the sandbox rootfs are +embedded directly in the driver binary — no sibling files required at runtime. ## Architecture ```mermaid graph TD subgraph Host["Host (macOS / Linux)"] - BIN[openshell-vm binary] - EMB["Embedded runtime (zstd-compressed)\nlibkrun · libkrunfw · gvproxy"] - CACHE["~/.local/share/openshell/vm-runtime/{version}/"] - PROV[Runtime provenance logging] - GVP[gvproxy networking proxy] - - BIN --> EMB - BIN -->|extracts to| CACHE - BIN --> PROV - BIN -->|spawns| GVP + GATEWAY["openshell-gateway
(compute::vm::spawn)"] + DRIVER["openshell-driver-vm
(compute-driver.sock)"] + EMB["Embedded runtime (zstd)
libkrun · libkrunfw · gvproxy
+ sandbox rootfs.tar.zst"] + GVP["gvproxy (per sandbox)
virtio-net · DHCP · DNS"] + + GATEWAY <-->|gRPC over UDS| DRIVER + DRIVER --> EMB + DRIVER -->|spawns one per sandbox| GVP end - subgraph Guest["Guest VM"] - INIT["openshell-vm-init.sh (PID 1)"] - VAL[Validates kernel capabilities] - CNI[Configures bridge CNI] - EXECA["Starts exec agent\nvsock port 10777"] - PKI[Generates mTLS PKI] - K3S[Execs k3s server] - EXECPY["openshell-vm-exec-agent.py"] - CHK["check-vm-capabilities.sh"] - - INIT --> VAL --> CNI --> EXECA --> PKI --> K3S + subgraph Guest["Per-sandbox microVM"] + SBXINIT["/srv/openshell-vm-sandbox-init.sh"] + SBX["/opt/openshell/bin/openshell-sandbox
(PID 1, supervisor)"] + SBXINIT --> SBX end - BIN -- "fork + krun_start_enter" --> INIT - GVP -- "virtio-net" --> Guest + DRIVER -- "fork + krun_start_enter" --> SBXINIT + GVP -- "virtio-net eth0" --> Guest + SBX -.->|"outbound ConnectSupervisor
gRPC stream"| GATEWAY + CLIENT["openshell-cli"] -->|SSH over supervisor relay| GATEWAY ``` +The driver spawns **one microVM per sandbox**. Each VM boots directly into +`openshell-sandbox` as PID 1. All gateway ingress — SSH, exec, connect — rides +the supervisor-initiated `ConnectSupervisor` gRPC stream opened from inside the +guest back out to the gateway, so gvproxy is configured with `-ssh-port -1` and +never binds a host-side TCP listener. + ## Embedded Runtime -The openshell-vm binary is fully self-contained, embedding both the VM runtime libraries -and a minimal rootfs as zstd-compressed byte arrays. On first use, the binary extracts -these to XDG cache directories with progress bars: +`openshell-driver-vm` embeds the VM runtime libraries and the sandbox rootfs as +zstd-compressed byte arrays, extracting on demand: ``` -~/.local/share/openshell/vm-runtime/{version}/ +~/.local/share/openshell/vm-runtime// # libkrun / libkrunfw / gvproxy ├── libkrun.{dylib,so} ├── libkrunfw.{5.dylib,so.5} └── gvproxy -~/.local/share/openshell/openshell-vm/{version}/instances//rootfs/ -├── usr/local/bin/k3s -├── opt/openshell/bin/openshell-sandbox -├── opt/openshell/manifests/ -└── ... +/sandboxes//rootfs/ # per-sandbox rootfs ``` -This eliminates the need for separate bundles or downloads - a single ~120MB binary -provides everything needed to run the VM. Old cache versions are automatically -cleaned up when a new version is extracted. +Old runtime cache versions are cleaned up when a new version is extracted. 
-### Hybrid Approach +### Sandbox rootfs preparation -The embedded rootfs uses a "minimal" configuration: -- Includes: Base Ubuntu, k3s binary, supervisor binary, helm charts, manifests -- Excludes: Pre-loaded container images (~1GB savings) +The rootfs tarball the driver embeds starts from the same minimal Ubuntu base +used across the project, and is **rewritten into a supervisor-only sandbox +guest** during extraction: -Container images are pulled on demand when sandboxes are created. First boot takes -~30-60s as k3s initializes; subsequent boots use cached state for ~3-5s startup. +- k3s state and Kubernetes manifests are stripped out +- `/srv/openshell-vm-sandbox-init.sh` is installed as the guest entrypoint +- the guest boots directly into `openshell-sandbox` — no k3s, no kube-proxy, + no CNI plugins -For the VM compute driver, the same embedded rootfs is rewritten into a -supervisor-only sandbox guest before boot: +See `crates/openshell-driver-vm/src/rootfs.rs` for the rewrite logic and +`crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh` for the init +script that gets installed. -- removes k3s state and Kubernetes manifests from the extracted rootfs -- installs `/srv/openshell-vm-sandbox-init.sh` -- boots directly into `openshell-sandbox` instead of `openshell-vm-init.sh` -- keeps the same embedded libkrun/libkrunfw kernel/runtime bundle +### `--internal-run-vm` helper -`openshell-driver-vm` now embeds the sandbox rootfs tarball independently so it can -prepare sandbox guests without linking against the `openshell-vm` Rust crate. -It now also embeds the minimal libkrun/libkrunfw bundle it needs for sandbox -boots and launches sandbox guests via a hidden helper mode in the -`openshell-driver-vm` binary itself, without depending on the `openshell-vm` -binary. The helper still starts its own embedded `gvproxy` instance to provide -virtio-net guest egress plus the single inbound SSH port forward used by the -compute driver. 
+The driver binary has two modes: the default mode is the gRPC server; when +launched with `--internal-run-vm` it becomes a per-sandbox launcher. The driver +spawns one launcher per sandbox as a subprocess, which in turn starts `gvproxy` +and calls `krun_start_enter` to boot the guest. Keeping the launcher in the +same binary means the driver ships a single artifact for both roles. -For fully air-gapped environments requiring pre-loaded images, build with: -```bash -mise run vm:rootfs # Full rootfs (~2GB, includes images) -mise run vm:build # Rebuild binary with full rootfs -``` +## Network Plane + +The driver launches a **dedicated `gvproxy` instance per sandbox** to provide the +guest's networking plane: + +- virtio-net backend over a Unix SOCK_STREAM (Linux) or SOCK_DGRAM (macOS vfkit) + socket, which surfaces as `eth0` inside the guest +- DHCP server + default router (192.168.127.1 / 192.168.127.2) for the guest's + udhcpc client +- DNS for host aliases: the guest init script seeds `/etc/hosts` with + `host.openshell.internal` → 192.168.127.1, while leaving gvproxy's legacy + `host.containers.internal` / `host.docker.internal` resolution intact + +The `-listen` API socket and the `-ssh-port` forwarder are both intentionally +omitted. After the supervisor-initiated relay migration the driver does not +enqueue any host-side port forwards, and the guest's SSH listener lives on a +Unix socket at `/run/openshell/ssh.sock` inside the VM that is reached over the +outbound `ConnectSupervisor` gRPC stream. Binding a host listener would race +concurrent sandboxes for port 2222 and surface a misleading "sshd is reachable" +endpoint. + +The sandbox supervisor's per-sandbox netns (veth pair + iptables) branches off +of this plane. libkrun's built-in TSI socket impersonation would not satisfy +those kernel-level primitives, which is why we need the custom libkrunfw. 
-## Network Profile +## Process Lifecycle Cleanup -The VM uses the bridge CNI profile, which requires a custom libkrunfw with bridge and -netfilter kernel support. The init script validates these capabilities at boot and fails -fast with an actionable error if they are missing. +`openshell-driver-vm` installs a cross-platform "die when my parent dies" +primitive (`procguard`) in every link of the spawn chain so that killing +`openshell-gateway` (SIGTERM, SIGKILL, or crash) reaps the driver, per-sandbox +launcher, gvproxy, and the libkrun worker: -### Bridge Profile +- Linux: `nix::sys::prctl::set_pdeathsig(SIGKILL)` +- macOS / BSDs: `smol-rs/polling` with `ProcessOps::Exit` on a helper thread +- gvproxy (the one non-Rust child) gets `PR_SET_PDEATHSIG` via `pre_exec` on + Linux, and is SIGTERM'd from the launcher's procguard cleanup callback on + macOS -- CNI: bridge plugin with `cni0` interface -- IP masquerade: enabled (iptables-legacy via CNI bridge plugin) -- kube-proxy: enabled (nftables mode) -- Service VIPs: functional (ClusterIP, NodePort) -- hostNetwork workarounds: not required +See `crates/openshell-driver-vm/src/procguard.rs` for the implementation and +`tasks/scripts/vm/smoke-orphan-cleanup.sh` (exposed as +`mise run vm:smoke:orphan-cleanup`) for the regression test that covers both +SIGTERM and SIGKILL paths. ## Runtime Provenance -At boot, the openshell-vm binary logs provenance metadata about the loaded runtime bundle: +At driver startup the loaded runtime bundle is logged with: - Library paths and SHA-256 hashes - Whether the runtime is custom-built or stock - For custom runtimes: libkrunfw commit, kernel version, build timestamp -This information is sourced from `provenance.json` (generated by the build script) -and makes it straightforward to correlate VM behavior with a specific runtime artifact. 
+This information is sourced from `provenance.json` (generated by the build +script) and makes it straightforward to correlate sandbox VM behavior with a +specific runtime artifact. ## Build Pipeline ```mermaid graph LR subgraph Source["crates/openshell-vm/runtime/"] - KCONF["kernel/openshell.kconfig\nKernel config fragment"] - README["README.md\nOperator documentation"] + KCONF["kernel/openshell.kconfig
Kernel config fragment"] end subgraph Linux["Linux CI (build-libkrun.sh)"] @@ -145,101 +166,87 @@ graph LR BUILD_M["Build libkrunfw.dylib + libkrun.dylib"] end - subgraph Output["target/libkrun-build/"] - LIB_SO["libkrunfw.so + libkrun.so\n(Linux)"] - LIB_DY["libkrunfw.dylib + libkrun.dylib\n(macOS)"] + subgraph Output["vm-runtime-<platform>.tar.zst"] + LIB_SO["libkrunfw.so + libkrun.so + gvproxy
(Linux)"] + LIB_DY["libkrunfw.dylib + libkrun.dylib + gvproxy
(macOS)"] end - KCONF --> BUILD_L - BUILD_L --> LIB_SO - KCONF --> BUILD_M - BUILD_M --> LIB_DY + KCONF --> BUILD_L --> LIB_SO + KCONF --> BUILD_M --> LIB_DY ``` +The `vm-runtime-.tar.zst` artifact is consumed by +`openshell-driver-vm`'s `build.rs`, which embeds the library set into the +binary via `include_bytes!()`. Setting `OPENSHELL_VM_RUNTIME_COMPRESSED_DIR` +at build time (wired up by `crates/openshell-driver-vm/start.sh`) points the +build at the staged artifacts. + ## Kernel Config Fragment -The `openshell.kconfig` fragment enables these kernel features on top of the stock -libkrunfw kernel: +The `openshell.kconfig` fragment enables these kernel features on top of the +stock libkrunfw kernel: | Feature | Key Configs | Purpose | |---------|-------------|---------| -| Network namespaces | `CONFIG_NET_NS`, `CONFIG_NAMESPACES` | Pod isolation | -| veth | `CONFIG_VETH` | Pod network namespace pairs | -| Bridge device | `CONFIG_BRIDGE`, `CONFIG_BRIDGE_NETFILTER` | cni0 bridge for pod networking, kube-proxy bridge traffic visibility | +| Network namespaces | `CONFIG_NET_NS`, `CONFIG_NAMESPACES` | Sandbox netns isolation | +| veth | `CONFIG_VETH` | Sandbox network namespace pairs | +| Bridge device | `CONFIG_BRIDGE`, `CONFIG_BRIDGE_NETFILTER` | Bridge support + iptables visibility into bridge traffic | | Netfilter framework | `CONFIG_NETFILTER`, `CONFIG_NETFILTER_ADVANCED`, `CONFIG_NETFILTER_XTABLES` | iptables/nftables framework | -| xtables match modules | `CONFIG_NETFILTER_XT_MATCH_CONNTRACK`, `_COMMENT`, `_MULTIPORT`, `_MARK`, `_STATISTIC`, `_ADDRTYPE`, `_RECENT`, `_LIMIT` | kube-proxy and kubelet iptables rules | +| xtables match modules | `CONFIG_NETFILTER_XT_MATCH_CONNTRACK`, `_COMMENT`, `_MULTIPORT`, `_MARK`, `_STATISTIC`, `_ADDRTYPE`, `_RECENT`, `_LIMIT` | Sandbox supervisor iptables rules | | Connection tracking | `CONFIG_NF_CONNTRACK`, `CONFIG_NF_CT_NETLINK` | NAT state tracking | -| NAT | `CONFIG_NF_NAT` | Service VIP DNAT/SNAT | -| iptables | 
`CONFIG_IP_NF_IPTABLES`, `CONFIG_IP_NF_FILTER`, `CONFIG_IP_NF_NAT`, `CONFIG_IP_NF_MANGLE` | CNI bridge masquerade and compat | -| nftables | `CONFIG_NF_TABLES`, `CONFIG_NFT_CT`, `CONFIG_NFT_NAT`, `CONFIG_NFT_MASQ`, `CONFIG_NFT_NUMGEN`, `CONFIG_NFT_FIB_IPV4` | kube-proxy nftables mode (primary) | -| IP forwarding | `CONFIG_IP_ADVANCED_ROUTER`, `CONFIG_IP_MULTIPLE_TABLES` | Pod-to-pod routing | -| IPVS | `CONFIG_IP_VS`, `CONFIG_IP_VS_RR`, `CONFIG_IP_VS_NFCT` | kube-proxy IPVS mode (optional) | -| Traffic control | `CONFIG_NET_SCH_HTB`, `CONFIG_NET_CLS_CGROUP` | Kubernetes QoS | -| Cgroups | `CONFIG_CGROUPS`, `CONFIG_CGROUP_DEVICE`, `CONFIG_MEMCG`, `CONFIG_CGROUP_PIDS` | Container resource limits | -| TUN/TAP | `CONFIG_TUN` | CNI plugin support | +| NAT | `CONFIG_NF_NAT` | Sandbox egress DNAT/SNAT | +| iptables | `CONFIG_IP_NF_IPTABLES`, `CONFIG_IP_NF_FILTER`, `CONFIG_IP_NF_NAT`, `CONFIG_IP_NF_MANGLE` | Masquerade and compat | +| nftables | `CONFIG_NF_TABLES`, `CONFIG_NFT_CT`, `CONFIG_NFT_NAT`, `CONFIG_NFT_MASQ`, `CONFIG_NFT_NUMGEN`, `CONFIG_NFT_FIB_IPV4` | nftables path | +| IP forwarding | `CONFIG_IP_ADVANCED_ROUTER`, `CONFIG_IP_MULTIPLE_TABLES` | Sandbox-to-host routing | +| Traffic control | `CONFIG_NET_SCH_HTB`, `CONFIG_NET_CLS_CGROUP` | QoS | +| Cgroups | `CONFIG_CGROUPS`, `CONFIG_CGROUP_DEVICE`, `CONFIG_MEMCG`, `CONFIG_CGROUP_PIDS` | Sandbox resource limits | +| TUN/TAP | `CONFIG_TUN` | CNI plugin compatibility; inherited from the shared kconfig, not exercised by the driver. 
| | Dummy interface | `CONFIG_DUMMY` | Fallback networking | -| Landlock | `CONFIG_SECURITY_LANDLOCK` | Filesystem sandboxing support | -| Seccomp filter | `CONFIG_SECCOMP_FILTER` | Syscall filtering support | +| Landlock | `CONFIG_SECURITY_LANDLOCK` | Sandbox supervisor filesystem sandboxing | +| Seccomp filter | `CONFIG_SECCOMP_FILTER` | Sandbox supervisor syscall filtering | -See `crates/openshell-vm/runtime/kernel/openshell.kconfig` for the full fragment with -inline comments explaining why each option is needed. +See `crates/openshell-vm/runtime/kernel/openshell.kconfig` for the full +fragment with inline comments explaining why each option is needed. ## Verification -One verification tool is provided: - -1. **Capability checker** (`check-vm-capabilities.sh`): Runs inside the VM to verify - kernel capabilities. Produces pass/fail results for each required feature. - -## Running Commands In A Live VM - -The standalone `openshell-vm` binary supports `openshell-vm exec -- ` for a running VM. - -- Each VM instance stores local runtime state next to its instance rootfs -- libkrun maps a per-instance host Unix socket into the guest on vsock port `10777` -- `openshell-vm-init.sh` starts `openshell-vm-exec-agent.py` during boot -- `openshell-vm exec` connects to the host socket, which libkrun forwards into the guest exec agent -- The guest exec agent spawns the command, then streams stdout, stderr, and exit status back -- The host-side bootstrap also uses the exec agent to read PKI cert files from the guest - (via `cat /opt/openshell/pki/`) instead of requiring a separate vsock server - -`openshell-vm exec` also injects `KUBECONFIG=/etc/rancher/k3s/k3s.yaml` by default so kubectl-style -commands work the same way they would inside the VM shell. +- **Capability checker** (`check-vm-capabilities.sh`): runs inside a sandbox VM + to verify kernel capabilities. Produces pass/fail results for each required + feature. 
+- **Orphan-cleanup smoke test**: `mise run vm:smoke:orphan-cleanup` asserts + that killing the gateway leaves zero driver, launcher, gvproxy, or libkrun + survivors. ## Build Commands -```bash +```shell # One-time setup: download pre-built runtime (~30s) mise run vm:setup -# Build and run -mise run vm - -# Build embedded binary with base rootfs (~120MB, recommended) -mise run vm:rootfs -- --base # Build base rootfs tarball -mise run vm:build # Build binary with embedded rootfs - -# Build with full rootfs (air-gapped, ~2GB+) -mise run vm:rootfs # Build full rootfs tarball -mise run vm:build # Rebuild binary +# Start openshell-gateway with the VM compute driver +mise run gateway:vm # With custom kernel (optional, adds ~20 min) -FROM_SOURCE=1 mise run vm:setup # Build runtime from source -mise run vm:build # Then build embedded binary +FROM_SOURCE=1 mise run vm:setup # Wipe everything and start over mise run vm:clean ``` +See `crates/openshell-driver-vm/README.md` for the full driver workflow, +including multi-gateway development, CLI registration, and sandbox creation +examples. + ## CI/CD -The openshell-vm build is split into two GitHub Actions workflows that publish to a -rolling `vm-dev` GitHub Release: +Two GitHub Actions workflows back the driver's release artifacts, both +publishing to a rolling `vm-dev` GitHub Release: ### Kernel Runtime (`release-vm-kernel.yml`) -Builds the custom libkrunfw (kernel firmware), libkrun (VMM), and gvproxy for all -supported platforms. Runs on-demand or when the kernel config / pinned versions change. +Builds the custom libkrunfw (kernel firmware), libkrun (VMM), and gvproxy for +all supported platforms. Runs on-demand or when the kernel config / pinned +versions change. | Platform | Runner | Build Method | |----------|--------|-------------| @@ -247,43 +254,36 @@ supported platforms. 
Runs on-demand or when the kernel config / pinned versions | Linux x86_64 | `build-amd64` (self-hosted) | Native `build-libkrun.sh` | | macOS ARM64 | `macos-latest-xlarge` (GitHub-hosted) | `build-libkrun-macos.sh` | -Artifacts: `vm-runtime-{platform}.tar.zst` containing libkrun, libkrunfw, gvproxy, and -provenance metadata. - -Each platform builds its own libkrunfw and libkrun natively. The kernel inside -libkrunfw is always Linux regardless of host platform. +Artifacts: `vm-runtime-{platform}.tar.zst` containing libkrun, libkrunfw, +gvproxy, and provenance metadata. Each platform builds its own libkrunfw and +libkrun natively; the kernel inside libkrunfw is always Linux regardless of +host platform. -### VM Binary (`release-vm-dev.yml`) +### Driver Binary (`release-vm-dev.yml`) -Builds the self-extracting openshell-vm binary for all platforms. Runs on every push -to `main` that touches VM-related crates. +Builds the self-contained `openshell-driver-vm` binary for every platform, +with the kernel runtime + sandbox rootfs embedded. Runs on every push to +`main` that touches VM-related crates. -```mermaid -graph TD - CV[compute-versions] --> DL[download-kernel-runtime\nfrom vm-dev release] - DL --> RFS_ARM[build-rootfs arm64] - DL --> RFS_AMD[build-rootfs amd64] - RFS_ARM --> VM_ARM[build-vm linux-arm64] - RFS_AMD --> VM_AMD[build-vm linux-amd64] - RFS_ARM --> VM_MAC["build-vm-macos\n(osxcross, reuses arm64 rootfs)"] - VM_ARM --> REL[release-vm-dev\nupload to rolling release] - VM_AMD --> REL - VM_MAC --> REL -``` +The `download-kernel-runtime` job pulls the current `vm-runtime-.tar.zst` +from the `vm-dev` release; the `build-openshell-driver-vm` jobs set +`OPENSHELL_VM_RUNTIME_COMPRESSED_DIR=$PWD/target/vm-runtime-compressed` and +run `cargo build --release -p openshell-driver-vm`. The macOS driver is +cross-compiled via osxcross (no macOS runner needed for the binary build — +only for the kernel build). 
-The macOS binary is cross-compiled via osxcross (no macOS runner needed for the binary -build — only for the kernel build). The macOS VM guest is always Linux ARM64, so it -reuses the arm64 rootfs. - -macOS binaries produced via osxcross are not codesigned. Users must self-sign: -```bash -codesign --entitlements crates/openshell-vm/entitlements.plist --force -s - ./openshell-vm -``` +macOS driver binaries produced via osxcross are not codesigned. Development +builds are signed automatically by `crates/openshell-driver-vm/start.sh`; a +packaged release needs signing in CI. ## Rollout Strategy -1. Custom runtime is embedded by default when building with `mise run vm:build`. -2. The init script validates kernel capabilities at boot and fails fast if missing. -3. For development, override with `OPENSHELL_VM_RUNTIME_DIR` to use a local directory. -4. In CI, kernel runtime is pre-built and cached in the `vm-dev` release. The binary - build downloads it via `download-kernel-runtime.sh`. +1. Custom runtime is embedded by default when building `openshell-driver-vm` + with `OPENSHELL_VM_RUNTIME_COMPRESSED_DIR` set (wired up by + `crates/openshell-driver-vm/start.sh`). +2. The sandbox init script validates kernel capabilities at boot and fails + fast if missing. +3. For development, override with `OPENSHELL_VM_RUNTIME_DIR` to use a local + directory instead of the extracted cache. +4. In CI, the kernel runtime is pre-built and cached in the `vm-dev` release. + The driver build downloads it via `download-kernel-runtime.sh`. diff --git a/crates/openshell-driver-vm/Cargo.toml b/crates/openshell-driver-vm/Cargo.toml index 368716ef9..b4d92b0fc 100644 --- a/crates/openshell-driver-vm/Cargo.toml +++ b/crates/openshell-driver-vm/Cargo.toml @@ -37,5 +37,13 @@ libloading = "0.8" tar = "0.4" zstd = "0.13" +# smol-rs/polling drives the BSD/macOS parent-death detection in +# procguard via kqueue's EVFILT_PROC / NOTE_EXIT filter. 
We could use +# it on Linux too (via epoll + pidfd) but sticking with +# nix::sys::prctl::set_pdeathsig there keeps the Linux path a single +# syscall with no helper thread. +[target.'cfg(any(target_os = "macos", target_os = "ios", target_os = "freebsd", target_os = "netbsd", target_os = "openbsd", target_os = "dragonfly"))'.dependencies] +polling = "3.11" + [lints] workspace = true diff --git a/crates/openshell-driver-vm/Makefile b/crates/openshell-driver-vm/Makefile deleted file mode 100644 index e1c360f3d..000000000 --- a/crates/openshell-driver-vm/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -.PHONY: start - -start: - ./start.sh diff --git a/crates/openshell-driver-vm/README.md b/crates/openshell-driver-vm/README.md index 03cb92195..8808b25d9 100644 --- a/crates/openshell-driver-vm/README.md +++ b/crates/openshell-driver-vm/README.md @@ -31,19 +31,15 @@ Sandbox guests execute `/opt/openshell/bin/openshell-sandbox` as PID 1 inside th ## Quick start (recommended) -`start.sh` handles runtime setup, builds, codesigning, and environment wiring. From the repo root: ```shell -crates/openshell-driver-vm/start.sh +mise run gateway:vm ``` -or equivalently: - -```shell -make -C crates/openshell-driver-vm start -``` -First run takes a few minutes while `mise run vm:setup` stages libkrun/libkrunfw/gvproxy and `mise run vm:rootfs -- --base` builds the embedded rootfs. Subsequent runs are cached. State lives under `target/openshell-vm-driver-dev/` (SQLite DB + per-sandbox rootfs + `compute-driver.sock`). +First run takes a few minutes while `mise run vm:setup` stages libkrun/libkrunfw/gvproxy and `mise run vm:rootfs -- --base` builds the embedded rootfs. Subsequent runs are cached. 
To keep the Unix socket path under macOS `SUN_LEN`, `mise run gateway:vm` and `start.sh` default the state dir to `/tmp/openshell-vm-driver-dev-$USER-port-$PORT/` (SQLite DB + per-sandbox rootfs + `compute-driver.sock`) unless `OPENSHELL_VM_DRIVER_STATE_DIR` is set. +The wrapper also prints the recommended gateway name (`vm-driver-port-$PORT` by default) plus the exact repo-local `scripts/bin/openshell gateway add` and `scripts/bin/openshell gateway select` commands to use from another terminal. This avoids accidentally hitting an older `openshell` binary elsewhere on your `PATH`. +It also exports `OPENSHELL_DRIVER_DIR=$PWD/target/debug` before starting the gateway so local dev runs use the freshly built `openshell-driver-vm` instead of an older installed copy from `~/.local/libexec/openshell` or `/usr/local/libexec`. Override via environment: @@ -53,10 +49,33 @@ OPENSHELL_SSH_HANDSHAKE_SECRET=$(openssl rand -hex 32) \ crates/openshell-driver-vm/start.sh ``` +Run multiple dev gateways side by side by giving each one a unique port. The wrapper derives a distinct default state dir from that port automatically: + +```shell +OPENSHELL_SERVER_PORT=8080 mise run gateway:vm +OPENSHELL_SERVER_PORT=8081 mise run gateway:vm +``` + +If you want a custom suffix instead of `port-$PORT`, set `OPENSHELL_VM_INSTANCE`: + +```shell +OPENSHELL_SERVER_PORT=8082 \ +OPENSHELL_VM_INSTANCE=feature-a \ +mise run gateway:vm +``` + +If you want a custom CLI gateway name, set `OPENSHELL_VM_GATEWAY_NAME`: + +```shell +OPENSHELL_SERVER_PORT=8082 \ +OPENSHELL_VM_GATEWAY_NAME=vm-feature-a \ +mise run gateway:vm +``` + Teardown: ```shell -rm -rf target/openshell-vm-driver-dev +rm -rf /tmp/openshell-vm-driver-dev-$USER-port-8080 ``` ## Manual equivalent @@ -78,16 +97,17 @@ codesign \ --force -s - target/debug/openshell-driver-vm # 4. 
Start the gateway with the VM driver -mkdir -p target/openshell-vm-driver-dev +mkdir -p /tmp/openshell-vm-driver-dev-$USER-port-8080 target/debug/openshell-gateway \ --drivers vm \ --disable-tls \ - --database-url sqlite:target/openshell-vm-driver-dev/openshell.db \ + --database-url sqlite:/tmp/openshell-vm-driver-dev-$USER-port-8080/openshell.db \ + --driver-dir $PWD/target/debug \ --grpc-endpoint http://host.containers.internal:8080 \ --ssh-handshake-secret dev-vm-driver-secret \ --ssh-gateway-host 127.0.0.1 \ --ssh-gateway-port 8080 \ - --vm-driver-state-dir $PWD/target/openshell-vm-driver-dev + --vm-driver-state-dir /tmp/openshell-vm-driver-dev-$USER-port-8080 ``` The gateway resolves `openshell-driver-vm` in this order: `--driver-dir`, conventional install locations (`~/.local/libexec/openshell`, `/usr/local/libexec/openshell`, `/usr/local/libexec`), then a sibling of the gateway binary. diff --git a/crates/openshell-driver-vm/src/driver.rs b/crates/openshell-driver-vm/src/driver.rs index f2b50d0e1..d649a585a 100644 --- a/crates/openshell-driver-vm/src/driver.rs +++ b/crates/openshell-driver-vm/src/driver.rs @@ -258,7 +258,19 @@ impl VmDriver { let console_output = state_dir.join("rootfs-console.log"); let mut command = Command::new(&self.launcher_bin); - command.kill_on_drop(true); + // Intentionally DO NOT set kill_on_drop(true). On a signal-driven + // driver exit (SIGKILL, SIGTERM without a handler, panic), + // tokio's Drop is racy with the launcher's procguard-initiated + // cleanup: if kill_on_drop SIGKILLs the launcher first, its + // cleanup callback never gets to SIGTERM gvproxy, and gvproxy is + // reparented to init as an orphan. Instead the whole cleanup + // cascade runs via procguard: + // driver exits → launcher's kqueue (macOS) or PR_SET_PDEATHSIG + // (Linux) fires → launcher kills gvproxy + libkrun fork → + // launcher exits → its own children die under pdeathsig. 
+ // The explicit Drop path in VmProcess::terminate_vm_process still + // handles voluntary `delete_sandbox` teardown cleanly, where we + // do want SIGTERM + wait + SIGKILL semantics. command.stdin(Stdio::null()); command.stdout(Stdio::inherit()); command.stderr(Stdio::inherit()); diff --git a/crates/openshell-driver-vm/src/lib.rs b/crates/openshell-driver-vm/src/lib.rs index c57bc66cd..772db47b3 100644 --- a/crates/openshell-driver-vm/src/lib.rs +++ b/crates/openshell-driver-vm/src/lib.rs @@ -4,6 +4,7 @@ pub mod driver; mod embedded_runtime; mod ffi; +pub mod procguard; mod rootfs; mod runtime; diff --git a/crates/openshell-driver-vm/src/main.rs b/crates/openshell-driver-vm/src/main.rs index 35e6b618d..5a675e78a 100644 --- a/crates/openshell-driver-vm/src/main.rs +++ b/crates/openshell-driver-vm/src/main.rs @@ -6,7 +6,8 @@ use miette::{IntoDiagnostic, Result}; use openshell_core::VERSION; use openshell_core::proto::compute::v1::compute_driver_server::ComputeDriverServer; use openshell_driver_vm::{ - VM_RUNTIME_DIR_ENV, VmDriver, VmDriverConfig, VmLaunchConfig, configured_runtime_dir, run_vm, + VM_RUNTIME_DIR_ENV, VmDriver, VmDriverConfig, VmLaunchConfig, configured_runtime_dir, + procguard, run_vm, }; use std::net::SocketAddr; use std::path::PathBuf; @@ -98,6 +99,14 @@ struct Args { async fn main() -> Result<()> { let args = Args::parse(); if args.internal_run_vm { + // We intentionally defer procguard arming until `run_vm()` so + // that the only arm is the one that knows how to clean up + // gvproxy. Racing two watchers against the same parent-death + // event causes the bare arm's `exit(1)` to win, skipping the + // gvproxy cleanup and leaking the helper. The risk window + // before `run_vm` arms procguard is ~a few syscalls long + // (`build_vm_launch_config`, `configured_runtime_dir`), which + // is negligible next to the parent gRPC server's uptime. 
maybe_reexec_internal_vm_with_runtime_env()?; let config = build_vm_launch_config(&args).map_err(|err| miette::miette!("{err}"))?; run_vm(&config).map_err(|err| miette::miette!("{err}"))?; @@ -110,6 +119,18 @@ async fn main() -> Result<()> { ) .init(); + // Arm procguard so that if the gateway is killed (SIGKILL or crash) + // we also die. Without this the driver is reparented to init and + // keeps its per-sandbox VM launchers alive forever. Launchers have + // their own procguards (armed in `run_vm`) which cascade cleanup of + // gvproxy and the libkrun worker the moment this driver exits. + if let Err(err) = procguard::die_with_parent() { + tracing::warn!( + error = %err, + "procguard arm failed; gateway crashes may orphan this driver" + ); + } + let driver = VmDriver::new(VmDriverConfig { openshell_endpoint: args .openshell_endpoint @@ -187,6 +208,8 @@ fn build_vm_launch_config(args: &Args) -> std::result::Result Result<()> { + use std::os::unix::process::CommandExt as _; + const REEXEC_ENV: &str = "__OPENSHELL_DRIVER_VM_REEXEC"; if std::env::var_os(REEXEC_ENV).is_some() { @@ -209,14 +232,23 @@ fn maybe_reexec_internal_vm_with_runtime_env() -> Result<()> { .map_err(|err| miette::miette!("join DYLD_LIBRARY_PATH: {err}"))?; let exe = std::env::current_exe().into_diagnostic()?; let args: Vec = std::env::args().skip(1).collect(); - let status = std::process::Command::new(exe) + + // Use execvp() so the current process is *replaced* by the re-exec'd + // binary — no wrapper process sits between the compute driver and + // the actually-running VM launcher. That avoids two problems: + // 1. An extra process level that survives SIGKILL of the driver + // (the wrapper was reparenting the re-exec'd child to init). + // 2. Signal forwarding: with a wrapper, a SIGTERM to the wrapper + // doesn't reach the child unless we hand-roll forwarding. + // After exec, the re-exec'd binary continues under this same PID as the compute driver's direct child; procguard itself is armed later, inside `run_vm`. 
+ let err = std::process::Command::new(exe) .args(&args) .env("DYLD_LIBRARY_PATH", &joined) .env(VM_RUNTIME_DIR_ENV, runtime_dir) .env(REEXEC_ENV, "1") - .status() - .into_diagnostic()?; - std::process::exit(status.code().unwrap_or(1)); + .exec(); + // `exec()` only returns on failure. + Err(miette::miette!("failed to re-exec with runtime env: {err}")) } #[cfg(not(target_os = "macos"))] diff --git a/crates/openshell-driver-vm/src/procguard.rs b/crates/openshell-driver-vm/src/procguard.rs new file mode 100644 index 000000000..1d91880f7 --- /dev/null +++ b/crates/openshell-driver-vm/src/procguard.rs @@ -0,0 +1,196 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Cross-platform "die when my parent dies" primitive. +//! +//! The VM driver spawns a chain of subprocesses (compute driver → `--internal-run-vm` +//! launcher → gvproxy + libkrun fork). If any link in that chain is killed +//! with SIGKILL — or simply crashes — the children are reparented to init +//! and survive indefinitely, leaking libkrun workers and gvproxy +//! instances. +//! +//! This module exposes two functions: +//! * [`die_with_parent`] — configure the kernel (Linux) or a helper +//! thread (BSDs, incl. macOS) to SIGKILL the current process when its +//! parent dies. Call it from `main` in every subprocess we spawn +//! along the chain. Idempotent-ish (each call is a full setup — see +//! the runtime.rs comment at the single call site). +//! * [`die_with_parent_cleanup`] — same as above, but on the BSD path a +//! best-effort cleanup callback runs *before* this process exits. +//! This matters when we own a non-Rust child (e.g. gvproxy) that +//! cannot arm its own procguard; the callback lets us SIGTERM it +//! first. +//! +//! The Linux path uses `nix::sys::prctl::set_pdeathsig(SIGKILL)`, and +//! the BSD path uses `smol-rs/polling` with its `kqueue::Process` + +//! 
`ProcessOps::Exit` filter. Both are well-tested library surfaces; +//! we keep only the glue code and the pre-arming parent-liveness +//! re-check. + +/// Arrange for the current process to receive SIGKILL if its parent dies. +/// +/// On Linux this sets `PR_SET_PDEATHSIG` to SIGKILL (via +/// `nix::sys::prctl`). The kernel delivers SIGKILL the moment +/// `getppid()` changes away from the original parent. +/// +/// On the BSD family (macOS, FreeBSD, etc.) this spawns a detached +/// helper thread that uses `kqueue` with `EVFILT_PROC | NOTE_EXIT` on +/// the parent PID. When the parent exits the thread calls `exit(1)`, +/// which is sufficient for our use case — we are not a critical daemon +/// that needs to drain state; we are a VM launcher / gRPC driver whose +/// entire job is tied to the parent's lifetime. +pub fn die_with_parent() -> Result<(), String> { + die_with_parent_cleanup(|| ()) +} + +/// Like [`die_with_parent`], but run `cleanup` (best-effort, +/// async-signal-unsafe — it runs on the helper thread) immediately +/// before terminating the process. Use this when we own children that +/// cannot arm their own procguard; the cleanup hook is the only chance +/// we get to send them SIGTERM after the kernel reparents us. +/// +/// On Linux the cleanup is a no-op: `PR_SET_PDEATHSIG` delivers SIGKILL +/// directly to us, there is no Rust-controlled moment between "parent +/// died" and "we die" in which we could run a callback. +pub fn die_with_parent_cleanup(cleanup: F) -> Result<(), String> +where + F: FnOnce() + Send + 'static, +{ + #[cfg(target_os = "linux")] + { + // Linux has no opportunity for a cleanup hook — the kernel + // delivers SIGKILL directly. Callers that need pre-exit cleanup + // must combine this with a `pre_exec` PR_SET_PDEATHSIG on their + // children (so the kernel cascades) or rely on process-group + // killpg from a signal handler in the parent. 
+ let _ = cleanup; // intentionally dropped + install_linux_pdeathsig() + } + + #[cfg(any( + target_os = "macos", + target_os = "ios", + target_os = "freebsd", + target_os = "netbsd", + target_os = "openbsd", + target_os = "dragonfly", + ))] + { + install_bsd_kqueue_watcher(cleanup) + } + + #[cfg(not(any( + target_os = "linux", + target_os = "macos", + target_os = "ios", + target_os = "freebsd", + target_os = "netbsd", + target_os = "openbsd", + target_os = "dragonfly", + )))] + { + let _ = cleanup; + Ok(()) + } +} + +#[cfg(target_os = "linux")] +fn install_linux_pdeathsig() -> Result<(), String> { + use nix::sys::signal::Signal; + use nix::unistd::getppid; + + // Race: if the parent already died between fork/exec and this call, + // `getppid()` now returns 1 and PR_SET_PDEATHSIG will never fire. + // Read the current parent first so we can detect that case and exit. + let original_ppid = getppid(); + if original_ppid == nix::unistd::Pid::from_raw(1) { + return Err("process was already orphaned before procguard armed".to_string()); + } + + nix::sys::prctl::set_pdeathsig(Signal::SIGKILL) + .map_err(|err| format!("prctl(PR_SET_PDEATHSIG) failed: {err}"))?; + + // Re-check after arming: the parent may have died between getppid() + // and prctl(). If so, PR_SET_PDEATHSIG missed its window. 
+ if getppid() != original_ppid { + return Err("parent exited before procguard could arm".to_string()); + } + + Ok(()) +} + +#[cfg(any( + target_os = "macos", + target_os = "ios", + target_os = "freebsd", + target_os = "netbsd", + target_os = "openbsd", + target_os = "dragonfly", +))] +fn install_bsd_kqueue_watcher(cleanup: F) -> Result<(), String> +where + F: FnOnce() + Send + 'static, +{ + use nix::unistd::getppid; + use polling::os::kqueue::{PollerKqueueExt, Process, ProcessOps}; + use polling::{Events, PollMode, Poller}; + + let parent_pid = getppid(); + if parent_pid == nix::unistd::Pid::from_raw(1) { + return Err("process was already orphaned before procguard armed".to_string()); + } + let parent_pid_nz = std::num::NonZeroI32::new(parent_pid.as_raw()) + .ok_or_else(|| "getppid returned 0 unexpectedly".to_string())?; + + // Build the poller on the caller's thread so any setup error + // surfaces synchronously. `EVFILT_PROC | NOTE_EXIT` is a one-shot + // filter, so `PollMode::Oneshot` matches the kernel semantics. + // + // SAFETY: `Process::from_pid` requires the PID to "be tied to an + // actual child process". Our parent is alive at this point — we + // re-check `getppid()` immediately after registration to close the + // race where the parent dies between the read above and the + // `add_filter` call. The BSD kqueue implementation accepts any + // live PID, not just our own children; the "child" wording in the + // polling docs is carried over from historical terminology in the + // kqueue(2) manpage. The kernel guarantees NOTE_EXIT fires if the + // PID is valid at registration. + let poller = Poller::new().map_err(|err| format!("polling: Poller::new failed: {err}"))?; + let key = 1; + #[allow(unsafe_code)] + // SAFETY requirement is documented on the enclosing function: the + // PID was just read from `getppid()` and re-checked below, so it + // points at a live process. 
`Process::from_pid` is an + // entry-in-the-kernel-table registration — the kernel validates + // the PID when the filter is added. + let filter = unsafe { Process::from_pid(parent_pid_nz, ProcessOps::Exit) }; + poller + .add_filter(filter, key, PollMode::Oneshot) + .map_err(|err| format!("polling: add_filter(NOTE_EXIT, {parent_pid_nz}) failed: {err}"))?; + + // Between getppid() and the registered filter the parent may + // already have died. Detect that and abort so the caller can bail. + if getppid() != parent_pid { + return Err("parent exited before procguard could arm".to_string()); + } + + // Hand off to a dedicated OS thread. Block in `poller.wait()` + // until the single NOTE_EXIT event fires, run the cleanup, then + // exit. We prefer `exit(1)` over `kill(getpid, SIGKILL)` so the + // callback gets to complete — SIGKILL would race it. Our children + // have their own procguards armed and will notice `getppid() == + // 1` shortly after, so we do not need Linux-semantics exactness. + std::thread::Builder::new() + .name("procguard".to_string()) + .spawn(move || { + let mut events = Events::new(); + // Block indefinitely; the filter is Oneshot so we expect + // exactly one event (parent's NOTE_EXIT) or a spurious + // wakeup we treat the same way. 
+ let _ = poller.wait(&mut events, None); + let _ = std::panic::catch_unwind(std::panic::AssertUnwindSafe(cleanup)); + std::process::exit(1); + }) + .map(|_| ()) + .map_err(|e| format!("failed to spawn procguard thread: {e}")) +} diff --git a/crates/openshell-driver-vm/src/runtime.rs b/crates/openshell-driver-vm/src/runtime.rs index ae6e4c183..e20c7d4e5 100644 --- a/crates/openshell-driver-vm/src/runtime.rs +++ b/crates/openshell-driver-vm/src/runtime.rs @@ -10,12 +10,19 @@ use std::ptr; use std::sync::atomic::{AtomicI32, Ordering}; use std::time::{Duration, Instant}; -use crate::{embedded_runtime, ffi}; +use crate::{embedded_runtime, ffi, procguard}; pub const VM_RUNTIME_DIR_ENV: &str = "OPENSHELL_VM_RUNTIME_DIR"; +/// PID of the forked libkrun worker (the VM's PID 1). Zero when not running. +/// Used by the SIGTERM/SIGINT handler to forward signals to the VM. static CHILD_PID: AtomicI32 = AtomicI32::new(0); +/// PID of the gvproxy helper process. Zero when not running. Used by the +/// SIGTERM/SIGINT handler to make sure gvproxy doesn't survive the +/// launcher on macOS (where we can't use `PR_SET_PDEATHSIG`). +static GVPROXY_PID: AtomicI32 = AtomicI32::new(0); + pub struct VmLaunchConfig { pub rootfs: PathBuf, pub vcpus: u8, @@ -36,6 +43,47 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { )); } + // Arm procguard first, BEFORE we spawn gvproxy or fork libkrun, so + // that the launcher can't be orphaned during setup. The cleanup + // callback reads the GVPROXY_PID atomic (initially 0 — no-op) and + // the CHILD_PID atomic (the libkrun fork), so it stays correct as + // those slots get populated later in this function. Only ONE arm + // per process: racing two watchers for the same NOTE_EXIT event + // would cause whichever wins to skip the cleanup. 
+ if let Err(err) = procguard::die_with_parent_cleanup(|| { + // Cleanup order: SIGTERM gvproxy and the libkrun fork first so + // they can drain cleanly, then SIGKILL after a brief grace + // window. We can't rely on Rust destructors here; when + // procguard's watcher thread returns we call `std::process::exit` + // and the process tears down. This closure runs on that ordinary + // watcher thread (not a signal handler), so atomic loads, `kill(2)` and a short sleep are all safe here. + let gv_pid = GVPROXY_PID.load(Ordering::Relaxed); + let child_pid = CHILD_PID.load(Ordering::Relaxed); + if gv_pid > 0 { + unsafe { + libc::kill(gv_pid, libc::SIGTERM); + } + } + if child_pid > 0 { + unsafe { + libc::kill(child_pid, libc::SIGTERM); + } + } + std::thread::sleep(Duration::from_millis(200)); + if gv_pid > 0 { + unsafe { + libc::kill(gv_pid, libc::SIGKILL); + } + } + if child_pid > 0 { + unsafe { + libc::kill(child_pid, libc::SIGKILL); + } + } + }) { + return Err(format!("procguard arm failed: {err}")); + } + #[cfg(target_os = "linux")] check_kvm_access()?; @@ -118,16 +166,43 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { // surfacing a misleading "sshd is reachable" endpoint. See // https://github.com/containers/gvisor-tap-vsock `cmd/gvproxy/main.go` // (`getForwardsMap` returns an empty map when `sshPort == -1`). - let child = StdCommand::new(&gvproxy_binary) + let mut gvproxy_cmd = StdCommand::new(&gvproxy_binary); + gvproxy_cmd .arg(gvproxy_net_flag) .arg(&gvproxy_net_url) .arg("-ssh-port") .arg("-1") .stdin(Stdio::null()) .stdout(Stdio::null()) - .stderr(gvproxy_log_file) + .stderr(gvproxy_log_file); + + // On Linux the kernel will SIGKILL gvproxy the moment this + // launcher dies (or is SIGKILLed). `pre_exec` runs in the child + // between fork and execve, so the PR_SET_PDEATHSIG flag is + // inherited across execve and applies to gvproxy proper. 
On + // macOS/BSDs there is no equivalent; we fall back to killing + // gvproxy explicitly from the launcher's procguard cleanup + // callback (see `run_vm` above) and SIGTERM handler + // (see `install_signal_forwarding` below). + #[cfg(target_os = "linux")] + { + use nix::sys::signal::Signal; + use std::os::unix::process::CommandExt as _; + unsafe { + gvproxy_cmd.pre_exec(|| { + nix::sys::prctl::set_pdeathsig(Signal::SIGKILL) + .map_err(|err| std::io::Error::other(format!("pdeathsig: {err}"))) + }); + } + } + + let child = gvproxy_cmd .spawn() .map_err(|e| format!("failed to start gvproxy {}: {e}", gvproxy_binary.display()))?; + // The procguard cleanup reads GVPROXY_PID atomically. Storing it + // here makes the callback able to SIGTERM gvproxy if the driver + // dies from this moment onward. + GVPROXY_PID.store(child.id() as i32, Ordering::Relaxed); wait_for_path(&net_sock, Duration::from_secs(5), "gvproxy data socket")?; @@ -176,6 +251,20 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { match pid { -1 => Err(format!("fork failed: {}", std::io::Error::last_os_error())), 0 => { + // We are the libkrun worker (the VM's PID 1 inside the guest + // kernel, but a normal host process until krun_start_enter + // fires). Arm procguard so this fork is SIGKILLed if the + // parent launcher dies abruptly. On Linux this uses + // `PR_SET_PDEATHSIG`; on macOS this spawns a kqueue + // NOTE_EXIT watcher thread. Either way it closes the same + // leak gvproxy does above. + // + // If arming fails we bail out with exit(1) — there's no + // safe way to continue if we can't guarantee cleanup. 
+ if let Err(err) = procguard::die_with_parent() { + eprintln!("libkrun worker: procguard arm failed: {err}"); + std::process::exit(1); + } let ret = vm.start_enter(); eprintln!("krun_start_enter failed: {ret}"); std::process::exit(1); @@ -186,6 +275,7 @@ pub fn run_vm(config: &VmLaunchConfig) -> Result<(), String> { let status = wait_for_child(pid)?; CHILD_PID.store(0, Ordering::Relaxed); cleanup_gvproxy(gvproxy_guard); + GVPROXY_PID.store(0, Ordering::Relaxed); if libc::WIFEXITED(status) { match libc::WEXITSTATUS(status) { @@ -554,11 +644,28 @@ fn install_signal_forwarding(pid: i32) { CHILD_PID.store(pid, Ordering::Relaxed); } +/// Async-signal-safe handler that forwards SIGTERM to every process we +/// own: the libkrun VM worker and the gvproxy helper. We cannot rely on +/// Rust destructors (`GvproxyGuard::drop`, `ManagedDriverProcess::drop`) +/// running on signal-driven exit, so we explicitly deliver the signal +/// here. The `wait_for_child` loop reaps libkrun and `cleanup_gvproxy` +/// reaps gvproxy before `run_vm` returns. +/// +/// Only async-signal-safe libc calls are used — `kill(2)` is listed in +/// POSIX.1-2017 as async-signal-safe, atomic loads are lock-free on the +/// platforms we target. extern "C" fn forward_signal(_sig: libc::c_int) { - let pid = CHILD_PID.load(Ordering::Relaxed); - if pid > 0 { + let vm_pid = CHILD_PID.load(Ordering::Relaxed); + if vm_pid > 0 { + unsafe { + libc::kill(vm_pid, libc::SIGTERM); + } + } + let gv_pid = GVPROXY_PID.load(Ordering::Relaxed); + if gv_pid > 0 { + // gvproxy handles SIGTERM cleanly; no need for SIGKILL. unsafe { - libc::kill(pid, libc::SIGTERM); + libc::kill(gv_pid, libc::SIGTERM); } } } diff --git a/crates/openshell-driver-vm/start.sh b/crates/openshell-driver-vm/start.sh index 155136c78..b5aebbefd 100755 --- a/crates/openshell-driver-vm/start.sh +++ b/crates/openshell-driver-vm/start.sh @@ -5,12 +5,26 @@ set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd)" +CLI_BIN="${ROOT}/scripts/bin/openshell" COMPRESSED_DIR="${ROOT}/target/vm-runtime-compressed" -STATE_DIR_DEFAULT="${ROOT}/target/openshell-vm-driver-dev" +SERVER_PORT="${OPENSHELL_SERVER_PORT:-8080}" +# Keep the driver socket path under AF_UNIX SUN_LEN on macOS. +STATE_DIR_ROOT="${OPENSHELL_VM_DRIVER_STATE_ROOT:-/tmp}" +STATE_LABEL_RAW="${OPENSHELL_VM_INSTANCE:-port-${SERVER_PORT}}" +STATE_LABEL="$(printf '%s' "${STATE_LABEL_RAW}" | tr -cs '[:alnum:]._-' '-')" +if [ -z "${STATE_LABEL}" ]; then + STATE_LABEL="port-${SERVER_PORT}" +fi +STATE_DIR_DEFAULT="${STATE_DIR_ROOT}/openshell-vm-driver-dev-${USER:-user}-${STATE_LABEL}" STATE_DIR="${OPENSHELL_VM_DRIVER_STATE_DIR:-${STATE_DIR_DEFAULT}}" DB_PATH_DEFAULT="${STATE_DIR}/openshell.db" -SERVER_PORT="${OPENSHELL_SERVER_PORT:-8080}" VM_HOST_GATEWAY_DEFAULT="${OPENSHELL_VM_HOST_GATEWAY:-host.containers.internal}" +LOCAL_GATEWAY_ENDPOINT_DEFAULT="http://127.0.0.1:${SERVER_PORT}" +LOCAL_GATEWAY_ENDPOINT="${OPENSHELL_VM_LOCAL_GATEWAY_ENDPOINT:-${LOCAL_GATEWAY_ENDPOINT_DEFAULT}}" +GATEWAY_NAME_DEFAULT="vm-driver-${STATE_LABEL}" +GATEWAY_NAME="${OPENSHELL_VM_GATEWAY_NAME:-${GATEWAY_NAME_DEFAULT}}" +DRIVER_DIR_DEFAULT="${ROOT}/target/debug" +DRIVER_DIR="${OPENSHELL_DRIVER_DIR:-${DRIVER_DIR_DEFAULT}}" export OPENSHELL_VM_RUNTIME_COMPRESSED_DIR="${OPENSHELL_VM_RUNTIME_COMPRESSED_DIR:-${COMPRESSED_DIR}}" @@ -52,11 +66,19 @@ fi export OPENSHELL_DISABLE_TLS="$(normalize_bool "${OPENSHELL_DISABLE_TLS:-true}")" export OPENSHELL_DB_URL="${OPENSHELL_DB_URL:-sqlite:${DB_PATH_DEFAULT}}" export OPENSHELL_DRIVERS="${OPENSHELL_DRIVERS:-vm}" +export OPENSHELL_DRIVER_DIR="${DRIVER_DIR}" export OPENSHELL_GRPC_ENDPOINT="${OPENSHELL_GRPC_ENDPOINT:-http://${VM_HOST_GATEWAY_DEFAULT}:${SERVER_PORT}}" export OPENSHELL_SSH_GATEWAY_HOST="${OPENSHELL_SSH_GATEWAY_HOST:-127.0.0.1}" export OPENSHELL_SSH_GATEWAY_PORT="${OPENSHELL_SSH_GATEWAY_PORT:-${SERVER_PORT}}" export 
OPENSHELL_SSH_HANDSHAKE_SECRET="${OPENSHELL_SSH_HANDSHAKE_SECRET:-dev-vm-driver-secret}" export OPENSHELL_VM_DRIVER_STATE_DIR="${STATE_DIR}" +echo "==> Gateway registration" +echo " Name: ${GATEWAY_NAME}" +echo " Endpoint: ${LOCAL_GATEWAY_ENDPOINT}" +echo " Register: ${CLI_BIN} gateway add --name ${GATEWAY_NAME} ${LOCAL_GATEWAY_ENDPOINT}" +echo " Select: ${CLI_BIN} gateway select ${GATEWAY_NAME}" +echo " Driver: ${OPENSHELL_DRIVER_DIR}/openshell-driver-vm" + echo "==> Starting OpenShell server with VM compute driver" exec "${ROOT}/target/debug/openshell-gateway" diff --git a/tasks/scripts/vm/smoke-orphan-cleanup.sh b/tasks/scripts/vm/smoke-orphan-cleanup.sh new file mode 100755 index 000000000..9a37861a0 --- /dev/null +++ b/tasks/scripts/vm/smoke-orphan-cleanup.sh @@ -0,0 +1,204 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Smoke test: start the gateway with the VM driver, create a sandbox, then +# signal the gateway (SIGTERM then SIGKILL) and verify that no driver, +# launcher, gvproxy, or libkrun worker processes survive. +# +# Exit codes: +# 0 — both SIGTERM and SIGKILL cleanup passed +# 1 — one or more scenarios leaked survivors + +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)" +cd "$ROOT" + +PORT="${OPENSHELL_SERVER_PORT:-8091}" +XDG="${TMPDIR:-/tmp}/vm-orphan-xdg-$$" +STATE_DIR="${TMPDIR:-/tmp}/openshell-vm-orphan-$$" +LOG="${TMPDIR:-/tmp}/vm-orphan-$$.log" + +cleanup_stray() { + # Best-effort: kill anything left over from our sandbox ids so repeated + # runs don't accumulate. + pkill -9 -f "openshell-vm-orphan-$$" 2>/dev/null || true + rm -rf "$XDG" "$STATE_DIR" 2>/dev/null || true + # Preserve the gateway log only on failure so operators can diagnose. 
+ if [ "${EXIT_CODE:-0}" -ne 0 ]; then + echo "(log preserved at $LOG)" >&2 + else + rm -f "$LOG" "$LOG.create" 2>/dev/null || true + fi +} +trap cleanup_stray EXIT + +build_binaries() { + echo "==> Ensuring binaries are built" + if [ ! -x "$ROOT/target/debug/openshell-gateway" ] || [ ! -x "$ROOT/target/debug/openshell-driver-vm" ]; then + cargo build -p openshell-server -p openshell-driver-vm >&2 + fi + if [ "$(uname -s)" = "Darwin" ]; then + codesign \ + --entitlements "$ROOT/crates/openshell-driver-vm/entitlements.plist" \ + --force -s - \ + "$ROOT/target/debug/openshell-driver-vm" >/dev/null 2>&1 || true + fi +} + +start_gateway() { + local health_port=$((PORT + 1)) + echo "==> Starting gateway on port $PORT (state=$STATE_DIR, health=$health_port)" + mkdir -p "$STATE_DIR" + OPENSHELL_SERVER_PORT="$PORT" \ + OPENSHELL_HEALTH_PORT="$health_port" \ + OPENSHELL_DB_URL="sqlite:$STATE_DIR/openshell.db" \ + OPENSHELL_DRIVERS=vm \ + OPENSHELL_DRIVER_DIR="$ROOT/target/debug" \ + OPENSHELL_GRPC_ENDPOINT="http://host.containers.internal:$PORT" \ + OPENSHELL_SSH_GATEWAY_HOST=127.0.0.1 \ + OPENSHELL_SSH_GATEWAY_PORT="$PORT" \ + OPENSHELL_SSH_HANDSHAKE_SECRET=dev-vm-driver-secret \ + OPENSHELL_VM_DRIVER_STATE_DIR="$STATE_DIR" \ + OPENSHELL_VM_RUNTIME_COMPRESSED_DIR="$ROOT/target/vm-runtime-compressed" \ + nohup "$ROOT/target/debug/openshell-gateway" --disable-tls \ + > "$LOG" 2>&1 & + GATEWAY_PID=$! + echo "gateway pid=$GATEWAY_PID" + + for _ in $(seq 1 60); do + if grep -q "Server listening" "$LOG" 2>/dev/null; then + return 0 + fi + if ! kill -0 "$GATEWAY_PID" 2>/dev/null; then + echo "!! gateway died before ready" + tail -40 "$LOG" >&2 + return 1 + fi + sleep 1 + done + echo "!! 
gateway never reported ready" + tail -40 "$LOG" >&2 + return 1 +} + +create_sandbox() { + echo "==> Creating sandbox (--keep, long-running)" + mkdir -p "$XDG" + XDG_CONFIG_HOME="$XDG" "$ROOT/scripts/bin/openshell" gateway add \ + --name vm-orphan http://127.0.0.1:"$PORT" >/dev/null + XDG_CONFIG_HOME="$XDG" "$ROOT/scripts/bin/openshell" gateway select vm-orphan >/dev/null + + # Run the CLI in the background; it blocks waiting for sleep to finish. + XDG_CONFIG_HOME="$XDG" "$ROOT/scripts/bin/openshell" sandbox create \ + --name "orphan-$$" --keep -- sleep 99999 \ + > "$LOG.create" 2>&1 & + CLI_PID=$! + + for _ in $(seq 1 60); do + if pgrep -f "openshell-vm-orphan-$$|$STATE_DIR/sandboxes/" >/dev/null 2>&1; then + if pgrep -f gvproxy >/dev/null 2>&1; then + echo "sandbox came up (cli pid=$CLI_PID)" + return 0 + fi + fi + sleep 2 + done + echo "!! sandbox never came up" + tail -40 "$LOG" "$LOG.create" >&2 2>/dev/null || true + return 1 +} + +snapshot_kids() { + # Return all PIDs whose --state-dir or --vm-rootfs references our + # per-run directory, plus any gvproxy that mentions our socket base. 
+ pgrep -fl "state-dir $STATE_DIR|$STATE_DIR/sandboxes" 2>/dev/null || true + pgrep -fl "gvproxy" 2>/dev/null | grep "osd-gv" || true +} + +count_alive() { + local alive + alive=$(pgrep -f "state-dir $STATE_DIR|$STATE_DIR/sandboxes" 2>/dev/null | wc -l | tr -d ' ') + local gv + gv=$(pgrep -f 'gvproxy' 2>/dev/null | xargs -r ps -o pid=,command= -p 2>/dev/null | grep -c 'osd-gv' || true) + echo $((alive + gv)) +} + +verify_cleanup() { + local label="$1" + local deadline="$2" + local waited=0 + while [ "$waited" -lt "$deadline" ]; do + local n + n=$(count_alive) + if [ "$n" = "0" ]; then + echo " PASS ($label): all descendants gone after ${waited}s" + return 0 + fi + sleep 1 + waited=$((waited + 1)) + done + echo " FAIL ($label): $(count_alive) descendants still alive after ${deadline}s:" + snapshot_kids | sed 's/^/ /' + return 1 +} + +run_scenario() { + local signal="$1" + local label="$2" + echo "======================================================" + echo "Scenario: $label (signal $signal)" + echo "======================================================" + + start_gateway || return 1 + create_sandbox || { kill -9 "$GATEWAY_PID" 2>/dev/null; return 1; } + + echo "-- process tree before signal --" + snapshot_kids | sed 's/^/ /' + echo + + echo "-> kill -$signal $GATEWAY_PID" + kill "-$signal" "$GATEWAY_PID" 2>/dev/null || true + + verify_cleanup "$label" 15 + local rc=$? + + # Belt-and-braces teardown between scenarios. + pkill -9 -f "$STATE_DIR/sandboxes|$STATE_DIR " 2>/dev/null || true + pkill -9 -f 'gvproxy.*osd-gv' 2>/dev/null || true + rm -rf "$STATE_DIR" /tmp/osd-gv "$XDG" 2>/dev/null || true + # CLI may still be running; reap it. + kill "${CLI_PID:-0}" 2>/dev/null || true + sleep 1 + + return $rc +} + +main() { + build_binaries + local overall=0 + + # Clean starting state. + pkill -9 -f 'openshell-gateway|openshell-driver-vm' 2>/dev/null || true + pkill -9 -f 'gvproxy.*osd-gv' 2>/dev/null || true + sleep 1 + + if ! 
run_scenario TERM "graceful SIGTERM"; then + overall=1 + fi + + if ! run_scenario KILL "abrupt SIGKILL"; then + overall=1 + fi + + if [ "$overall" -eq 0 ]; then + echo "ALL SCENARIOS PASSED" + else + echo "ONE OR MORE SCENARIOS FAILED" + fi + EXIT_CODE=$overall + return $overall +} + +main "$@" diff --git a/tasks/scripts/vm/vm-setup.sh b/tasks/scripts/vm/vm-setup.sh index e7ae06d08..bccb7f754 100755 --- a/tasks/scripts/vm/vm-setup.sh +++ b/tasks/scripts/vm/vm-setup.sh @@ -128,4 +128,4 @@ echo " Compressed artifacts in: ${OUTPUT_DIR}" echo "" echo "Next steps:" echo " mise run vm:rootfs --base # build rootfs (requires Docker)" -echo " mise run vm # build and run the VM" +echo " mise run gateway:vm # start openshell-gateway with the VM driver" diff --git a/tasks/vm.toml b/tasks/vm.toml index ca06b08c1..0a44b4ff7 100644 --- a/tasks/vm.toml +++ b/tasks/vm.toml @@ -5,22 +5,25 @@ # # Workflow: # mise run vm:setup # one-time: download pre-built runtime (~30s) -# mise run vm # build + run the VM +# mise run gateway:vm # start openshell-gateway with the VM driver +# mise run vm # build + run the standalone openshell-vm microVM # mise run vm:clean # wipe everything and start over # -# See crates/openshell-vm/README.md for full documentation. +# See crates/openshell-driver-vm/README.md for the `gateway:vm` flow and +# crates/openshell-vm/README.md for the standalone microVM path. 
# ═══════════════════════════════════════════════════════════════════════════ # Main Commands # ═══════════════════════════════════════════════════════════════════════════ +["gateway:vm"] +description = "Build openshell-gateway + openshell-driver-vm and start the gateway with the VM driver" +run = "crates/openshell-driver-vm/start.sh" + [vm] -description = "Build and run the openshell-vm microVM" +description = "Build and run the standalone openshell-vm microVM" depends = ["build:docker:gateway"] -run = [ - "mise run vm:build", - "tasks/scripts/vm/run-vm.sh", -] +run = ["mise run vm:build", "tasks/scripts/vm/run-vm.sh"] ["vm:build"] description = "Build the openshell-vm binary with embedded runtime" @@ -42,3 +45,7 @@ run = "tasks/scripts/vm/build-rootfs-tarball.sh" ["vm:clean"] description = "Remove all VM cached artifacts (runtime, rootfs, builds)" run = "tasks/scripts/vm/vm-clean.sh" + +["vm:smoke:orphan-cleanup"] +description = "Smoke test: start gateway+driver, create a sandbox, signal the gateway, assert no orphaned processes survive" +run = "tasks/scripts/vm/smoke-orphan-cleanup.sh"