maxbeech · maxbeech · May 28, 2026 · May 26, 2026 · May 26, 2026
diff --git a/src/App.tsx b/src/App.tsx
@@ -219,6 +219,7 @@ export default function App() {
       // Send run-completion notification for terminal statuses
       const terminalStatuses: RunStatus[] = [
         "succeeded",
+        "partially_succeeded",
         "failed",
         "permanent_failure",
       ];

diff --git a/src/components/content/dashboard-system-section.tsx b/src/components/content/dashboard-system-section.tsx
@@ -112,7 +112,7 @@ function RecentRunRow({ run, jobs, projects, onSelect, onRetry, onNewRun }: {
   const jobName = run.jobName ?? job?.name ?? "Unknown Job";
   const project = job ? projects.find((p) => p.id === job.projectId) : null;
   const isFailed = run.status === "failed" || run.status === "permanent_failure";
-  const isTerminal = ["succeeded", "failed", "permanent_failure", "cancelled"].includes(run.status);
+  const isTerminal = ["succeeded", "partially_succeeded", "failed", "permanent_failure", "cancelled"].includes(run.status);
   // Named group `group/run` (not bare `group`): the embedded-screen wrapper in
   // screen-embed-card.tsx is itself `.group`, so bare `group-hover:` utilities
   // here would also fire when hovering anywhere on the embedded dashboard.

diff --git a/src/components/content/run-detail-view.tsx b/src/components/content/run-detail-view.tsx
@@ -70,6 +70,7 @@ export function RunDetailView({ runId }: RunDetailViewProps) {
   const isCancellable = run.status === "running" || run.status === "queued";
   const isTerminal = [
     "succeeded",
+    "partially_succeeded",
     "failed",
     "permanent_failure",
     "cancelled",

diff --git a/supabase/migrations/20260526120000_runs_partial_status.sql b/supabase/migrations/20260526120000_runs_partial_status.sql
@@ -0,0 +1,6 @@
+-- Add partially_succeeded to the runs status check constraint. The TypeScript
+-- type and executor already use this value; the initial schema constraint was
+-- missing it. DROP ... IF EXISTS tolerates a missing constraint before the re-add.
+ALTER TABLE runs DROP CONSTRAINT IF EXISTS runs_status_check;
+ALTER TABLE runs ADD CONSTRAINT runs_status_check
+  CHECK (status IN ('deferred','queued','running','succeeded','partially_succeeded','failed','permanent_failure','cancelled'));
diff --git a/worker/src/executor-sandbox-prep.ts b/worker/src/executor-sandbox-prep.ts
@@ -35,6 +35,33 @@ export function buildBrowserExtensionArg(profiles: HydratedProfile[] = []): stri
   return `${envPrefix}${BROWSER_MCP_WRAPPER_PATH}`;
 }
 
+/**
+ * Block until the sandbox answers a trivial `echo ready`. Without this gate,
+ * commands issued right after Sandbox.create() on a cold template (boot can
+ * take 15-25 s) fire deadline_exceeded errors that cascade into run failures.
+ * One 60 s command timeout is used rather than a polling loop.
+ *
+ * @param sandbox - Sandbox to probe.
+ * @param log - Receives the "waiting" line and, on success, the "ready" line.
+ * @throws Error on ANY probe failure, not only a timeout: the message always
+ *   cites the 60 s limit, so read its trailing `Original:` text for the cause.
+ */
+export async function waitForSandboxReady(
+  sandbox: InstanceType<typeof Sandbox>,
+  log: (line: string) => void,
+): Promise<void> {
+  log("[openhelm] waiting for sandbox to become ready…");
+  try {
+    await sandbox.commands.run("echo ready", { timeoutMs: 60_000 });
+    log("[openhelm] sandbox ready");
+  } catch (err) {
+    throw new Error(
+      `sandbox did not become ready within 60 s — E2B cold-start may have exceeded the limit. ` +
+      `Original: ${err instanceof Error ? err.message : String(err)}`,
+    );
+  }
+}
+
 export async function prepareWorkspace(
   sandbox: InstanceType<typeof Sandbox>,
   gitUrl: string | null | undefined,
@@ -58,7 +85,9 @@ export async function prepareWorkspace(
     }
   } else {
     log(`[openhelm] no preconfigured repo; sandbox is empty`);
-    await sandbox.commands.run("mkdir -p /tmp/workspace", { timeoutMs: 5_000 });
+    // Use a generous timeout — on cold template boots the sandbox may still be
+    // settling after the readiness gate, so 5 s was too tight.
+    await sandbox.commands.run("mkdir -p /tmp/workspace", { timeoutMs: 30_000 });
   }
   return willClone;
 }

diff --git a/worker/src/executor.ts b/worker/src/executor.ts
@@ -23,6 +23,7 @@ import {
   setupMcpBrowser,
   setupNotifyMcp,
   hydrateAuthAndPreparePrompt,
+  waitForSandboxReady,
 } from "./executor-sandbox-prep.js";
 import { executeGoose, type GooseResult } from "./executor-goose.js";
 import { DEFAULT_INITIAL_MAX_TURNS } from "./executor-goose-types.js";
@@ -457,6 +458,11 @@ export async function executeRun(runId: string): Promise<void> {
     });
     activeSandboxes.set(runId, sandbox);
 
+    // Gate on sandbox readiness before issuing any commands. Cold E2B template
+    // boots can take 15-25 s; firing commands before the sandbox is ready
+    // produces deadline_exceeded errors that cascade into fatal run failures.
+    await waitForSandboxReady(sandbox, (line) => relay.onStderr(line));
+
     // Pre-start Xvfb before any browser MCP call. Without this, Chromium
     // inside the sandbox hangs for the full per-tool-call timeout (60-120s)
     // trying to connect to a nonexistent X server, which manifests as
@@ -961,7 +967,7 @@ async function startXvfb(
   try {
     await sandbox.commands.run(
       `bash -c 'Xvfb :0 -screen 0 1280x900x24 -ac +extension RANDR > /tmp/xvfb.log 2>&1 &'`,
-      { timeoutMs: 10_000, background: true },
+      { timeoutMs: 30_000, background: true },
     );
     const ready = await sandbox.commands.run(
       `bash -c 'for i in $(seq 1 40); do xdpyinfo -display :0 >/dev/null 2>&1 && exit 0; sleep 0.5; done; exit 1'`,

diff --git a/worker/src/run-verifier.ts b/worker/src/run-verifier.ts
@@ -98,6 +98,14 @@ recipient list AND any attachments mentioned; "create a research doc" implies th
 its citations. A run that produced SOME deliverables but skipped others is at best a
 partial success — never a full accomplishment.
 
+HOUSEKEEPING-STEPS EXCLUSION (prevents false partials): Agent-initiated housekeeping steps
+that are NOT stated or clearly implied by the job prompt are NOT deliverables. Examples of
+steps that do NOT count as deliverables: logging completion to an internal tracking table,
+updating a metrics dashboard, recording a run in an observability system, sending a
+self-notification. If the agent attempted such a step as a bonus and it failed (e.g. a CLI
+tool was missing, a data table didn't exist), do NOT count it as a missed deliverable —
+ignore it entirely when scoring the run.
+
 CONNECTED-TOOL-COVERAGE SUB-RULE: If a connected MCP server or CLI is clearly relevant
 to one of the implied deliverables (e.g. an Unsplash MCP when the task involves images,
 a Gmail MCP when the task involves email, a Notion MCP when the task involves docs) AND