diff --git a/src/App.tsx b/src/App.tsx index d38f433..2f02ba4 100644 --- a/src/App.tsx +++ b/src/App.tsx @@ -219,6 +219,7 @@ export default function App() { // Send run-completion notification for terminal statuses const terminalStatuses: RunStatus[] = [ "succeeded", + "partially_succeeded", "failed", "permanent_failure", ]; diff --git a/src/components/content/dashboard-system-section.tsx b/src/components/content/dashboard-system-section.tsx index 9ce651a..d9211d0 100644 --- a/src/components/content/dashboard-system-section.tsx +++ b/src/components/content/dashboard-system-section.tsx @@ -112,7 +112,7 @@ function RecentRunRow({ run, jobs, projects, onSelect, onRetry, onNewRun }: { const jobName = run.jobName ?? job?.name ?? "Unknown Job"; const project = job ? projects.find((p) => p.id === job.projectId) : null; const isFailed = run.status === "failed" || run.status === "permanent_failure"; - const isTerminal = ["succeeded", "failed", "permanent_failure", "cancelled"].includes(run.status); + const isTerminal = ["succeeded", "partially_succeeded", "failed", "permanent_failure", "cancelled"].includes(run.status); // Named group `group/run` (not bare `group`): the embedded-screen wrapper in // screen-embed-card.tsx is itself `.group`, so bare `group-hover:` utilities // here would also fire when hovering anywhere on the embedded dashboard. 
diff --git a/src/components/content/run-detail-view.tsx b/src/components/content/run-detail-view.tsx index 08593f9..9da396e 100644 --- a/src/components/content/run-detail-view.tsx +++ b/src/components/content/run-detail-view.tsx @@ -70,6 +70,7 @@ export function RunDetailView({ runId }: RunDetailViewProps) { const isCancellable = run.status === "running" || run.status === "queued"; const isTerminal = [ "succeeded", + "partially_succeeded", "failed", "permanent_failure", "cancelled", diff --git a/supabase/migrations/20260526120000_runs_partial_status.sql b/supabase/migrations/20260526120000_runs_partial_status.sql new file mode 100644 index 0000000..9b99ce0 --- /dev/null +++ b/supabase/migrations/20260526120000_runs_partial_status.sql @@ -0,0 +1,6 @@ +-- Add partially_succeeded to the runs status check constraint. +-- The TypeScript type and executor already use this value; the initial schema +-- constraint was missing it, leaving the schema out of sync with reality. +ALTER TABLE runs DROP CONSTRAINT IF EXISTS runs_status_check; +ALTER TABLE runs ADD CONSTRAINT runs_status_check + CHECK (status IN ('deferred','queued','running','succeeded','partially_succeeded','failed','permanent_failure','cancelled')); diff --git a/worker/src/executor-sandbox-prep.ts b/worker/src/executor-sandbox-prep.ts index 9a62fcd..5f8076c 100644 --- a/worker/src/executor-sandbox-prep.ts +++ b/worker/src/executor-sandbox-prep.ts @@ -35,6 +35,33 @@ export function buildBrowserExtensionArg(profiles: HydratedProfile[] = []): stri return `${envPrefix}${BROWSER_MCP_WRAPPER_PATH}`; } +/** + * Block until the sandbox can respond to a trivial command, or throw if it + * doesn't become ready within ~60 s. Without this gate, commands issued + * immediately after Sandbox.create() on a cold template (boot can take 15-25 s) + * fire deadline_exceeded errors that cascade into fatal run failures. 
+ * + * A single 60 s command timeout is used rather than a polling loop: if E2B + * infrastructure is responding but slow, waiting ~60 s for a simple echo is + * the right behaviour. If the sandbox is genuinely broken the SDK throws and + * we surface a clear error. The probe is `echo ready` run with timeoutMs 60_000; `log` receives one line before the probe and one after it succeeds. NOTE(review): the catch is unconditional, so every probe failure (not only a timeout) is rethrown as a new Error carrying the "did not become ready within 60 s" wording plus the original error's message. + */ +export async function waitForSandboxReady( + sandbox: InstanceType, + log: (line: string) => void, +): Promise { + log("[openhelm] waiting for sandbox to become ready…"); + try { + await sandbox.commands.run("echo ready", { timeoutMs: 60_000 }); + log("[openhelm] sandbox ready"); + } catch (err) { + throw new Error( + `sandbox did not become ready within 60 s — E2B cold-start may have exceeded the limit. ` + + `Original: ${err instanceof Error ? err.message : String(err)}`, + ); + } +} + export async function prepareWorkspace( sandbox: InstanceType, gitUrl: string | null | undefined, @@ -58,7 +85,9 @@ export async function prepareWorkspace( } } else { log(`[openhelm] no preconfigured repo; sandbox is empty`); - await sandbox.commands.run("mkdir -p /tmp/workspace", { timeoutMs: 5_000 }); + // Use a generous timeout — on cold template boots the sandbox may still be + // settling after the readiness gate, so 5 s was too tight. + await sandbox.commands.run("mkdir -p /tmp/workspace", { timeoutMs: 30_000 }); } return willClone; } diff --git a/worker/src/executor.ts b/worker/src/executor.ts index 3c63b89..9bc55c5 100644 --- a/worker/src/executor.ts +++ b/worker/src/executor.ts @@ -23,6 +23,7 @@ import { setupMcpBrowser, setupNotifyMcp, hydrateAuthAndPreparePrompt, + waitForSandboxReady, } from "./executor-sandbox-prep.js"; import { executeGoose, type GooseResult } from "./executor-goose.js"; import { DEFAULT_INITIAL_MAX_TURNS } from "./executor-goose-types.js"; @@ -457,6 +458,11 @@ export async function executeRun(runId: string): Promise { }); activeSandboxes.set(runId, sandbox); + // Gate on sandbox readiness before issuing any commands. 
Cold E2B template + // boots can take 15-25 s; firing commands before the sandbox is ready + // produces deadline_exceeded errors that cascade into fatal run failures. + await waitForSandboxReady(sandbox, (line) => relay.onStderr(line)); + // Pre-start Xvfb before any browser MCP call. Without this, Chromium // inside the sandbox hangs for the full per-tool-call timeout (60-120s) // trying to connect to a nonexistent X server, which manifests as @@ -961,7 +967,7 @@ async function startXvfb( try { await sandbox.commands.run( `bash -c 'Xvfb :0 -screen 0 1280x900x24 -ac +extension RANDR > /tmp/xvfb.log 2>&1 &'`, - { timeoutMs: 10_000, background: true }, + { timeoutMs: 30_000, background: true }, ); const ready = await sandbox.commands.run( `bash -c 'for i in $(seq 1 40); do xdpyinfo -display :0 >/dev/null 2>&1 && exit 0; sleep 0.5; done; exit 1'`, diff --git a/worker/src/run-verifier.ts b/worker/src/run-verifier.ts index b365bae..917eefb 100644 --- a/worker/src/run-verifier.ts +++ b/worker/src/run-verifier.ts @@ -98,6 +98,14 @@ recipient list AND any attachments mentioned; "create a research doc" implies th its citations. A run that produced SOME deliverables but skipped others is at best a partial success — never a full accomplishment. +HOUSEKEEPING-STEPS EXCLUSION (prevents false partials): Agent-initiated housekeeping steps +that are NOT stated or clearly implied by the job prompt are NOT deliverables. Examples of +steps that do NOT count as deliverables: logging completion to an internal tracking table, +updating a metrics dashboard, recording a run in an observability system, sending a +self-notification. If the agent attempted such a step as a bonus and it failed (e.g. a CLI +tool was missing, a data table didn't exist), do NOT count it as a missed deliverable — +ignore it entirely when scoring the run. + CONNECTED-TOOL-COVERAGE SUB-RULE: If a connected MCP server or CLI is clearly relevant to one of the implied deliverables (e.g. 
an Unsplash MCP when the task involves images, a Gmail MCP when the task involves email, a Notion MCP when the task involves docs) AND