Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/App.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,7 @@ export default function App() {
// Send run-completion notification for terminal statuses
const terminalStatuses: RunStatus[] = [
"succeeded",
"partially_succeeded",
"failed",
"permanent_failure",
];
Expand Down
2 changes: 1 addition & 1 deletion src/components/content/dashboard-system-section.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ function RecentRunRow({ run, jobs, projects, onSelect, onRetry, onNewRun }: {
const jobName = run.jobName ?? job?.name ?? "Unknown Job";
const project = job ? projects.find((p) => p.id === job.projectId) : null;
const isFailed = run.status === "failed" || run.status === "permanent_failure";
const isTerminal = ["succeeded", "failed", "permanent_failure", "cancelled"].includes(run.status);
const isTerminal = ["succeeded", "partially_succeeded", "failed", "permanent_failure", "cancelled"].includes(run.status);
// Named group `group/run` (not bare `group`): the embedded-screen wrapper in
// screen-embed-card.tsx is itself `.group`, so bare `group-hover:` utilities
// here would also fire when hovering anywhere on the embedded dashboard.
Expand Down
1 change: 1 addition & 0 deletions src/components/content/run-detail-view.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ export function RunDetailView({ runId }: RunDetailViewProps) {
const isCancellable = run.status === "running" || run.status === "queued";
const isTerminal = [
"succeeded",
"partially_succeeded",
"failed",
"permanent_failure",
"cancelled",
Expand Down
6 changes: 6 additions & 0 deletions supabase/migrations/20260526120000_runs_partial_status.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
-- Rebuild the runs.status check constraint so that it also accepts
-- 'partially_succeeded'. Per the original migration note, the TypeScript type
-- and the executor already emit this value while the initial schema constraint
-- did not list it.
--
-- The drop and the re-add are issued as two actions of one ALTER TABLE
-- statement; DROP ... IF EXISTS keeps the migration safe to run when the
-- constraint is absent.
ALTER TABLE runs
  DROP CONSTRAINT IF EXISTS runs_status_check,
  ADD CONSTRAINT runs_status_check CHECK (
    status IN (
      'deferred',
      'queued',
      'running',
      'succeeded',
      'partially_succeeded',
      'failed',
      'permanent_failure',
      'cancelled'
    )
  );
31 changes: 30 additions & 1 deletion worker/src/executor-sandbox-prep.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,33 @@ export function buildBrowserExtensionArg(profiles: HydratedProfile[] = []): stri
return `${envPrefix}${BROWSER_MCP_WRAPPER_PATH}`;
}

/**
 * Block until the sandbox can respond to a trivial command, or throw if it
 * doesn't become ready within the timeout (~60 s by default). Without this
 * gate, commands issued immediately after Sandbox.create() on a cold template
 * (boot can take 15-25 s) fire deadline_exceeded errors that cascade into
 * fatal run failures.
 *
 * A single long command timeout is used rather than a polling loop: if E2B
 * infrastructure is responding but slow, waiting for a simple echo is the
 * right behaviour. If the sandbox is genuinely broken the SDK throws and we
 * surface a clear error.
 *
 * @param sandbox - Sandbox instance whose `commands.run` is probed.
 * @param log - Sink for progress lines; called once before the probe and once
 *   after it succeeds.
 * @param timeoutMs - Timeout handed to the probe command, in milliseconds.
 *   Defaults to 60 000, which reproduces the previous hard-coded behaviour
 *   and error message exactly.
 * @throws Error when the probe command rejects. The message embeds the
 *   original error text and the original error is attached as `cause`.
 */
export async function waitForSandboxReady(
  sandbox: InstanceType<typeof Sandbox>,
  log: (line: string) => void,
  timeoutMs = 60_000,
): Promise<void> {
  log("[openhelm] waiting for sandbox to become ready…");
  try {
    await sandbox.commands.run("echo ready", { timeoutMs });
  } catch (err) {
    const wrapped = new Error(
      `sandbox did not become ready within ${timeoutMs / 1000} s — E2B cold-start may have exceeded the limit. ` +
        `Original: ${err instanceof Error ? err.message : String(err)}`,
    );
    // Keep the original error object (and its stack) reachable for debugging.
    // Assigned via Object.assign so this compiles without the ES2022 lib.
    throw Object.assign(wrapped, { cause: err });
  }
  // Logged outside the try block so that a failing logger is not misreported
  // as "sandbox did not become ready".
  log("[openhelm] sandbox ready");
}

export async function prepareWorkspace(
sandbox: InstanceType<typeof Sandbox>,
gitUrl: string | null | undefined,
Expand All @@ -58,7 +85,9 @@ export async function prepareWorkspace(
}
} else {
log(`[openhelm] no preconfigured repo; sandbox is empty`);
await sandbox.commands.run("mkdir -p /tmp/workspace", { timeoutMs: 5_000 });
// Use a generous timeout — on cold template boots the sandbox may still be
// settling after the readiness gate, so 5 s was too tight.
await sandbox.commands.run("mkdir -p /tmp/workspace", { timeoutMs: 30_000 });
}
return willClone;
}
Expand Down
8 changes: 7 additions & 1 deletion worker/src/executor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import {
setupMcpBrowser,
setupNotifyMcp,
hydrateAuthAndPreparePrompt,
waitForSandboxReady,
} from "./executor-sandbox-prep.js";
import { executeGoose, type GooseResult } from "./executor-goose.js";
import { DEFAULT_INITIAL_MAX_TURNS } from "./executor-goose-types.js";
Expand Down Expand Up @@ -457,6 +458,11 @@ export async function executeRun(runId: string): Promise<void> {
});
activeSandboxes.set(runId, sandbox);

// Gate on sandbox readiness before issuing any commands. Cold E2B template
// boots can take 15-25 s; firing commands before the sandbox is ready
// produces deadline_exceeded errors that cascade into fatal run failures.
await waitForSandboxReady(sandbox, (line) => relay.onStderr(line));

// Pre-start Xvfb before any browser MCP call. Without this, Chromium
// inside the sandbox hangs for the full per-tool-call timeout (60-120s)
// trying to connect to a nonexistent X server, which manifests as
Expand Down Expand Up @@ -961,7 +967,7 @@ async function startXvfb(
try {
await sandbox.commands.run(
`bash -c 'Xvfb :0 -screen 0 1280x900x24 -ac +extension RANDR > /tmp/xvfb.log 2>&1 &'`,
{ timeoutMs: 10_000, background: true },
{ timeoutMs: 30_000, background: true },
);
const ready = await sandbox.commands.run(
`bash -c 'for i in $(seq 1 40); do xdpyinfo -display :0 >/dev/null 2>&1 && exit 0; sleep 0.5; done; exit 1'`,
Expand Down
8 changes: 8 additions & 0 deletions worker/src/run-verifier.ts
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,14 @@ recipient list AND any attachments mentioned; "create a research doc" implies th
its citations. A run that produced SOME deliverables but skipped others is at best a
partial success — never a full accomplishment.

HOUSEKEEPING-STEPS EXCLUSION (prevents false partials): Agent-initiated housekeeping steps
that are NOT stated or clearly implied by the job prompt are NOT deliverables. Examples of
steps that do NOT count as deliverables: logging completion to an internal tracking table,
updating a metrics dashboard, recording a run in an observability system, sending a
self-notification. If the agent attempted such a step as a bonus and it failed (e.g. a CLI
tool was missing, a data table didn't exist), do NOT count it as a missed deliverable —
ignore it entirely when scoring the run.

CONNECTED-TOOL-COVERAGE SUB-RULE: If a connected MCP server or CLI is clearly relevant
to one of the implied deliverables (e.g. an Unsplash MCP when the task involves images,
a Gmail MCP when the task involves email, a Notion MCP when the task involves docs) AND
Expand Down