braintrustdata · Luca Forstner (lforst) · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/.changeset/thin-spies-train.md b/.changeset/thin-spies-train.md
@@ -0,0 +1,5 @@
+---
+"braintrust": minor
+---
+
+feat(flue): Update flue instrumentation to use new observe hooks
diff --git a/e2e/config/pr-comment-scenarios.json b/e2e/config/pr-comment-scenarios.json
@@ -177,17 +177,7 @@
     "scenarioDirName": "flue-instrumentation",
     "label": "Flue Instrumentation",
     "metadataScenario": "flue-instrumentation",
-    "variants": [
-      { "variantKey": "flue-v0-7-0-wrapped", "label": "v0.7.0 wrapped" },
-      {
-        "variantKey": "flue-v0-7-0-auto-hook",
-        "label": "v0.7.0 auto-hook"
-      },
-      {
-        "variantKey": "flue-v0-7-0-openai-auto-hook",
-        "label": "v0.7.0 OpenAI auto-hook"
-      }
-    ]
+    "variants": [{ "variantKey": "flue-v0-8-0", "label": "v0.8.0" }]
   },
   {
     "scenarioDirName": "github-copilot-instrumentation",

diff --git a/e2e/scenarios/flue-instrumentation/.flue/app.mjs b/e2e/scenarios/flue-instrumentation/.flue/app.mjs
@@ -0,0 +1,53 @@
+import { configureProvider, flue, observe } from "@flue/runtime/app";
+import { flush, initLogger } from "braintrust";
+
+function projectName() {
+  const configured = process.env.BRAINTRUST_E2E_PROJECT_NAME;
+  if (configured) {
+    return configured;
+  }
+  const testRunId = process.env.BRAINTRUST_E2E_RUN_ID ?? "local";
+  return `e2e-flue-instrumentation-${testRunId.toLowerCase().replace(/[^a-z0-9-]/g, "-")}`;
+}
+
+// Initialize the Braintrust logger at module load, using the project name
+// computed by projectName().
+initLogger({ projectName: projectName() });
+
+// Explicit-observe mode: only when FLUE_E2E_EXPLICIT_OBSERVE is exactly "1",
+// dynamically import braintrustFlueObserver from "braintrust" and register it
+// through Flue's observe().
+// NOTE(review): presumably runs without this flag rely on some automatic
+// instrumentation path instead — confirm against the test harness.
+if (process.env.FLUE_E2E_EXPLICIT_OBSERVE === "1") {
+  const { braintrustFlueObserver } = await import("braintrust");
+  observe(braintrustFlueObserver);
+}
+
+const openAIBaseUrl =
+  process.env.OPENAI_BASE_URL ?? process.env.BRAINTRUST_E2E_MODEL_BASE_URL;
+if (openAIBaseUrl) {
+  configureProvider("openai", { baseUrl: openAIBaseUrl });
+}
+
+const anthropicBaseUrl = process.env.ANTHROPIC_BASE_URL;
+if (anthropicBaseUrl) {
+  configureProvider("anthropic", {
+    apiKey: process.env.ANTHROPIC_API_KEY ?? "test-key",
+    baseUrl: anthropicBaseUrl,
+  });
+}
+
+let didScheduleFlush = false;
+process.on("beforeExit", () => {
+  if (didScheduleFlush) {
+    return;
+  }
+  didScheduleFlush = true;
+  void flush();
+});
+
+const app = flue();
+
+export default {
+  async fetch(request, env, ctx) {
+    if (new URL(request.url).pathname === "/__braintrust_flush") {
+      await flush();
+      return new Response("ok");
+    }
+    return app.fetch(request, env, ctx);
+  },
+};
diff --git a/e2e/scenarios/flue-instrumentation/.flue/workflows/instrumentation.mjs b/e2e/scenarios/flue-instrumentation/.flue/workflows/instrumentation.mjs
@@ -0,0 +1,142 @@
+import { createAgent, Type } from "@flue/runtime";
+import { local } from "@flue/runtime/node";
+import {
+  FLUE_MODEL,
+  FLUE_REASONING_MODEL,
+  SCENARIO_NAME,
+} from "../../constants.mjs";
+
+function flueModel() {
+  return process.env.FLUE_E2E_MODEL ?? FLUE_MODEL;
+}
+
+function flueReasoningModel() {
+  return process.env.FLUE_E2E_REASONING_MODEL ?? FLUE_REASONING_MODEL;
+}
+
+function fluePromptModel() {
+  return process.env.FLUE_E2E_PROMPT_MODEL ?? flueReasoningModel();
+}
+
+function fluePromptThinkingLevel() {
+  return (
+    process.env.FLUE_E2E_PROMPT_THINKING_LEVEL ?? flueReasoningThinkingLevel()
+  );
+}
+
+function flueReasoningThinkingLevel() {
+  return process.env.FLUE_E2E_REASONING_THINKING_LEVEL ?? "medium";
+}
+
+const flueE2EAgent = createAgent(() => ({
+  compaction: {
+    keepRecentTokens: 1,
+    reserveTokens: 64,
+  },
+  cwd: process.cwd(),
+  instructions: [
+    "You are a deterministic Flue instrumentation test agent.",
+    "Follow user instructions exactly.",
+    "When asked for a marker, output only that marker and no extra text.",
+    "When running a local skill file, read it yourself and do not delegate it to a task.",
+  ].join(" "),
+  model: flueModel(),
+  sandbox: local({ cwd: process.cwd() }),
+  thinkingLevel: "off",
+}));
+
+const lookupTool = {
+  description:
+    "Return a deterministic lookup result with an id needed by web_search.",
+  execute: async (args) =>
+    JSON.stringify({
+      id: "flue-session-2026",
+      query: args.query,
+      topic: "session instrumentation",
+    }),
+  name: "lookup",
+  parameters: Type.Object({
+    query: Type.String(),
+  }),
+};
+
+const webSearchTool = {
+  description:
+    "Search a deterministic local web index. Requires the id returned by lookup.",
+  execute: async (args) =>
+    JSON.stringify({
+      lookupId: args.lookupId,
+      query: args.query,
+      results: [
+        {
+          title: "Flue reasoning stream instrumentation",
+          url: "https://example.test/flue/reasoning-streams",
+        },
+      ],
+    }),
+  name: "web_search",
+  parameters: Type.Object({
+    lookupId: Type.String(),
+    query: Type.String(),
+  }),
+};
+
+const summarizeSourceTool = {
+  description:
+    "Summarize the selected deterministic source after web_search returns a URL.",
+  execute: async (args) =>
+    JSON.stringify({
+      summary:
+        "Flue emits reasoning, tool execution, and LLM turn events separately.",
+      url: args.url,
+    }),
+  name: "summarize_source",
+  parameters: Type.Object({
+    url: Type.String(),
+  }),
+};
+
+/**
+ * Pass-through route hook: ignores its first argument and simply awaits
+ * `next()`, adding no behavior of its own.
+ *
+ * @param {*} _ctx - Unused.
+ * @param {Function} next - Continuation; invoked once and awaited.
+ * @returns {Promise<void>} Settles once `next()` has settled.
+ */
+export async function route(_ctx, next) {
+  await next();
+}
+
+export async function run({ init, payload }) {
+  const harness = await init(flueE2EAgent, { name: "default" });
+  const session = await harness.session("main");
+  const skillSession = await harness.session("skill");
+  const taskSession = await harness.session("task");
+
+  await session.prompt(
+    [
+      "Complete this instrumented research flow.",
+      "Call exactly one tool per turn and wait for each tool result before choosing the next tool.",
+      'Step 1: call lookup with query "flue instrumentation".',
+      'Step 2: use the lookup result id as lookupId and call web_search with query "Braintrust Flue reasoning stream instrumentation".',
+      "Step 3: use the first web_search result url and call summarize_source.",
+      "After summarize_source returns, reply with exactly PROMPT_DONE and no other text.",
+    ].join(" "),
+    {
+      model: fluePromptModel(),
+      thinkingLevel: fluePromptThinkingLevel(),
+      tools: [lookupTool, webSearchTool, summarizeSourceTool],
+    },
+  );
+
+  await skillSession.skill("e2e-flue-skill", {
+    args: { marker: "SKILL_DONE" },
+    model: flueReasoningModel(),
+    thinkingLevel: "off",
+  });
+
+  await taskSession.task("Reply with exactly TASK_DONE and no other text.", {
+    model: FLUE_MODEL,
+    thinkingLevel: "off",
+  });
+
+  await session.compact();
+
+  return {
+    scenario: payload?.scenario ?? SCENARIO_NAME,
+    status: "done",
+  };
+}