docs: document embedded runner run tests

This commit is contained in:
Peter Steinberger
2026-06-04 14:38:00 -04:00
parent cd26595d6f
commit a31d3355cd
6 changed files with 42 additions and 9 deletions

View File

@@ -1,3 +1,4 @@
// Coverage for before_agent_finalize revision handling in embedded runs.
import { beforeAll, beforeEach, describe, expect, it } from "vitest";
import { makeAttemptResult } from "./run.overflow-compaction.fixture.js";
import {
@@ -15,6 +16,8 @@ function finalAnswerAttempt(
text: string,
overrides?: Partial<EmbeddedRunAttemptResult>,
): EmbeddedRunAttemptResult {
// Finalize tests need a successful assistant turn with both surfaced text and
// snapshot content so the runner can decide whether to request a revision.
return makeAttemptResult({
assistantTexts: [text],
lastAssistant: {
@@ -75,6 +78,8 @@ describe("runEmbeddedAgent before_agent_finalize", () => {
});
it("turns a revise decision into one more hidden continuation", async () => {
// Revision prompts are hidden continuations; they must not persist the
// original user prompt a second time.
mockedRunEmbeddedAttempt
.mockResolvedValueOnce(
finalAnswerAttempt("First answer.", {
@@ -123,6 +128,8 @@ describe("runEmbeddedAgent before_agent_finalize", () => {
});
it("does not retry finalize revisions after a timed-out attempt", async () => {
// A timed-out attempt may have partial assistant text, but asking for a
// finalize revision would replay an invalid or blocked provider turn.
mockedRunEmbeddedAttempt.mockResolvedValueOnce(
finalAnswerAttempt("Late answer.", {
timedOut: true,

View File

@@ -1,3 +1,4 @@
// Coverage for cron before_agent_reply hook handling before embedded attempts.
import { beforeAll, beforeEach, describe, expect, it, vi } from "vitest";
import { SILENT_REPLY_TOKEN } from "../../auto-reply/tokens.js";
import { makeAttemptResult } from "./run.overflow-compaction.fixture.js";
@@ -12,6 +13,8 @@ import {
let runEmbeddedAgent: typeof import("./run.js").runEmbeddedAgent;
function firstBeforeAgentReplyCall() {
// Helper keeps assertions on the hook payload and context close to the tests
// without leaking mock tuple details into every case.
const call = mockedGlobalHookRunner.runBeforeAgentReply.mock.calls[0];
if (!call) {
throw new Error("expected before_agent_reply hook call");
@@ -43,6 +46,8 @@ describe("runEmbeddedAgent cron before_agent_reply seam", () => {
});
it("lets before_agent_reply claim cron runs before the embedded attempt starts", async () => {
// Cron hooks can fully handle maintenance prompts before the model is
// invoked, which avoids unnecessary prompt-cache and setup work.
mockedGlobalHookRunner.hasHooks.mockImplementation(
(hookName: string) => hookName === "before_agent_reply",
);
@@ -134,6 +139,8 @@ describe("runEmbeddedAgent cron before_agent_reply seam", () => {
});
it("forwards one-shot model-run flags into the embedded attempt", async () => {
// Model-run mode is request-scoped; it must pass through to the first
// attempt without becoming a persistent session setting.
mockedRunEmbeddedAttempt.mockResolvedValueOnce(makeAttemptResult({ promptError: null }));
await runEmbeddedAgent({

View File

@@ -1,3 +1,4 @@
// Coverage for replay-safe Codex app-server recovery retries.
import { beforeAll, beforeEach, describe, expect, it } from "vitest";
import { makeModelFallbackCfg } from "../test-helpers/model-fallback-config-fixture.js";
import { makeAttemptResult } from "./run.overflow-compaction.fixture.js";
@@ -16,6 +17,8 @@ let runEmbeddedAgent: typeof import("./run.js").runEmbeddedAgent;
function codexClientClosedAttempt(
overrides: Partial<EmbeddedRunAttemptResult> = {},
): EmbeddedRunAttemptResult {
// Stdio client-close failures can be replay-safe when Codex reports that the
// turn ended before completion and no user-visible side effect escaped.
return makeAttemptResult({
assistantTexts: [],
promptError: new Error("codex app-server client closed before turn completed"),
@@ -34,6 +37,8 @@ function codexClientClosedAttempt(
function codexTurnCompletionIdleTimeoutAttempt(
overrides: Partial<EmbeddedRunAttemptResult> = {},
): EmbeddedRunAttemptResult {
// Completion-watch idle timeouts are retried separately from progress timeouts
// because only the former indicates Codex may have lost the final event.
return makeAttemptResult({
assistantTexts: [],
aborted: true,
@@ -106,6 +111,8 @@ describe("runEmbeddedAgent Codex app-server recovery", () => {
});
it("suppresses duplicate user persistence when retrying after the inbound message was persisted", async () => {
// If the first attempt already persisted the inbound message, the retry must
// not mirror it again into the transcript.
mockedRunEmbeddedAttempt
.mockImplementationOnce(async (attemptParams) => {
(

View File

@@ -1,3 +1,4 @@
// Coverage for handing Codex server_error turns to model fallback.
import { beforeAll, beforeEach, describe, expect, it } from "vitest";
import { makeAssistantMessageFixture } from "../test-helpers/assistant-message-fixtures.js";
import { makeModelFallbackCfg } from "../test-helpers/model-fallback-config-fixture.js";
@@ -27,6 +28,8 @@ describe("runEmbeddedAgent Codex server_error fallback handoff", () => {
});
it("throws FailoverError for Codex server_error when model fallbacks are configured", async () => {
// Codex server_error is a provider failure, not a normal assistant reply;
// configured fallbacks should receive it through the failover path.
const rawCodexError =
'Codex error: {"type":"error","error":{"type":"server_error","code":"server_error","message":"An error occurred while processing your request."},"sequence_number":2}';

View File

@@ -1,3 +1,4 @@
// Coverage for wiring the post-compaction loop guard into embedded runs.
import { beforeAll, beforeEach, describe, expect, it, vi } from "vitest";
import type {
diagnosticSessionStates as DiagnosticSessionStatesType,
@@ -29,9 +30,8 @@ import {
} from "./run.overflow-compaction.harness.js";
let runEmbeddedAgent: typeof import("./run.js").runEmbeddedAgent;
// Import these after loadRunOverflowCompactionHarness so the references point at
// the same module instances as the re-imported runner graph: vi.resetModules()
// inside the harness invalidates any earlier import.
let diagnosticSessionStates: typeof DiagnosticSessionStatesType;
let getDiagnosticSessionState: typeof GetDiagnosticSessionStateType;
let recordToolCall: typeof RecordToolCallType;
@@ -51,6 +51,8 @@ function recordToolOutcome(
result: unknown,
runId?: string,
): void {
// Seed diagnostic history directly for cases that inspect persisted loop
// state without running a wrapped tool.
const toolCallId = `${toolName}-${state.toolCallHistory?.length ?? 0}`;
const scope = runId ? { runId } : undefined;
recordToolCall(state, toolName, toolParams, toolCallId, undefined, scope);
@@ -75,6 +77,8 @@ async function executeWrappedToolOutcome(
onToolOutcome?: ToolOutcomeObserver,
runId = baseParams.runId,
): Promise<unknown> {
// Exercise the live before_tool_call wrapper so the guard sees the same
// outcome observer path used by real embedded tools.
const tool = wrapToolWithBeforeToolCallHook(
{
name: toolName,
@@ -135,15 +139,13 @@ describe("post-compaction loop guard wired into runEmbeddedAgent", () => {
let attemptSignalAborted = false;
let attemptSignalReason: unknown;
// Attempt 1: overflow triggers compaction.
mockedRunEmbeddedAttempt.mockImplementationOnce(async () =>
makeAttemptResult({ promptError: overflowError }),
);
// Attempt 2: post-compaction. The live wrapped-tool path records each outcome
// while the prompt is still running. The third identical result must not rely
// on throwing out of tool execution (the dependency converts tool errors into
// tool results); instead the guard aborts the attempt signal, and the runner
// raises the persisted-loop error after the attempt unwinds.
mockedRunEmbeddedAttempt.mockImplementationOnce(async (attemptParams: unknown) => {
const { abortSignal, onToolOutcome } = attemptParams as {
abortSignal?: AbortSignal;

View File

@@ -1,3 +1,4 @@
// Coverage for preserving current-attempt error context across model fallback.
import { beforeAll, beforeEach, describe, expect, it } from "vitest";
import { makeAssistantMessageFixture } from "../test-helpers/assistant-message-fixtures.js";
import { makeModelFallbackCfg } from "../test-helpers/model-fallback-config-fixture.js";
@@ -33,6 +34,8 @@ function isCurrentAttemptAssistant(value: unknown): value is CurrentAttemptAssis
}
function setupDeepseekFallbackErrorMatchers() {
// DeepSeek matchers prove failover classification uses the current candidate
// assistant instead of stale history from the previous provider.
mockedIsFailoverAssistantError.mockImplementation((...args: unknown[]) => {
const assistant = args[0];
return isCurrentAttemptAssistant(assistant) && assistant.provider === "deepseek";
@@ -44,6 +47,8 @@ function setupDeepseekFallbackErrorMatchers() {
}
function captureFormattedAssistant() {
// Capture the assistant passed to formatting so tests can inspect which
// provider/model error object drove the final failover message.
let lastFormattedAssistant: unknown;
mockedFormatAssistantErrorText.mockImplementation((...args: unknown[]) => {
lastFormattedAssistant = args[0];
@@ -173,6 +178,8 @@ describe("runEmbeddedAgent cross-provider fallback error handling", () => {
});
it("does not reuse a prior provider session assistant when the current candidate times out", async () => {
// Timeout failover has no reliable current assistant. Reusing the previous
// provider's session error would misattribute the failed candidate.
const getLastFormattedAssistant = captureFormattedAssistant();
mockedRunEmbeddedAttempt.mockResolvedValueOnce(
makeAttemptResult({