diff --git a/extensions/qa-lab/src/providers/mock-openai/server.test.ts b/extensions/qa-lab/src/providers/mock-openai/server.test.ts index d92f516212e1..911ff86c830c 100644 --- a/extensions/qa-lab/src/providers/mock-openai/server.test.ts +++ b/extensions/qa-lab/src/providers/mock-openai/server.test.ts @@ -3859,6 +3859,14 @@ describe("qa mock openai server", () => { expect(maxReasoning.summary).toEqual([]); expect(outputItem(maxPayload, 1).type).toBe("message"); expect(outputText(maxPayload, 1)).toBe("THINKING-MAX-OK"); + + const maxStream = await expectResponsesText(server, { + stream: true, + model: "gpt-5.5", + input: [makeUserInput(QA_THINKING_VISIBILITY_MAX_PROMPT)], + }); + expect(maxStream).toContain('"type":"response.output_text.delta"'); + expect(maxStream).toContain('"delta":"THINKING-MAX-OK"'); }); it("keeps the reasoning-only side-effect path ready for no-auto-retry QA coverage", async () => { diff --git a/extensions/qa-lab/src/providers/mock-openai/server.ts b/extensions/qa-lab/src/providers/mock-openai/server.ts index 7ae7b0a86511..b1d753ce363e 100644 --- a/extensions/qa-lab/src/providers/mock-openai/server.ts +++ b/extensions/qa-lab/src/providers/mock-openai/server.ts @@ -1512,6 +1512,20 @@ function buildReasoningAndAssistantEvents(params: { status: "in_progress", }, }, + { + type: "response.output_text.delta", + item_id: answerItem.id, + output_index: 1, + content_index: 0, + delta: params.answerText, + }, + { + type: "response.output_text.done", + item_id: answerItem.id, + output_index: 1, + content_index: 0, + text: params.answerText, + }, { type: "response.output_item.done", item: answerItem, @@ -1664,13 +1678,13 @@ async function buildResponsesPayload( } return buildAssistantEvents("BUG-SHOULD-NOT-AUTO-RETRY"); } - if (QA_THINKING_VISIBILITY_MAX_PROMPT_RE.test(prompt)) { + if (QA_THINKING_VISIBILITY_MAX_PROMPT_RE.test(allInputText)) { return buildReasoningAndAssistantEvents({ reasoningId: "rs_mock_thinking_visibility_max", answerText: "THINKING-MAX-OK", }); } - if (QA_THINKING_VISIBILITY_OFF_PROMPT_RE.test(prompt)) { + if (QA_THINKING_VISIBILITY_OFF_PROMPT_RE.test(allInputText)) { return buildAssistantEvents("THINKING-OFF-OK"); } if (QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE.test(allInputText)) { diff --git a/extensions/qa-lab/src/providers/shared/mock-model-config.ts b/extensions/qa-lab/src/providers/shared/mock-model-config.ts index 308b7f23507f..349ee211e3f3 100644 --- a/extensions/qa-lab/src/providers/shared/mock-model-config.ts +++ b/extensions/qa-lab/src/providers/shared/mock-model-config.ts @@ -31,7 +31,7 @@ function createMockOpenAiResponsesProvider(baseUrl: string): ModelProviderConfig id: "gpt-5.5", name: "gpt-5.5", api: "openai-responses", - reasoning: false, + reasoning: true, input: ["text", "image"], cost: ZERO_COST, contextWindow: 128_000, @@ -41,7 +41,7 @@ function createMockOpenAiResponsesProvider(baseUrl: string): ModelProviderConfig id: "gpt-5.5-alt", name: "gpt-5.5-alt", api: "openai-responses", - reasoning: false, + reasoning: true, input: ["text", "image"], cost: ZERO_COST, contextWindow: 128_000, diff --git a/extensions/qa-lab/src/qa-gateway-config.test.ts b/extensions/qa-lab/src/qa-gateway-config.test.ts index 42cbc0b6ba40..6ae4f50f248a 100644 --- a/extensions/qa-lab/src/qa-gateway-config.test.ts +++ b/extensions/qa-lab/src/qa-gateway-config.test.ts @@ -55,6 +55,12 @@ describe("buildQaGatewayConfig", () => { expect(getPrimaryModel(cfg.agents?.defaults?.model)).toBe("mock-openai/gpt-5.5"); expect(cfg.models?.providers?.["mock-openai"]?.baseUrl).toBe("http://127.0.0.1:44080/v1"); expect(cfg.models?.providers?.["mock-openai"]?.request).toEqual({ allowPrivateNetwork: true }); + expect(cfg.models?.providers?.["mock-openai"]?.models).toEqual( + expect.arrayContaining([ + expect.objectContaining({ id: "gpt-5.5", reasoning: true }), + expect.objectContaining({ id: "gpt-5.5-alt", reasoning: true }), + ]), + ); expect(cfg.models?.providers?.openai?.baseUrl).toBe("http://127.0.0.1:44080/v1"); expect(cfg.models?.providers?.openai?.request).toEqual({ allowPrivateNetwork: true }); expect(cfg.models?.providers?.anthropic?.baseUrl).toBe("http://127.0.0.1:44080"); diff --git a/extensions/qa-lab/src/scenario-catalog.test.ts b/extensions/qa-lab/src/scenario-catalog.test.ts index f61a8f391ae6..f7069b79d14d 100644 --- a/extensions/qa-lab/src/scenario-catalog.test.ts +++ b/extensions/qa-lab/src/scenario-catalog.test.ts @@ -197,14 +197,18 @@ describe("qa scenario catalog", () => { expect(readQaScenarioById("long-context-progress-watchdog").sourcePath).toBe( "qa/scenarios/runtime/long-context-progress-watchdog.md", ); - expect(JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow)) - .toContain("EmbeddedAttemptSessionTakeoverError"); - expect(JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow)) - .toContain("AbortError"); - expect(JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow)) - .toContain("This operation was aborted"); - expect(JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow)) - .toContain("liveTurnTimeoutMs(env, 180000)"); + expect( + JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow), + ).toContain("EmbeddedAttemptSessionTakeoverError"); + expect( + JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow), + ).toContain("AbortError"); + expect( + JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow), + ).toContain("This operation was aborted"); + expect( + JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow), + ).toContain("liveTurnTimeoutMs(env, 180000)"); expect(readQaScenarioExecutionConfig("long-context-progress-watchdog")).toMatchObject({ requiredProviderMode: "live-frontier", harnessRuntime: "codex", @@ -323,8 +327,8 @@ describe("qa scenario catalog", () => { const scenario = readQaScenarioById("gpt55-thinking-visibility-switch"); const config = readQaScenarioExecutionConfig("gpt55-thinking-visibility-switch") as | { - requiredLiveProvider?: string; - requiredLiveModel?: string; + requiredProvider?: string; + requiredModel?: string; offDirective?: string; maxDirective?: string; reasoningDirective?: string; @@ -332,16 +336,15 @@ describe("qa scenario catalog", () => { | undefined; expect(scenario.sourcePath).toBe("qa/scenarios/models/gpt55-thinking-visibility-switch.md"); - expect(config?.requiredLiveProvider).toBe("openai"); - expect(config?.requiredLiveModel).toBe("gpt-5.5"); + expect(config?.requiredProvider).toBe("openai"); + expect(config?.requiredModel).toBe("gpt-5.5"); expect(config?.offDirective).toBe("/think off"); expect(config?.maxDirective).toBe("/think medium"); expect(config?.reasoningDirective).toBe("/reasoning on"); expect(scenario.execution.flow?.steps.map((step) => step.name)).toEqual([ "enables reasoning display and disables thinking", "switches to medium thinking", - "verifies medium thinking emits visible reasoning", - "verifies medium thinking completes the answer", + "verifies medium thinking reaches the provider", ]); }); diff --git a/extensions/qa-lab/src/suite-planning.test.ts b/extensions/qa-lab/src/suite-planning.test.ts index 142fcf8cb43f..f2efff4c1189 100644 --- a/extensions/qa-lab/src/suite-planning.test.ts +++ b/extensions/qa-lab/src/suite-planning.test.ts @@ -413,4 +413,30 @@ describe("qa suite planning helpers", () => { }).map((scenario) => scenario.id), ).toEqual(["generic", "live-only"]); }); + + it("keeps live-only runtime parity scenarios out of implicit mock selections", () => { + const scenarios = [ + makeQaSuiteTestScenario("generic"), + makeQaSuiteTestScenario("live-runtime", { + runtimeParityTier: "live-only", + }), + ]; + + expect( + selectQaSuiteScenarios({ + scenarios, + providerMode: "mock-openai", + primaryModel: "mock-openai/gpt-5.5", + }).map((scenario) => scenario.id), + ).toEqual(["generic"]); + + expect( + selectQaSuiteScenarios({ + scenarios, + scenarioIds: ["live-runtime"], + providerMode: "mock-openai", + primaryModel: "mock-openai/gpt-5.5", + }).map((scenario) => scenario.id), + ).toEqual(["live-runtime"]); + }); }); diff --git a/extensions/qa-lab/src/suite-planning.ts b/extensions/qa-lab/src/suite-planning.ts index 441090eb93d6..14a62ddd834a 100644 --- a/extensions/qa-lab/src/suite-planning.ts +++ b/extensions/qa-lab/src/suite-planning.ts @@ -33,12 +33,16 @@ function scenarioMatchesLiveLane(params: { providerMode: QaProviderMode; claudeCliAuthMode?: QaCliBackendAuthMode; }) { + const provider = getQaProvider(params.providerMode); + if (params.scenario.runtimeParityTier === "live-only" && provider.kind !== "live") { + return false; + } const config = params.scenario.execution.config ?? {}; const requiredProviderMode = normalizeQaConfigString(config.requiredProviderMode); if (requiredProviderMode && params.providerMode !== requiredProviderMode) { return false; } - if (getQaProvider(params.providerMode).kind !== "live") { + if (provider.kind !== "live") { return true; } const selected = splitModelRef(params.primaryModel); diff --git a/extensions/qa-lab/src/suite-runtime-agent-tools.test.ts b/extensions/qa-lab/src/suite-runtime-agent-tools.test.ts index 8c7225738356..d0d4c6fbb120 100644 --- a/extensions/qa-lab/src/suite-runtime-agent-tools.test.ts +++ b/extensions/qa-lab/src/suite-runtime-agent-tools.test.ts @@ -10,12 +10,14 @@ const listToolsMock = vi.hoisted(() => vi.fn(async () => ({ tools: [] }))); const callToolMock = vi.hoisted(() => vi.fn(async () => ({ content: [] }))); const closeMock = vi.hoisted(() => vi.fn(async () => undefined)); const resolveQaNodeExecPathMock = vi.hoisted(() => vi.fn(async () => "/usr/bin/node")); +const stderrOnMock = vi.hoisted(() => vi.fn()); const stdioTransportMock = vi.hoisted(() => vi.fn().mockImplementation(function StdioClientTransport( - this: { params?: unknown }, + this: { params?: unknown; stderr?: { on: typeof stderrOnMock } }, params: unknown, ) { this.params = params; + this.stderr = { on: stderrOnMock }; }), ); @@ -66,6 +68,7 @@ describe("qa suite runtime agent tools helpers", () => { callToolMock.mockReset(); closeMock.mockClear(); resolveQaNodeExecPathMock.mockClear(); + stderrOnMock.mockClear(); stdioTransportMock.mockClear(); }); @@ -146,10 +149,17 @@ describe("qa suite runtime agent tools helpers", () => { OPENCLAW_KEY: "1", }, }); - expect(callToolMock).toHaveBeenCalledWith({ - name: "plugin.echo", - arguments: { text: "hello" }, - }); + expect(stderrOnMock).toHaveBeenCalledWith("data", expect.any(Function)); + expect(connectMock).toHaveBeenCalledWith(expect.anything(), { timeout: 180_000 }); + expect(listToolsMock).toHaveBeenCalledWith({}, { timeout: 180_000 }); + expect(callToolMock).toHaveBeenCalledWith( + { + name: "plugin.echo", + arguments: { text: "hello" }, + }, + undefined, + { timeout: 180_000 }, + ); expect(closeMock).toHaveBeenCalled(); }); diff --git a/extensions/qa-lab/src/suite-runtime-agent-tools.ts b/extensions/qa-lab/src/suite-runtime-agent-tools.ts index 977bb26c2432..8898d1c21efb 100644 --- a/extensions/qa-lab/src/suite-runtime-agent-tools.ts +++ b/extensions/qa-lab/src/suite-runtime-agent-tools.ts @@ -13,6 +13,8 @@ import type { } from "./suite-runtime-types.js"; const requireFromHere = createRequire(import.meta.url); +const MCP_STDERR_TAIL_LIMIT = 8_192; +const MCP_REQUEST_TIMEOUT_MS = 180_000; function findSkill(skills: QaSkillStatusEntry[], name: string) { return skills.find((skill) => skill.name === name); @@ -52,10 +54,17 @@ async function callPluginToolsMcp(params: { cwd: params.env.gateway.tempRoot, env: transportEnv, }); + let stderrTail = ""; + const stderr = transport.stderr; + if (stderr && typeof stderr.on === "function") { + stderr.on("data", (chunk: unknown) => { + stderrTail = `${stderrTail}${String(chunk)}`.slice(-MCP_STDERR_TAIL_LIMIT); + }); + } const client = new Client({ name: "openclaw-qa-suite", version: "0.0.0" }, {}); try { - await client.connect(transport); - const listed = await client.listTools(); + await client.connect(transport, { timeout: MCP_REQUEST_TIMEOUT_MS }); + const listed = await client.listTools({}, { timeout: MCP_REQUEST_TIMEOUT_MS }); const tool = listed.tools.find((entry) => entry.name === params.toolName); if (!tool) { const availableTools = listed.tools @@ -66,10 +75,22 @@ async function callPluginToolsMcp(params: { `MCP tool missing: ${params.toolName}; available tools: ${availableTools.join(", ") || ""}`, ); } - return await client.callTool({ - name: params.toolName, - arguments: params.args, - }); + try { + return await client.callTool( + { + name: params.toolName, + arguments: params.args, + }, + undefined, + { timeout: MCP_REQUEST_TIMEOUT_MS }, + ); + } catch (error) { + const tail = stderrTail.trim(); + if (!tail || !(error instanceof Error)) { + throw error; + } + throw new Error(`${error.message}\nMCP stderr tail:\n${tail}`, { cause: error }); + } } finally { await client.close().catch(() => {}); } diff --git a/extensions/qa-lab/src/suite-test-helpers.ts b/extensions/qa-lab/src/suite-test-helpers.ts index b8fe47403cbf..0db66416ace4 100644 --- a/extensions/qa-lab/src/suite-test-helpers.ts +++ b/extensions/qa-lab/src/suite-test-helpers.ts @@ -9,6 +9,7 @@ export function makeQaSuiteTestScenario( plugins?: string[]; gatewayConfigPatch?: Record; gatewayRuntime?: { forwardHostHome?: boolean }; + runtimeParityTier?: QaSuiteTestScenario["runtimeParityTier"]; surface?: string; } = {}, ): QaSuiteTestScenario { @@ -18,6 +19,7 @@ export function makeQaSuiteTestScenario( surface: params.surface ?? "test", objective: "test", successCriteria: ["test"], + ...(params.runtimeParityTier ? { runtimeParityTier: params.runtimeParityTier } : {}), ...(params.plugins ? { plugins: params.plugins } : {}), ...(params.gatewayConfigPatch ? { gatewayConfigPatch: params.gatewayConfigPatch } : {}), ...(params.gatewayRuntime ? { gatewayRuntime: params.gatewayRuntime } : {}), diff --git a/qa/scenarios/models/gpt55-thinking-visibility-switch.md b/qa/scenarios/models/gpt55-thinking-visibility-switch.md index adbd9f6d66d5..5551fc65b1d1 100644 --- a/qa/scenarios/models/gpt55-thinking-visibility-switch.md +++ b/qa/scenarios/models/gpt55-thinking-visibility-switch.md @@ -13,8 +13,9 @@ objective: Verify GPT-5.5 can switch from disabled thinking to medium thinking w successCriteria: - Live runs target openai/gpt-5.5, not a mini or pro variant. - The session enables reasoning display before the comparison turns. - - The disabled-thinking turn returns its visible marker without a non-empty Reasoning summary. - - The medium-thinking turn returns its visible marker and a separate Reasoning-prefixed message. + - The disabled-thinking turn returns its visible marker without sending a reasoning payload to OpenAI-compatible providers. + - The medium-thinking turn sends a medium reasoning request and returns its visible marker. + - Transports with a visible reasoning lane expose a separate Reasoning-prefixed message; qa-channel validates provider behavior because generic delivery suppresses reasoning payloads by design. docsRefs: - docs/tools/thinking.md - docs/help/testing.md @@ -29,8 +30,8 @@ execution: kind: flow summary: Toggle reasoning display and GPT-5.5 thinking between off/none and medium, then verify visible reasoning only on the medium turn. config: - requiredLiveProvider: openai - requiredLiveModel: gpt-5.5 + requiredProvider: openai + requiredModel: gpt-5.5 offDirective: /think off maxDirective: /think medium reasoningDirective: /reasoning on @@ -77,22 +78,25 @@ steps: - lambda: expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Reasoning visibility enabled/i.test(candidate.text)).at(-1)" - expr: liveTurnTimeoutMs(env, 20000) - - call: patchConfig + - set: thinkOffCursor + value: + expr: state.getSnapshot().messages.length + - call: state.addInboundMessage args: - - env: - ref: env - patch: - agents: - defaults: - thinkingDefault: "off" - - call: waitForGatewayHealthy + - conversation: + id: + expr: config.conversationId + kind: direct + senderId: qa-operator + senderName: QA Operator + text: + expr: config.offDirective + - call: waitForCondition + saveAs: thinkOffAck args: - - ref: env - - 60000 - - call: waitForQaChannelReady - args: - - ref: env - - 60000 + - lambda: + expr: "state.getSnapshot().messages.slice(thinkOffCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking disabled/i.test(candidate.text)).at(-1)" + - expr: liveTurnTimeoutMs(env, 20000) - set: offCursor value: expr: state.getSnapshot().messages.length @@ -111,7 +115,7 @@ steps: args: - lambda: expr: "state.getSnapshot().messages.slice(offCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && candidate.text.includes(config.offMarker)).at(-1)" - - expr: liveTurnTimeoutMs(env, 30000) + - expr: liveTurnTimeoutMs(env, 90000) - set: offMessages value: expr: "state.getSnapshot().messages.slice(offCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId)" @@ -136,27 +140,34 @@ steps: expr: "String(offRequest?.model ?? '').includes('gpt-5.5')" message: expr: "`expected GPT-5.5 off mock request, got ${String(offRequest?.model ?? '')}`" - detailsExpr: "`reasoning ack=${reasoningAck.text}; off answer=${offAnswer.text}`" + - assert: + expr: "offRequest?.body && !Object.prototype.hasOwnProperty.call(offRequest.body, 'reasoning')" + message: + expr: "`disabled thinking should omit OpenAI reasoning payload, got ${JSON.stringify(offRequest?.body?.reasoning ?? null)}`" + detailsExpr: "`reasoning ack=${reasoningAck.text}; thinking off=${thinkOffAck.text}; off answer=${offAnswer.text}`" - name: switches to medium thinking actions: - - call: patchConfig + - set: thinkMediumCursor + value: + expr: state.getSnapshot().messages.length + - call: state.addInboundMessage args: - - env: - ref: env - patch: - agents: - defaults: - thinkingDefault: "medium" - - call: waitForGatewayHealthy + - conversation: + id: + expr: config.conversationId + kind: direct + senderId: qa-operator + senderName: QA Operator + text: + expr: config.maxDirective + - call: waitForCondition + saveAs: thinkMediumAck args: - - ref: env - - 60000 - - call: waitForQaChannelReady - args: - - ref: env - - 60000 - detailsExpr: "`thinking default patched to medium`" - - name: verifies medium thinking emits visible reasoning + - lambda: + expr: "state.getSnapshot().messages.slice(thinkMediumCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking level set to medium/i.test(candidate.text)).at(-1)" + - expr: liveTurnTimeoutMs(env, 20000) + detailsExpr: "`thinking medium=${thinkMediumAck.text}`" + - name: verifies medium thinking reaches the provider actions: - set: maxCursor value: @@ -171,19 +182,6 @@ steps: senderName: QA Operator text: expr: config.maxPrompt - - call: waitForCondition - saveAs: maxReasoning - args: - - lambda: - expr: "state.getSnapshot().messages.slice(maxCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && candidate.text.trimStart().startsWith('Reasoning:')).at(-1)" - - expr: liveTurnTimeoutMs(env, 120000) - - assert: - expr: "maxReasoning.text.trimStart().startsWith('Reasoning:')" - message: - expr: "`missing max reasoning message near answer: ${recentOutboundSummary(state, 6)}`" - detailsExpr: "`reasoning=${maxReasoning.text}`" - - name: verifies medium thinking completes the answer - actions: - call: waitForCondition saveAs: maxAnswer args: @@ -193,7 +191,7 @@ steps: - assert: expr: "maxAnswer.text.includes(config.maxMarker)" message: - expr: "`missing max marker: ${maxAnswer.text}`" + expr: "`missing max marker near answer: ${recentOutboundSummary(state, 6)}`" - if: expr: "Boolean(env.mock)" then: @@ -207,5 +205,22 @@ steps: expr: "String(maxRequest?.model ?? '').includes('gpt-5.5')" message: expr: "`expected GPT-5.5 mock request, got ${String(maxRequest?.model ?? '')}`" - detailsExpr: "`answer=${maxAnswer.text}`" + - assert: + expr: "maxRequest?.body?.reasoning?.effort === 'medium'" + message: + expr: "`expected medium OpenAI reasoning payload, got ${JSON.stringify(maxRequest?.body?.reasoning ?? null)}`" + - if: + expr: "env.transport.id !== 'qa-channel'" + then: + - call: waitForCondition + saveAs: maxReasoning + args: + - lambda: + expr: "state.getSnapshot().messages.slice(maxCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && candidate.text.trimStart().startsWith('Reasoning:')).at(-1)" + - expr: liveTurnTimeoutMs(env, 120000) + - assert: + expr: "maxReasoning.text.trimStart().startsWith('Reasoning:')" + message: + expr: "`missing max reasoning message near answer: ${recentOutboundSummary(state, 6)}`" + detailsExpr: "env.transport.id === 'qa-channel' ? `answer=${maxAnswer.text}; medium reasoning=${env.mock ? String(maxRequest?.body?.reasoning?.effort ?? '') : 'live'}; qa-channel suppresses reasoning delivery` : `answer=${maxAnswer.text}; reasoning=${maxReasoning.text}`" ``` diff --git a/qa/scenarios/plugins/skill-workshop-pending-approval.md b/qa/scenarios/plugins/skill-workshop-pending-approval.md index 91cce8074cce..c1876711f075 100644 --- a/qa/scenarios/plugins/skill-workshop-pending-approval.md +++ b/qa/scenarios/plugins/skill-workshop-pending-approval.md @@ -14,6 +14,9 @@ objective: Verify an explicit pending skill suggestion queues for review, then a plugins: - skill-workshop gatewayConfigPatch: + tools: + alsoAllow: + - skill_workshop plugins: entries: skill-workshop: diff --git a/qa/scenarios/runtime/runtime-inventory-drift-check.md b/qa/scenarios/runtime/runtime-inventory-drift-check.md index 9d3f978a175f..939a035d4d11 100644 --- a/qa/scenarios/runtime/runtime-inventory-drift-check.md +++ b/qa/scenarios/runtime/runtime-inventory-drift-check.md @@ -38,6 +38,9 @@ execution: steps: - name: keeps tools.effective and skills.status aligned after config changes actions: + - call: ensureImageGenerationConfigured + args: + - ref: env - call: writeWorkspaceSkill args: - env: diff --git a/src/agents/pi-tools.before-tool-call.ts b/src/agents/pi-tools.before-tool-call.ts index 7c5c67a19f78..ef0a6e2cd5dc 100644 --- a/src/agents/pi-tools.before-tool-call.ts +++ b/src/agents/pi-tools.before-tool-call.ts @@ -106,6 +106,10 @@ type HookOutcome = } | { blocked: false; params: unknown }; type PluginApprovalRequest = NonNullable; +type BeforeToolCallWrapperOptions = { + approvalMode?: "request" | "report"; + emitDiagnostics: boolean; +}; export type BeforeToolCallPolicyDiagnosticState = { hasBeforeToolCallHook: boolean; @@ -910,7 +914,7 @@ export async function runBeforeToolCallHook(args: { export function wrapToolWithBeforeToolCallHook( tool: AnyAgentTool, ctx?: HookContext, - options: { emitDiagnostics?: boolean } = {}, + options: { approvalMode?: "request" | "report"; emitDiagnostics?: boolean } = {}, ): AnyAgentTool { const execute = tool.execute; if (!execute) { @@ -918,7 +922,10 @@ export function wrapToolWithBeforeToolCallHook( } const toolName = tool.name || "tool"; const diagnosticIdentity = resolveToolDiagnosticIdentity(tool); - const diagnosticOptions = { emitDiagnostics: options.emitDiagnostics !== false }; + const hookOptions: BeforeToolCallWrapperOptions = { + ...(options.approvalMode ? { approvalMode: options.approvalMode } : {}), + emitDiagnostics: options.emitDiagnostics !== false, + }; const wrappedTool: AnyAgentTool = { ...tool, execute: async (toolCallId, params, signal, onUpdate) => { @@ -931,6 +938,7 @@ export function wrapToolWithBeforeToolCallHook( toolCallId, ctx, signal, + approvalMode: hookOptions.approvalMode, }); if (outcome.blocked) { if (outcome.kind !== "veto") { @@ -950,7 +958,7 @@ export function wrapToolWithBeforeToolCallHook( ...(toolCallId && { toolCallId }), paramsSummary: summarizeToolParams(outcome.params ?? hookParams), }; - if (diagnosticOptions.emitDiagnostics) { + if (hookOptions.emitDiagnostics) { emitTrustedDiagnosticEvent({ type: "tool.execution.blocked", ...eventBase, @@ -992,7 +1000,7 @@ export function wrapToolWithBeforeToolCallHook( ...(toolCallId && { toolCallId }), paramsSummary: summarizeToolParams(executeParams), }; - if (diagnosticOptions.emitDiagnostics) { + if (hookOptions.emitDiagnostics) { emitTrustedDiagnosticEvent({ type: "tool.execution.started", ...eventBase, @@ -1014,7 +1022,7 @@ export function wrapToolWithBeforeToolCallHook( toolParams: executeParams, ctx, }); - if (diagnosticOptions.emitDiagnostics) { + if (hookOptions.emitDiagnostics) { if (skillMatch) { emitSkillUsedDiagnostic({ ctx, @@ -1033,7 +1041,7 @@ export function wrapToolWithBeforeToolCallHook( } catch (err) { const cause = unwrapErrorCause(err); const errorCode = diagnosticHttpStatusCode(cause); - if (diagnosticOptions.emitDiagnostics) { + if (hookOptions.emitDiagnostics) { emitTrustedDiagnosticEvent({ type: "tool.execution.error", ...eventBase, @@ -1060,7 +1068,7 @@ export function wrapToolWithBeforeToolCallHook( enumerable: true, }); Object.defineProperty(wrappedTool, BEFORE_TOOL_CALL_DIAGNOSTIC_OPTIONS, { - value: diagnosticOptions, + value: hookOptions, enumerable: false, }); return wrappedTool; @@ -1079,6 +1087,17 @@ export function setBeforeToolCallDiagnosticsEnabled(tool: AnyAgentTool, enabled: } } +export function setBeforeToolCallApprovalMode( + tool: AnyAgentTool, + approvalMode: "request" | "report" | undefined, +): void { + const taggedTool = tool as unknown as Record; + const options = taggedTool[BEFORE_TOOL_CALL_DIAGNOSTIC_OPTIONS]; + if (options && typeof options === "object") { + (options as BeforeToolCallWrapperOptions).approvalMode = approvalMode; + } +} + export function copyBeforeToolCallHookMarker(source: AnyAgentTool, target: AnyAgentTool): void { if (!isToolWrappedWithBeforeToolCallHook(source)) { return; diff --git a/src/mcp/plugin-tools-handlers.ts b/src/mcp/plugin-tools-handlers.ts index dd3e4e07298e..4c208371b47b 100644 --- a/src/mcp/plugin-tools-handlers.ts +++ b/src/mcp/plugin-tools-handlers.ts @@ -1,5 +1,6 @@ import { isToolWrappedWithBeforeToolCallHook, + setBeforeToolCallApprovalMode, wrapToolWithBeforeToolCallHook, } from "../agents/pi-tools.before-tool-call.js"; import type { AnyAgentTool } from "../agents/tools/common.js"; @@ -22,11 +23,12 @@ function resolveJsonSchemaForTool(tool: AnyAgentTool): Record { export function createPluginToolsMcpHandlers(tools: AnyAgentTool[]) { const wrappedTools = tools.map((tool) => { if (isToolWrappedWithBeforeToolCallHook(tool)) { + setBeforeToolCallApprovalMode(tool, "report"); return tool; } // The ACPX MCP bridge should enforce the same pre-execution hook boundary // as the agent and HTTP tool execution paths. - return wrapToolWithBeforeToolCallHook(tool); + return wrapToolWithBeforeToolCallHook(tool, undefined, { approvalMode: "report" }); }); const toolMap = new Map(); for (const tool of wrappedTools) { diff --git a/src/mcp/plugin-tools-serve.test.ts b/src/mcp/plugin-tools-serve.test.ts index e365dcdbbdf6..e9f552f2f39f 100644 --- a/src/mcp/plugin-tools-serve.test.ts +++ b/src/mcp/plugin-tools-serve.test.ts @@ -1,4 +1,5 @@ import { afterEach, describe, expect, it, vi } from "vitest"; +import { wrapToolWithBeforeToolCallHook } from "../agents/pi-tools.before-tool-call.js"; import type { AnyAgentTool } from "../agents/tools/common.js"; import { initializeGlobalHookRunner, @@ -226,7 +227,7 @@ describe("plugin tools MCP server", () => { expect(failed.content).toEqual([{ type: "text", text: "Tool error: boom" }]); }); - it("blocks tool execution when before_tool_call requires approval on the MCP bridge", async () => { + it("reports approval requirements without opening plugin approvals on the MCP bridge", async () => { let hookCalls = 0; const execute = vi.fn().mockResolvedValue({ content: "Stored.", @@ -248,7 +249,6 @@ describe("plugin tools MCP server", () => { }, ]), ); - callGatewayTool.mockRejectedValueOnce(new Error("gateway unavailable")); const tool = { name: "memory_store", description: "Store memory", @@ -262,10 +262,46 @@ describe("plugin tools MCP server", () => { arguments: { text: "remember this" }, }); expect(hookCalls).toBe(1); + expect(callGatewayTool).not.toHaveBeenCalled(); expect(execute).not.toHaveBeenCalled(); expect(result.isError).toBe(true); - expect(result.content).toEqual([ - { type: "text", text: "Tool error: Plugin approval required (gateway unavailable)" }, - ]); + expect(result.content).toEqual([{ type: "text", text: "Tool error: Approval required" }]); + }); + + it("switches pre-wrapped plugin tools to approval report mode on the MCP bridge", async () => { + const execute = vi.fn().mockResolvedValue({ + content: "Stored.", + }); + initializeGlobalHookRunner( + createMockPluginRegistry([ + { + hookName: "before_tool_call", + handler: async () => ({ + requireApproval: { + pluginId: "test-plugin", + title: "Approval required", + description: "Approval required", + }, + }), + }, + ]), + ); + callGatewayTool.mockRejectedValue(new Error("gateway unavailable")); + const tool = wrapToolWithBeforeToolCallHook({ + name: "memory_store", + description: "Store memory", + parameters: { type: "object", properties: {} }, + execute, + } as unknown as AnyAgentTool); + + const handlers = createPluginToolsMcpHandlers([tool]); + const result = await handlers.callTool({ + name: "memory_store", + arguments: { text: "remember this" }, + }); + expect(callGatewayTool).not.toHaveBeenCalled(); + expect(execute).not.toHaveBeenCalled(); + expect(result.isError).toBe(true); + expect(result.content).toEqual([{ type: "text", text: "Tool error: Approval required" }]); }); });