fix(qa): stabilize mock QA scenario contracts

2026-06-06 05:51:15 +08:00 · 2026-05-27 06:22:26 +02:00
parent 7e702bb43d
commit 81c1892c9a
16 changed files with 266 additions and 94 deletions
--- a/extensions/qa-lab/src/providers/mock-openai/server.test.ts
+++ b/extensions/qa-lab/src/providers/mock-openai/server.test.ts
@@ -3859,6 +3859,14 @@ describe("qa mock openai server", () => {
    expect(maxReasoning.summary).toEqual([]);
    expect(outputItem(maxPayload, 1).type).toBe("message");
    expect(outputText(maxPayload, 1)).toBe("THINKING-MAX-OK");
+
+    const maxStream = await expectResponsesText(server, {
+      stream: true,
+      model: "gpt-5.5",
+      input: [makeUserInput(QA_THINKING_VISIBILITY_MAX_PROMPT)],
+    });
+    expect(maxStream).toContain('"type":"response.output_text.delta"');
+    expect(maxStream).toContain('"delta":"THINKING-MAX-OK"');
  });

  it("keeps the reasoning-only side-effect path ready for no-auto-retry QA coverage", async () => {
--- a/extensions/qa-lab/src/providers/mock-openai/server.ts
+++ b/extensions/qa-lab/src/providers/mock-openai/server.ts
@@ -1512,6 +1512,20 @@ function buildReasoningAndAssistantEvents(params: {
        status: "in_progress",
      },
    },
+    {
+      type: "response.output_text.delta",
+      item_id: answerItem.id,
+      output_index: 1,
+      content_index: 0,
+      delta: params.answerText,
+    },
+    {
+      type: "response.output_text.done",
+      item_id: answerItem.id,
+      output_index: 1,
+      content_index: 0,
+      text: params.answerText,
+    },
    {
      type: "response.output_item.done",
      item: answerItem,
@@ -1664,13 +1678,13 @@ async function buildResponsesPayload(
    }
    return buildAssistantEvents("BUG-SHOULD-NOT-AUTO-RETRY");
  }
-  if (QA_THINKING_VISIBILITY_MAX_PROMPT_RE.test(prompt)) {
+  if (QA_THINKING_VISIBILITY_MAX_PROMPT_RE.test(allInputText)) {
    return buildReasoningAndAssistantEvents({
      reasoningId: "rs_mock_thinking_visibility_max",
      answerText: "THINKING-MAX-OK",
    });
  }
-  if (QA_THINKING_VISIBILITY_OFF_PROMPT_RE.test(prompt)) {
+  if (QA_THINKING_VISIBILITY_OFF_PROMPT_RE.test(allInputText)) {
    return buildAssistantEvents("THINKING-OFF-OK");
  }
  if (QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE.test(allInputText)) {
--- a/extensions/qa-lab/src/providers/shared/mock-model-config.ts
+++ b/extensions/qa-lab/src/providers/shared/mock-model-config.ts
@@ -31,7 +31,7 @@ function createMockOpenAiResponsesProvider(baseUrl: string): ModelProviderConfig
        id: "gpt-5.5",
        name: "gpt-5.5",
        api: "openai-responses",
-        reasoning: false,
+        reasoning: true,
        input: ["text", "image"],
        cost: ZERO_COST,
        contextWindow: 128_000,
@@ -41,7 +41,7 @@ function createMockOpenAiResponsesProvider(baseUrl: string): ModelProviderConfig
        id: "gpt-5.5-alt",
        name: "gpt-5.5-alt",
        api: "openai-responses",
-        reasoning: false,
+        reasoning: true,
        input: ["text", "image"],
        cost: ZERO_COST,
        contextWindow: 128_000,
--- a/extensions/qa-lab/src/qa-gateway-config.test.ts
+++ b/extensions/qa-lab/src/qa-gateway-config.test.ts
@@ -55,6 +55,12 @@ describe("buildQaGatewayConfig", () => {
    expect(getPrimaryModel(cfg.agents?.defaults?.model)).toBe("mock-openai/gpt-5.5");
    expect(cfg.models?.providers?.["mock-openai"]?.baseUrl).toBe("http://127.0.0.1:44080/v1");
    expect(cfg.models?.providers?.["mock-openai"]?.request).toEqual({ allowPrivateNetwork: true });
+    expect(cfg.models?.providers?.["mock-openai"]?.models).toEqual(
+      expect.arrayContaining([
+        expect.objectContaining({ id: "gpt-5.5", reasoning: true }),
+        expect.objectContaining({ id: "gpt-5.5-alt", reasoning: true }),
+      ]),
+    );
    expect(cfg.models?.providers?.openai?.baseUrl).toBe("http://127.0.0.1:44080/v1");
    expect(cfg.models?.providers?.openai?.request).toEqual({ allowPrivateNetwork: true });
    expect(cfg.models?.providers?.anthropic?.baseUrl).toBe("http://127.0.0.1:44080");
--- a/extensions/qa-lab/src/scenario-catalog.test.ts
+++ b/extensions/qa-lab/src/scenario-catalog.test.ts
@@ -197,14 +197,18 @@ describe("qa scenario catalog", () => {
    expect(readQaScenarioById("long-context-progress-watchdog").sourcePath).toBe(
      "qa/scenarios/runtime/long-context-progress-watchdog.md",
    );
-    expect(JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow))
-      .toContain("EmbeddedAttemptSessionTakeoverError");
-    expect(JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow))
-      .toContain("AbortError");
-    expect(JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow))
-      .toContain("This operation was aborted");
-    expect(JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow))
-      .toContain("liveTurnTimeoutMs(env, 180000)");
+    expect(
+      JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow),
+    ).toContain("EmbeddedAttemptSessionTakeoverError");
+    expect(
+      JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow),
+    ).toContain("AbortError");
+    expect(
+      JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow),
+    ).toContain("This operation was aborted");
+    expect(
+      JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow),
+    ).toContain("liveTurnTimeoutMs(env, 180000)");
    expect(readQaScenarioExecutionConfig("long-context-progress-watchdog")).toMatchObject({
      requiredProviderMode: "live-frontier",
      harnessRuntime: "codex",
@@ -323,8 +327,8 @@ describe("qa scenario catalog", () => {
    const scenario = readQaScenarioById("gpt55-thinking-visibility-switch");
    const config = readQaScenarioExecutionConfig("gpt55-thinking-visibility-switch") as
      | {
-          requiredLiveProvider?: string;
-          requiredLiveModel?: string;
+          requiredProvider?: string;
+          requiredModel?: string;
          offDirective?: string;
          maxDirective?: string;
          reasoningDirective?: string;
@@ -332,16 +336,15 @@ describe("qa scenario catalog", () => {
      | undefined;

    expect(scenario.sourcePath).toBe("qa/scenarios/models/gpt55-thinking-visibility-switch.md");
-    expect(config?.requiredLiveProvider).toBe("openai");
-    expect(config?.requiredLiveModel).toBe("gpt-5.5");
+    expect(config?.requiredProvider).toBe("openai");
+    expect(config?.requiredModel).toBe("gpt-5.5");
    expect(config?.offDirective).toBe("/think off");
    expect(config?.maxDirective).toBe("/think medium");
    expect(config?.reasoningDirective).toBe("/reasoning on");
    expect(scenario.execution.flow?.steps.map((step) => step.name)).toEqual([
      "enables reasoning display and disables thinking",
      "switches to medium thinking",
-      "verifies medium thinking emits visible reasoning",
-      "verifies medium thinking completes the answer",
+      "verifies medium thinking reaches the provider",
    ]);
  });

--- a/extensions/qa-lab/src/suite-planning.test.ts
+++ b/extensions/qa-lab/src/suite-planning.test.ts
@@ -413,4 +413,30 @@ describe("qa suite planning helpers", () => {
      }).map((scenario) => scenario.id),
    ).toEqual(["generic", "live-only"]);
  });
+
+  it("keeps live-only runtime parity scenarios out of implicit mock selections", () => {
+    const scenarios = [
+      makeQaSuiteTestScenario("generic"),
+      makeQaSuiteTestScenario("live-runtime", {
+        runtimeParityTier: "live-only",
+      }),
+    ];
+
+    expect(
+      selectQaSuiteScenarios({
+        scenarios,
+        providerMode: "mock-openai",
+        primaryModel: "mock-openai/gpt-5.5",
+      }).map((scenario) => scenario.id),
+    ).toEqual(["generic"]);
+
+    expect(
+      selectQaSuiteScenarios({
+        scenarios,
+        scenarioIds: ["live-runtime"],
+        providerMode: "mock-openai",
+        primaryModel: "mock-openai/gpt-5.5",
+      }).map((scenario) => scenario.id),
+    ).toEqual(["live-runtime"]);
+  });
 });
--- a/extensions/qa-lab/src/suite-planning.ts
+++ b/extensions/qa-lab/src/suite-planning.ts
@@ -33,12 +33,16 @@ function scenarioMatchesLiveLane(params: {
  providerMode: QaProviderMode;
  claudeCliAuthMode?: QaCliBackendAuthMode;
 }) {
+  const provider = getQaProvider(params.providerMode);
+  if (params.scenario.runtimeParityTier === "live-only" && provider.kind !== "live") {
+    return false;
+  }
  const config = params.scenario.execution.config ?? {};
  const requiredProviderMode = normalizeQaConfigString(config.requiredProviderMode);
  if (requiredProviderMode && params.providerMode !== requiredProviderMode) {
    return false;
  }
-  if (getQaProvider(params.providerMode).kind !== "live") {
+  if (provider.kind !== "live") {
    return true;
  }
  const selected = splitModelRef(params.primaryModel);
--- a/extensions/qa-lab/src/suite-runtime-agent-tools.test.ts
+++ b/extensions/qa-lab/src/suite-runtime-agent-tools.test.ts
@@ -10,12 +10,14 @@ const listToolsMock = vi.hoisted(() => vi.fn(async () => ({ tools: [] })));
 const callToolMock = vi.hoisted(() => vi.fn(async () => ({ content: [] })));
 const closeMock = vi.hoisted(() => vi.fn(async () => undefined));
 const resolveQaNodeExecPathMock = vi.hoisted(() => vi.fn(async () => "/usr/bin/node"));
+const stderrOnMock = vi.hoisted(() => vi.fn());
 const stdioTransportMock = vi.hoisted(() =>
  vi.fn().mockImplementation(function StdioClientTransport(
-    this: { params?: unknown },
+    this: { params?: unknown; stderr?: { on: typeof stderrOnMock } },
    params: unknown,
  ) {
    this.params = params;
+    this.stderr = { on: stderrOnMock };
  }),
 );

@@ -66,6 +68,7 @@ describe("qa suite runtime agent tools helpers", () => {
    callToolMock.mockReset();
    closeMock.mockClear();
    resolveQaNodeExecPathMock.mockClear();
+    stderrOnMock.mockClear();
    stdioTransportMock.mockClear();
  });

@@ -146,10 +149,17 @@ describe("qa suite runtime agent tools helpers", () => {
        OPENCLAW_KEY: "1",
      },
    });
-    expect(callToolMock).toHaveBeenCalledWith({
-      name: "plugin.echo",
-      arguments: { text: "hello" },
-    });
+    expect(stderrOnMock).toHaveBeenCalledWith("data", expect.any(Function));
+    expect(connectMock).toHaveBeenCalledWith(expect.anything(), { timeout: 180_000 });
+    expect(listToolsMock).toHaveBeenCalledWith({}, { timeout: 180_000 });
+    expect(callToolMock).toHaveBeenCalledWith(
+      {
+        name: "plugin.echo",
+        arguments: { text: "hello" },
+      },
+      undefined,
+      { timeout: 180_000 },
+    );
    expect(closeMock).toHaveBeenCalled();
  });

--- a/extensions/qa-lab/src/suite-runtime-agent-tools.ts
+++ b/extensions/qa-lab/src/suite-runtime-agent-tools.ts
@@ -13,6 +13,8 @@ import type {
 } from "./suite-runtime-types.js";

 const requireFromHere = createRequire(import.meta.url);
+const MCP_STDERR_TAIL_LIMIT = 8_192;
+const MCP_REQUEST_TIMEOUT_MS = 180_000;

 function findSkill(skills: QaSkillStatusEntry[], name: string) {
  return skills.find((skill) => skill.name === name);
@@ -52,10 +54,17 @@ async function callPluginToolsMcp(params: {
    cwd: params.env.gateway.tempRoot,
    env: transportEnv,
  });
+  let stderrTail = "";
+  const stderr = transport.stderr;
+  if (stderr && typeof stderr.on === "function") {
+    stderr.on("data", (chunk: unknown) => {
+      stderrTail = `${stderrTail}${String(chunk)}`.slice(-MCP_STDERR_TAIL_LIMIT);
+    });
+  }
  const client = new Client({ name: "openclaw-qa-suite", version: "0.0.0" }, {});
  try {
-    await client.connect(transport);
-    const listed = await client.listTools();
+    await client.connect(transport, { timeout: MCP_REQUEST_TIMEOUT_MS });
+    const listed = await client.listTools({}, { timeout: MCP_REQUEST_TIMEOUT_MS });
    const tool = listed.tools.find((entry) => entry.name === params.toolName);
    if (!tool) {
      const availableTools = listed.tools
@@ -66,10 +75,22 @@ async function callPluginToolsMcp(params: {
        `MCP tool missing: ${params.toolName}; available tools: ${availableTools.join(", ") || "<none>"}`,
      );
    }
-    return await client.callTool({
-      name: params.toolName,
-      arguments: params.args,
-    });
+    try {
+      return await client.callTool(
+        {
+          name: params.toolName,
+          arguments: params.args,
+        },
+        undefined,
+        { timeout: MCP_REQUEST_TIMEOUT_MS },
+      );
+    } catch (error) {
+      const tail = stderrTail.trim();
+      if (!tail || !(error instanceof Error)) {
+        throw error;
+      }
+      throw new Error(`${error.message}\nMCP stderr tail:\n${tail}`, { cause: error });
+    }
  } finally {
    await client.close().catch(() => {});
  }
--- a/extensions/qa-lab/src/suite-test-helpers.ts
+++ b/extensions/qa-lab/src/suite-test-helpers.ts
@@ -9,6 +9,7 @@ export function makeQaSuiteTestScenario(
    plugins?: string[];
    gatewayConfigPatch?: Record<string, unknown>;
    gatewayRuntime?: { forwardHostHome?: boolean };
+    runtimeParityTier?: QaSuiteTestScenario["runtimeParityTier"];
    surface?: string;
  } = {},
 ): QaSuiteTestScenario {
@@ -18,6 +19,7 @@ export function makeQaSuiteTestScenario(
    surface: params.surface ?? "test",
    objective: "test",
    successCriteria: ["test"],
+    ...(params.runtimeParityTier ? { runtimeParityTier: params.runtimeParityTier } : {}),
    ...(params.plugins ? { plugins: params.plugins } : {}),
    ...(params.gatewayConfigPatch ? { gatewayConfigPatch: params.gatewayConfigPatch } : {}),
    ...(params.gatewayRuntime ? { gatewayRuntime: params.gatewayRuntime } : {}),
--- a/qa/scenarios/models/gpt55-thinking-visibility-switch.md
+++ b/qa/scenarios/models/gpt55-thinking-visibility-switch.md
@@ -13,8 +13,9 @@ objective: Verify GPT-5.5 can switch from disabled thinking to medium thinking w
 successCriteria:
  - Live runs target openai/gpt-5.5, not a mini or pro variant.
  - The session enables reasoning display before the comparison turns.
-  - The disabled-thinking turn returns its visible marker without a non-empty Reasoning summary.
-  - The medium-thinking turn returns its visible marker and a separate Reasoning-prefixed message.
+  - The disabled-thinking turn returns its visible marker without sending a reasoning payload to OpenAI-compatible providers.
+  - The medium-thinking turn sends a medium reasoning request and returns its visible marker.
+  - Transports with a visible reasoning lane expose a separate Reasoning-prefixed message; qa-channel validates provider behavior because generic delivery suppresses reasoning payloads by design.
 docsRefs:
  - docs/tools/thinking.md
  - docs/help/testing.md
@@ -29,8 +30,8 @@ execution:
  kind: flow
  summary: Toggle reasoning display and GPT-5.5 thinking between off/none and medium, then verify visible reasoning only on the medium turn.
  config:
-    requiredLiveProvider: openai
-    requiredLiveModel: gpt-5.5
+    requiredProvider: openai
+    requiredModel: gpt-5.5
    offDirective: /think off
    maxDirective: /think medium
    reasoningDirective: /reasoning on
@@ -77,22 +78,25 @@ steps:
          - lambda:
              expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Reasoning visibility enabled/i.test(candidate.text)).at(-1)"
          - expr: liveTurnTimeoutMs(env, 20000)
-      - call: patchConfig
+      - set: thinkOffCursor
+        value:
+          expr: state.getSnapshot().messages.length
+      - call: state.addInboundMessage
        args:
-          - env:
-              ref: env
-            patch:
-              agents:
-                defaults:
-                  thinkingDefault: "off"
-      - call: waitForGatewayHealthy
+          - conversation:
+              id:
+                expr: config.conversationId
+              kind: direct
+            senderId: qa-operator
+            senderName: QA Operator
+            text:
+              expr: config.offDirective
+      - call: waitForCondition
+        saveAs: thinkOffAck
        args:
-          - ref: env
-          - 60000
-      - call: waitForQaChannelReady
-        args:
-          - ref: env
-          - 60000
+          - lambda:
+              expr: "state.getSnapshot().messages.slice(thinkOffCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking disabled/i.test(candidate.text)).at(-1)"
+          - expr: liveTurnTimeoutMs(env, 20000)
      - set: offCursor
        value:
          expr: state.getSnapshot().messages.length
@@ -111,7 +115,7 @@ steps:
        args:
          - lambda:
              expr: "state.getSnapshot().messages.slice(offCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && candidate.text.includes(config.offMarker)).at(-1)"
-          - expr: liveTurnTimeoutMs(env, 30000)
+          - expr: liveTurnTimeoutMs(env, 90000)
      - set: offMessages
        value:
          expr: "state.getSnapshot().messages.slice(offCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId)"
@@ -136,27 +140,34 @@ steps:
                expr: "String(offRequest?.model ?? '').includes('gpt-5.5')"
                message:
                  expr: "`expected GPT-5.5 off mock request, got ${String(offRequest?.model ?? '')}`"
-    detailsExpr: "`reasoning ack=${reasoningAck.text}; off answer=${offAnswer.text}`"
+            - assert:
+                expr: "offRequest?.body && !Object.prototype.hasOwnProperty.call(offRequest.body, 'reasoning')"
+                message:
+                  expr: "`disabled thinking should omit OpenAI reasoning payload, got ${JSON.stringify(offRequest?.body?.reasoning ?? null)}`"
+    detailsExpr: "`reasoning ack=${reasoningAck.text}; thinking off=${thinkOffAck.text}; off answer=${offAnswer.text}`"
  - name: switches to medium thinking
    actions:
-      - call: patchConfig
+      - set: thinkMediumCursor
+        value:
+          expr: state.getSnapshot().messages.length
+      - call: state.addInboundMessage
        args:
-          - env:
-              ref: env
-            patch:
-              agents:
-                defaults:
-                  thinkingDefault: "medium"
-      - call: waitForGatewayHealthy
+          - conversation:
+              id:
+                expr: config.conversationId
+              kind: direct
+            senderId: qa-operator
+            senderName: QA Operator
+            text:
+              expr: config.maxDirective
+      - call: waitForCondition
+        saveAs: thinkMediumAck
        args:
-          - ref: env
-          - 60000
-      - call: waitForQaChannelReady
-        args:
-          - ref: env
-          - 60000
-    detailsExpr: "`thinking default patched to medium`"
-  - name: verifies medium thinking emits visible reasoning
+          - lambda:
+              expr: "state.getSnapshot().messages.slice(thinkMediumCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking level set to medium/i.test(candidate.text)).at(-1)"
+          - expr: liveTurnTimeoutMs(env, 20000)
+    detailsExpr: "`thinking medium=${thinkMediumAck.text}`"
+  - name: verifies medium thinking reaches the provider
    actions:
      - set: maxCursor
        value:
@@ -171,19 +182,6 @@ steps:
            senderName: QA Operator
            text:
              expr: config.maxPrompt
-      - call: waitForCondition
-        saveAs: maxReasoning
-        args:
-          - lambda:
-              expr: "state.getSnapshot().messages.slice(maxCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && candidate.text.trimStart().startsWith('Reasoning:')).at(-1)"
-          - expr: liveTurnTimeoutMs(env, 120000)
-      - assert:
-          expr: "maxReasoning.text.trimStart().startsWith('Reasoning:')"
-          message:
-            expr: "`missing max reasoning message near answer: ${recentOutboundSummary(state, 6)}`"
-    detailsExpr: "`reasoning=${maxReasoning.text}`"
-  - name: verifies medium thinking completes the answer
-    actions:
      - call: waitForCondition
        saveAs: maxAnswer
        args:
@@ -193,7 +191,7 @@ steps:
      - assert:
          expr: "maxAnswer.text.includes(config.maxMarker)"
          message:
-            expr: "`missing max marker: ${maxAnswer.text}`"
+            expr: "`missing max marker near answer: ${recentOutboundSummary(state, 6)}`"
      - if:
          expr: "Boolean(env.mock)"
          then:
@@ -207,5 +205,22 @@ steps:
                expr: "String(maxRequest?.model ?? '').includes('gpt-5.5')"
                message:
                  expr: "`expected GPT-5.5 mock request, got ${String(maxRequest?.model ?? '')}`"
-    detailsExpr: "`answer=${maxAnswer.text}`"
+            - assert:
+                expr: "maxRequest?.body?.reasoning?.effort === 'medium'"
+                message:
+                  expr: "`expected medium OpenAI reasoning payload, got ${JSON.stringify(maxRequest?.body?.reasoning ?? null)}`"
+      - if:
+          expr: "env.transport.id !== 'qa-channel'"
+          then:
+            - call: waitForCondition
+              saveAs: maxReasoning
+              args:
+                - lambda:
+                    expr: "state.getSnapshot().messages.slice(maxCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && candidate.text.trimStart().startsWith('Reasoning:')).at(-1)"
+                - expr: liveTurnTimeoutMs(env, 120000)
+            - assert:
+                expr: "maxReasoning.text.trimStart().startsWith('Reasoning:')"
+                message:
+                  expr: "`missing max reasoning message near answer: ${recentOutboundSummary(state, 6)}`"
+    detailsExpr: "env.transport.id === 'qa-channel' ? `answer=${maxAnswer.text}; medium reasoning=${env.mock ? String(maxRequest?.body?.reasoning?.effort ?? '') : 'live'}; qa-channel suppresses reasoning delivery` : `answer=${maxAnswer.text}; reasoning=${maxReasoning.text}`"
 ```
--- a/qa/scenarios/plugins/skill-workshop-pending-approval.md
+++ b/qa/scenarios/plugins/skill-workshop-pending-approval.md
@@ -14,6 +14,9 @@ objective: Verify an explicit pending skill suggestion queues for review, then a
 plugins:
  - skill-workshop
 gatewayConfigPatch:
+  tools:
+    alsoAllow:
+      - skill_workshop
  plugins:
    entries:
      skill-workshop:
--- a/qa/scenarios/runtime/runtime-inventory-drift-check.md
+++ b/qa/scenarios/runtime/runtime-inventory-drift-check.md
@@ -38,6 +38,9 @@ execution:
 steps:
  - name: keeps tools.effective and skills.status aligned after config changes
    actions:
+      - call: ensureImageGenerationConfigured
+        args:
+          - ref: env
      - call: writeWorkspaceSkill
        args:
          - env:
--- a/src/agents/pi-tools.before-tool-call.ts
+++ b/src/agents/pi-tools.before-tool-call.ts
@@ -106,6 +106,10 @@ type HookOutcome =
    }
  | { blocked: false; params: unknown };
 type PluginApprovalRequest = NonNullable<PluginHookBeforeToolCallResult["requireApproval"]>;
+type BeforeToolCallWrapperOptions = {
+  approvalMode?: "request" | "report";
+  emitDiagnostics: boolean;
+};

 export type BeforeToolCallPolicyDiagnosticState = {
  hasBeforeToolCallHook: boolean;
@@ -910,7 +914,7 @@ export async function runBeforeToolCallHook(args: {
 export function wrapToolWithBeforeToolCallHook(
  tool: AnyAgentTool,
  ctx?: HookContext,
-  options: { emitDiagnostics?: boolean } = {},
+  options: { approvalMode?: "request" | "report"; emitDiagnostics?: boolean } = {},
 ): AnyAgentTool {
  const execute = tool.execute;
  if (!execute) {
@@ -918,7 +922,10 @@ export function wrapToolWithBeforeToolCallHook(
  }
  const toolName = tool.name || "tool";
  const diagnosticIdentity = resolveToolDiagnosticIdentity(tool);
-  const diagnosticOptions = { emitDiagnostics: options.emitDiagnostics !== false };
+  const hookOptions: BeforeToolCallWrapperOptions = {
+    ...(options.approvalMode ? { approvalMode: options.approvalMode } : {}),
+    emitDiagnostics: options.emitDiagnostics !== false,
+  };
  const wrappedTool: AnyAgentTool = {
    ...tool,
    execute: async (toolCallId, params, signal, onUpdate) => {
@@ -931,6 +938,7 @@ export function wrapToolWithBeforeToolCallHook(
        toolCallId,
        ctx,
        signal,
+        approvalMode: hookOptions.approvalMode,
      });
      if (outcome.blocked) {
        if (outcome.kind !== "veto") {
@@ -950,7 +958,7 @@ export function wrapToolWithBeforeToolCallHook(
          ...(toolCallId && { toolCallId }),
          paramsSummary: summarizeToolParams(outcome.params ?? hookParams),
        };
-        if (diagnosticOptions.emitDiagnostics) {
+        if (hookOptions.emitDiagnostics) {
          emitTrustedDiagnosticEvent({
            type: "tool.execution.blocked",
            ...eventBase,
@@ -992,7 +1000,7 @@ export function wrapToolWithBeforeToolCallHook(
        ...(toolCallId && { toolCallId }),
        paramsSummary: summarizeToolParams(executeParams),
      };
-      if (diagnosticOptions.emitDiagnostics) {
+      if (hookOptions.emitDiagnostics) {
        emitTrustedDiagnosticEvent({
          type: "tool.execution.started",
          ...eventBase,
@@ -1014,7 +1022,7 @@ export function wrapToolWithBeforeToolCallHook(
          toolParams: executeParams,
          ctx,
        });
-        if (diagnosticOptions.emitDiagnostics) {
+        if (hookOptions.emitDiagnostics) {
          if (skillMatch) {
            emitSkillUsedDiagnostic({
              ctx,
@@ -1033,7 +1041,7 @@ export function wrapToolWithBeforeToolCallHook(
      } catch (err) {
        const cause = unwrapErrorCause(err);
        const errorCode = diagnosticHttpStatusCode(cause);
-        if (diagnosticOptions.emitDiagnostics) {
+        if (hookOptions.emitDiagnostics) {
          emitTrustedDiagnosticEvent({
            type: "tool.execution.error",
            ...eventBase,
@@ -1060,7 +1068,7 @@ export function wrapToolWithBeforeToolCallHook(
    enumerable: true,
  });
  Object.defineProperty(wrappedTool, BEFORE_TOOL_CALL_DIAGNOSTIC_OPTIONS, {
-    value: diagnosticOptions,
+    value: hookOptions,
    enumerable: false,
  });
  return wrappedTool;
@@ -1079,6 +1087,17 @@ export function setBeforeToolCallDiagnosticsEnabled(tool: AnyAgentTool, enabled:
  }
 }

+export function setBeforeToolCallApprovalMode(
+  tool: AnyAgentTool,
+  approvalMode: "request" | "report" | undefined,
+): void {
+  const taggedTool = tool as unknown as Record<symbol, unknown>;
+  const options = taggedTool[BEFORE_TOOL_CALL_DIAGNOSTIC_OPTIONS];
+  if (options && typeof options === "object") {
+    (options as BeforeToolCallWrapperOptions).approvalMode = approvalMode;
+  }
+}
+
 export function copyBeforeToolCallHookMarker(source: AnyAgentTool, target: AnyAgentTool): void {
  if (!isToolWrappedWithBeforeToolCallHook(source)) {
    return;
--- a/src/mcp/plugin-tools-handlers.ts
+++ b/src/mcp/plugin-tools-handlers.ts
@@ -1,5 +1,6 @@
 import {
  isToolWrappedWithBeforeToolCallHook,
+  setBeforeToolCallApprovalMode,
  wrapToolWithBeforeToolCallHook,
 } from "../agents/pi-tools.before-tool-call.js";
 import type { AnyAgentTool } from "../agents/tools/common.js";
@@ -22,11 +23,12 @@ function resolveJsonSchemaForTool(tool: AnyAgentTool): Record<string, unknown> {
 export function createPluginToolsMcpHandlers(tools: AnyAgentTool[]) {
  const wrappedTools = tools.map((tool) => {
    if (isToolWrappedWithBeforeToolCallHook(tool)) {
+      setBeforeToolCallApprovalMode(tool, "report");
      return tool;
    }
    // The ACPX MCP bridge should enforce the same pre-execution hook boundary
    // as the agent and HTTP tool execution paths.
-    return wrapToolWithBeforeToolCallHook(tool);
+    return wrapToolWithBeforeToolCallHook(tool, undefined, { approvalMode: "report" });
  });
  const toolMap = new Map<string, AnyAgentTool>();
  for (const tool of wrappedTools) {
--- a/src/mcp/plugin-tools-serve.test.ts
+++ b/src/mcp/plugin-tools-serve.test.ts
@@ -1,4 +1,5 @@
 import { afterEach, describe, expect, it, vi } from "vitest";
+import { wrapToolWithBeforeToolCallHook } from "../agents/pi-tools.before-tool-call.js";
 import type { AnyAgentTool } from "../agents/tools/common.js";
 import {
  initializeGlobalHookRunner,
@@ -226,7 +227,7 @@ describe("plugin tools MCP server", () => {
    expect(failed.content).toEqual([{ type: "text", text: "Tool error: boom" }]);
  });

-  it("blocks tool execution when before_tool_call requires approval on the MCP bridge", async () => {
+  it("reports approval requirements without opening plugin approvals on the MCP bridge", async () => {
    let hookCalls = 0;
    const execute = vi.fn().mockResolvedValue({
      content: "Stored.",
@@ -248,7 +249,6 @@ describe("plugin tools MCP server", () => {
        },
      ]),
    );
-    callGatewayTool.mockRejectedValueOnce(new Error("gateway unavailable"));
    const tool = {
      name: "memory_store",
      description: "Store memory",
@@ -262,10 +262,46 @@ describe("plugin tools MCP server", () => {
      arguments: { text: "remember this" },
    });
    expect(hookCalls).toBe(1);
+    expect(callGatewayTool).not.toHaveBeenCalled();
    expect(execute).not.toHaveBeenCalled();
    expect(result.isError).toBe(true);
-    expect(result.content).toEqual([
-      { type: "text", text: "Tool error: Plugin approval required (gateway unavailable)" },
-    ]);
+    expect(result.content).toEqual([{ type: "text", text: "Tool error: Approval required" }]);
+  });
+
+  it("switches pre-wrapped plugin tools to approval report mode on the MCP bridge", async () => {
+    const execute = vi.fn().mockResolvedValue({
+      content: "Stored.",
+    });
+    initializeGlobalHookRunner(
+      createMockPluginRegistry([
+        {
+          hookName: "before_tool_call",
+          handler: async () => ({
+            requireApproval: {
+              pluginId: "test-plugin",
+              title: "Approval required",
+              description: "Approval required",
+            },
+          }),
+        },
+      ]),
+    );
+    callGatewayTool.mockRejectedValue(new Error("gateway unavailable"));
+    const tool = wrapToolWithBeforeToolCallHook({
+      name: "memory_store",
+      description: "Store memory",
+      parameters: { type: "object", properties: {} },
+      execute,
+    } as unknown as AnyAgentTool);
+
+    const handlers = createPluginToolsMcpHandlers([tool]);
+    const result = await handlers.callTool({
+      name: "memory_store",
+      arguments: { text: "remember this" },
+    });
+    expect(callGatewayTool).not.toHaveBeenCalled();
+    expect(execute).not.toHaveBeenCalled();
+    expect(result.isError).toBe(true);
+    expect(result.content).toEqual([{ type: "text", text: "Tool error: Approval required" }]);
  });
 });