mirror of
https://github.com/openclaw/openclaw.git
synced 2026-06-06 05:51:15 +08:00
fix(qa): stabilize mock QA scenario contracts
This commit is contained in:
@@ -3859,6 +3859,14 @@ describe("qa mock openai server", () => {
|
||||
expect(maxReasoning.summary).toEqual([]);
|
||||
expect(outputItem(maxPayload, 1).type).toBe("message");
|
||||
expect(outputText(maxPayload, 1)).toBe("THINKING-MAX-OK");
|
||||
|
||||
const maxStream = await expectResponsesText(server, {
|
||||
stream: true,
|
||||
model: "gpt-5.5",
|
||||
input: [makeUserInput(QA_THINKING_VISIBILITY_MAX_PROMPT)],
|
||||
});
|
||||
expect(maxStream).toContain('"type":"response.output_text.delta"');
|
||||
expect(maxStream).toContain('"delta":"THINKING-MAX-OK"');
|
||||
});
|
||||
|
||||
it("keeps the reasoning-only side-effect path ready for no-auto-retry QA coverage", async () => {
|
||||
|
||||
@@ -1512,6 +1512,20 @@ function buildReasoningAndAssistantEvents(params: {
|
||||
status: "in_progress",
|
||||
},
|
||||
},
|
||||
{
|
||||
type: "response.output_text.delta",
|
||||
item_id: answerItem.id,
|
||||
output_index: 1,
|
||||
content_index: 0,
|
||||
delta: params.answerText,
|
||||
},
|
||||
{
|
||||
type: "response.output_text.done",
|
||||
item_id: answerItem.id,
|
||||
output_index: 1,
|
||||
content_index: 0,
|
||||
text: params.answerText,
|
||||
},
|
||||
{
|
||||
type: "response.output_item.done",
|
||||
item: answerItem,
|
||||
@@ -1664,13 +1678,13 @@ async function buildResponsesPayload(
|
||||
}
|
||||
return buildAssistantEvents("BUG-SHOULD-NOT-AUTO-RETRY");
|
||||
}
|
||||
if (QA_THINKING_VISIBILITY_MAX_PROMPT_RE.test(prompt)) {
|
||||
if (QA_THINKING_VISIBILITY_MAX_PROMPT_RE.test(allInputText)) {
|
||||
return buildReasoningAndAssistantEvents({
|
||||
reasoningId: "rs_mock_thinking_visibility_max",
|
||||
answerText: "THINKING-MAX-OK",
|
||||
});
|
||||
}
|
||||
if (QA_THINKING_VISIBILITY_OFF_PROMPT_RE.test(prompt)) {
|
||||
if (QA_THINKING_VISIBILITY_OFF_PROMPT_RE.test(allInputText)) {
|
||||
return buildAssistantEvents("THINKING-OFF-OK");
|
||||
}
|
||||
if (QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE.test(allInputText)) {
|
||||
|
||||
@@ -31,7 +31,7 @@ function createMockOpenAiResponsesProvider(baseUrl: string): ModelProviderConfig
|
||||
id: "gpt-5.5",
|
||||
name: "gpt-5.5",
|
||||
api: "openai-responses",
|
||||
reasoning: false,
|
||||
reasoning: true,
|
||||
input: ["text", "image"],
|
||||
cost: ZERO_COST,
|
||||
contextWindow: 128_000,
|
||||
@@ -41,7 +41,7 @@ function createMockOpenAiResponsesProvider(baseUrl: string): ModelProviderConfig
|
||||
id: "gpt-5.5-alt",
|
||||
name: "gpt-5.5-alt",
|
||||
api: "openai-responses",
|
||||
reasoning: false,
|
||||
reasoning: true,
|
||||
input: ["text", "image"],
|
||||
cost: ZERO_COST,
|
||||
contextWindow: 128_000,
|
||||
|
||||
@@ -55,6 +55,12 @@ describe("buildQaGatewayConfig", () => {
|
||||
expect(getPrimaryModel(cfg.agents?.defaults?.model)).toBe("mock-openai/gpt-5.5");
|
||||
expect(cfg.models?.providers?.["mock-openai"]?.baseUrl).toBe("http://127.0.0.1:44080/v1");
|
||||
expect(cfg.models?.providers?.["mock-openai"]?.request).toEqual({ allowPrivateNetwork: true });
|
||||
expect(cfg.models?.providers?.["mock-openai"]?.models).toEqual(
|
||||
expect.arrayContaining([
|
||||
expect.objectContaining({ id: "gpt-5.5", reasoning: true }),
|
||||
expect.objectContaining({ id: "gpt-5.5-alt", reasoning: true }),
|
||||
]),
|
||||
);
|
||||
expect(cfg.models?.providers?.openai?.baseUrl).toBe("http://127.0.0.1:44080/v1");
|
||||
expect(cfg.models?.providers?.openai?.request).toEqual({ allowPrivateNetwork: true });
|
||||
expect(cfg.models?.providers?.anthropic?.baseUrl).toBe("http://127.0.0.1:44080");
|
||||
|
||||
@@ -197,14 +197,18 @@ describe("qa scenario catalog", () => {
|
||||
expect(readQaScenarioById("long-context-progress-watchdog").sourcePath).toBe(
|
||||
"qa/scenarios/runtime/long-context-progress-watchdog.md",
|
||||
);
|
||||
expect(JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow))
|
||||
.toContain("EmbeddedAttemptSessionTakeoverError");
|
||||
expect(JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow))
|
||||
.toContain("AbortError");
|
||||
expect(JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow))
|
||||
.toContain("This operation was aborted");
|
||||
expect(JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow))
|
||||
.toContain("liveTurnTimeoutMs(env, 180000)");
|
||||
expect(
|
||||
JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow),
|
||||
).toContain("EmbeddedAttemptSessionTakeoverError");
|
||||
expect(
|
||||
JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow),
|
||||
).toContain("AbortError");
|
||||
expect(
|
||||
JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow),
|
||||
).toContain("This operation was aborted");
|
||||
expect(
|
||||
JSON.stringify(readQaScenarioById("gateway-restart-inflight-run").execution.flow),
|
||||
).toContain("liveTurnTimeoutMs(env, 180000)");
|
||||
expect(readQaScenarioExecutionConfig("long-context-progress-watchdog")).toMatchObject({
|
||||
requiredProviderMode: "live-frontier",
|
||||
harnessRuntime: "codex",
|
||||
@@ -323,8 +327,8 @@ describe("qa scenario catalog", () => {
|
||||
const scenario = readQaScenarioById("gpt55-thinking-visibility-switch");
|
||||
const config = readQaScenarioExecutionConfig("gpt55-thinking-visibility-switch") as
|
||||
| {
|
||||
requiredLiveProvider?: string;
|
||||
requiredLiveModel?: string;
|
||||
requiredProvider?: string;
|
||||
requiredModel?: string;
|
||||
offDirective?: string;
|
||||
maxDirective?: string;
|
||||
reasoningDirective?: string;
|
||||
@@ -332,16 +336,15 @@ describe("qa scenario catalog", () => {
|
||||
| undefined;
|
||||
|
||||
expect(scenario.sourcePath).toBe("qa/scenarios/models/gpt55-thinking-visibility-switch.md");
|
||||
expect(config?.requiredLiveProvider).toBe("openai");
|
||||
expect(config?.requiredLiveModel).toBe("gpt-5.5");
|
||||
expect(config?.requiredProvider).toBe("openai");
|
||||
expect(config?.requiredModel).toBe("gpt-5.5");
|
||||
expect(config?.offDirective).toBe("/think off");
|
||||
expect(config?.maxDirective).toBe("/think medium");
|
||||
expect(config?.reasoningDirective).toBe("/reasoning on");
|
||||
expect(scenario.execution.flow?.steps.map((step) => step.name)).toEqual([
|
||||
"enables reasoning display and disables thinking",
|
||||
"switches to medium thinking",
|
||||
"verifies medium thinking emits visible reasoning",
|
||||
"verifies medium thinking completes the answer",
|
||||
"verifies medium thinking reaches the provider",
|
||||
]);
|
||||
});
|
||||
|
||||
|
||||
@@ -413,4 +413,30 @@ describe("qa suite planning helpers", () => {
|
||||
}).map((scenario) => scenario.id),
|
||||
).toEqual(["generic", "live-only"]);
|
||||
});
|
||||
|
||||
it("keeps live-only runtime parity scenarios out of implicit mock selections", () => {
|
||||
const scenarios = [
|
||||
makeQaSuiteTestScenario("generic"),
|
||||
makeQaSuiteTestScenario("live-runtime", {
|
||||
runtimeParityTier: "live-only",
|
||||
}),
|
||||
];
|
||||
|
||||
expect(
|
||||
selectQaSuiteScenarios({
|
||||
scenarios,
|
||||
providerMode: "mock-openai",
|
||||
primaryModel: "mock-openai/gpt-5.5",
|
||||
}).map((scenario) => scenario.id),
|
||||
).toEqual(["generic"]);
|
||||
|
||||
expect(
|
||||
selectQaSuiteScenarios({
|
||||
scenarios,
|
||||
scenarioIds: ["live-runtime"],
|
||||
providerMode: "mock-openai",
|
||||
primaryModel: "mock-openai/gpt-5.5",
|
||||
}).map((scenario) => scenario.id),
|
||||
).toEqual(["live-runtime"]);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -33,12 +33,16 @@ function scenarioMatchesLiveLane(params: {
|
||||
providerMode: QaProviderMode;
|
||||
claudeCliAuthMode?: QaCliBackendAuthMode;
|
||||
}) {
|
||||
const provider = getQaProvider(params.providerMode);
|
||||
if (params.scenario.runtimeParityTier === "live-only" && provider.kind !== "live") {
|
||||
return false;
|
||||
}
|
||||
const config = params.scenario.execution.config ?? {};
|
||||
const requiredProviderMode = normalizeQaConfigString(config.requiredProviderMode);
|
||||
if (requiredProviderMode && params.providerMode !== requiredProviderMode) {
|
||||
return false;
|
||||
}
|
||||
if (getQaProvider(params.providerMode).kind !== "live") {
|
||||
if (provider.kind !== "live") {
|
||||
return true;
|
||||
}
|
||||
const selected = splitModelRef(params.primaryModel);
|
||||
|
||||
@@ -10,12 +10,14 @@ const listToolsMock = vi.hoisted(() => vi.fn(async () => ({ tools: [] })));
|
||||
const callToolMock = vi.hoisted(() => vi.fn(async () => ({ content: [] })));
|
||||
const closeMock = vi.hoisted(() => vi.fn(async () => undefined));
|
||||
const resolveQaNodeExecPathMock = vi.hoisted(() => vi.fn(async () => "/usr/bin/node"));
|
||||
const stderrOnMock = vi.hoisted(() => vi.fn());
|
||||
const stdioTransportMock = vi.hoisted(() =>
|
||||
vi.fn().mockImplementation(function StdioClientTransport(
|
||||
this: { params?: unknown },
|
||||
this: { params?: unknown; stderr?: { on: typeof stderrOnMock } },
|
||||
params: unknown,
|
||||
) {
|
||||
this.params = params;
|
||||
this.stderr = { on: stderrOnMock };
|
||||
}),
|
||||
);
|
||||
|
||||
@@ -66,6 +68,7 @@ describe("qa suite runtime agent tools helpers", () => {
|
||||
callToolMock.mockReset();
|
||||
closeMock.mockClear();
|
||||
resolveQaNodeExecPathMock.mockClear();
|
||||
stderrOnMock.mockClear();
|
||||
stdioTransportMock.mockClear();
|
||||
});
|
||||
|
||||
@@ -146,10 +149,17 @@ describe("qa suite runtime agent tools helpers", () => {
|
||||
OPENCLAW_KEY: "1",
|
||||
},
|
||||
});
|
||||
expect(callToolMock).toHaveBeenCalledWith({
|
||||
name: "plugin.echo",
|
||||
arguments: { text: "hello" },
|
||||
});
|
||||
expect(stderrOnMock).toHaveBeenCalledWith("data", expect.any(Function));
|
||||
expect(connectMock).toHaveBeenCalledWith(expect.anything(), { timeout: 180_000 });
|
||||
expect(listToolsMock).toHaveBeenCalledWith({}, { timeout: 180_000 });
|
||||
expect(callToolMock).toHaveBeenCalledWith(
|
||||
{
|
||||
name: "plugin.echo",
|
||||
arguments: { text: "hello" },
|
||||
},
|
||||
undefined,
|
||||
{ timeout: 180_000 },
|
||||
);
|
||||
expect(closeMock).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
|
||||
@@ -13,6 +13,8 @@ import type {
|
||||
} from "./suite-runtime-types.js";
|
||||
|
||||
const requireFromHere = createRequire(import.meta.url);
|
||||
const MCP_STDERR_TAIL_LIMIT = 8_192;
|
||||
const MCP_REQUEST_TIMEOUT_MS = 180_000;
|
||||
|
||||
function findSkill(skills: QaSkillStatusEntry[], name: string) {
|
||||
return skills.find((skill) => skill.name === name);
|
||||
@@ -52,10 +54,17 @@ async function callPluginToolsMcp(params: {
|
||||
cwd: params.env.gateway.tempRoot,
|
||||
env: transportEnv,
|
||||
});
|
||||
let stderrTail = "";
|
||||
const stderr = transport.stderr;
|
||||
if (stderr && typeof stderr.on === "function") {
|
||||
stderr.on("data", (chunk: unknown) => {
|
||||
stderrTail = `${stderrTail}${String(chunk)}`.slice(-MCP_STDERR_TAIL_LIMIT);
|
||||
});
|
||||
}
|
||||
const client = new Client({ name: "openclaw-qa-suite", version: "0.0.0" }, {});
|
||||
try {
|
||||
await client.connect(transport);
|
||||
const listed = await client.listTools();
|
||||
await client.connect(transport, { timeout: MCP_REQUEST_TIMEOUT_MS });
|
||||
const listed = await client.listTools({}, { timeout: MCP_REQUEST_TIMEOUT_MS });
|
||||
const tool = listed.tools.find((entry) => entry.name === params.toolName);
|
||||
if (!tool) {
|
||||
const availableTools = listed.tools
|
||||
@@ -66,10 +75,22 @@ async function callPluginToolsMcp(params: {
|
||||
`MCP tool missing: ${params.toolName}; available tools: ${availableTools.join(", ") || "<none>"}`,
|
||||
);
|
||||
}
|
||||
return await client.callTool({
|
||||
name: params.toolName,
|
||||
arguments: params.args,
|
||||
});
|
||||
try {
|
||||
return await client.callTool(
|
||||
{
|
||||
name: params.toolName,
|
||||
arguments: params.args,
|
||||
},
|
||||
undefined,
|
||||
{ timeout: MCP_REQUEST_TIMEOUT_MS },
|
||||
);
|
||||
} catch (error) {
|
||||
const tail = stderrTail.trim();
|
||||
if (!tail || !(error instanceof Error)) {
|
||||
throw error;
|
||||
}
|
||||
throw new Error(`${error.message}\nMCP stderr tail:\n${tail}`, { cause: error });
|
||||
}
|
||||
} finally {
|
||||
await client.close().catch(() => {});
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ export function makeQaSuiteTestScenario(
|
||||
plugins?: string[];
|
||||
gatewayConfigPatch?: Record<string, unknown>;
|
||||
gatewayRuntime?: { forwardHostHome?: boolean };
|
||||
runtimeParityTier?: QaSuiteTestScenario["runtimeParityTier"];
|
||||
surface?: string;
|
||||
} = {},
|
||||
): QaSuiteTestScenario {
|
||||
@@ -18,6 +19,7 @@ export function makeQaSuiteTestScenario(
|
||||
surface: params.surface ?? "test",
|
||||
objective: "test",
|
||||
successCriteria: ["test"],
|
||||
...(params.runtimeParityTier ? { runtimeParityTier: params.runtimeParityTier } : {}),
|
||||
...(params.plugins ? { plugins: params.plugins } : {}),
|
||||
...(params.gatewayConfigPatch ? { gatewayConfigPatch: params.gatewayConfigPatch } : {}),
|
||||
...(params.gatewayRuntime ? { gatewayRuntime: params.gatewayRuntime } : {}),
|
||||
|
||||
@@ -13,8 +13,9 @@ objective: Verify GPT-5.5 can switch from disabled thinking to medium thinking w
|
||||
successCriteria:
|
||||
- Live runs target openai/gpt-5.5, not a mini or pro variant.
|
||||
- The session enables reasoning display before the comparison turns.
|
||||
- The disabled-thinking turn returns its visible marker without a non-empty Reasoning summary.
|
||||
- The medium-thinking turn returns its visible marker and a separate Reasoning-prefixed message.
|
||||
- The disabled-thinking turn returns its visible marker without sending a reasoning payload to OpenAI-compatible providers.
|
||||
- The medium-thinking turn sends a medium reasoning request and returns its visible marker.
|
||||
- Transports with a visible reasoning lane expose a separate Reasoning-prefixed message; qa-channel validates provider behavior because generic delivery suppresses reasoning payloads by design.
|
||||
docsRefs:
|
||||
- docs/tools/thinking.md
|
||||
- docs/help/testing.md
|
||||
@@ -29,8 +30,8 @@ execution:
|
||||
kind: flow
|
||||
summary: Toggle reasoning display and GPT-5.5 thinking between off/none and medium, then verify visible reasoning only on the medium turn.
|
||||
config:
|
||||
requiredLiveProvider: openai
|
||||
requiredLiveModel: gpt-5.5
|
||||
requiredProvider: openai
|
||||
requiredModel: gpt-5.5
|
||||
offDirective: /think off
|
||||
maxDirective: /think medium
|
||||
reasoningDirective: /reasoning on
|
||||
@@ -77,22 +78,25 @@ steps:
|
||||
- lambda:
|
||||
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Reasoning visibility enabled/i.test(candidate.text)).at(-1)"
|
||||
- expr: liveTurnTimeoutMs(env, 20000)
|
||||
- call: patchConfig
|
||||
- set: thinkOffCursor
|
||||
value:
|
||||
expr: state.getSnapshot().messages.length
|
||||
- call: state.addInboundMessage
|
||||
args:
|
||||
- env:
|
||||
ref: env
|
||||
patch:
|
||||
agents:
|
||||
defaults:
|
||||
thinkingDefault: "off"
|
||||
- call: waitForGatewayHealthy
|
||||
- conversation:
|
||||
id:
|
||||
expr: config.conversationId
|
||||
kind: direct
|
||||
senderId: qa-operator
|
||||
senderName: QA Operator
|
||||
text:
|
||||
expr: config.offDirective
|
||||
- call: waitForCondition
|
||||
saveAs: thinkOffAck
|
||||
args:
|
||||
- ref: env
|
||||
- 60000
|
||||
- call: waitForQaChannelReady
|
||||
args:
|
||||
- ref: env
|
||||
- 60000
|
||||
- lambda:
|
||||
expr: "state.getSnapshot().messages.slice(thinkOffCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking disabled/i.test(candidate.text)).at(-1)"
|
||||
- expr: liveTurnTimeoutMs(env, 20000)
|
||||
- set: offCursor
|
||||
value:
|
||||
expr: state.getSnapshot().messages.length
|
||||
@@ -111,7 +115,7 @@ steps:
|
||||
args:
|
||||
- lambda:
|
||||
expr: "state.getSnapshot().messages.slice(offCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && candidate.text.includes(config.offMarker)).at(-1)"
|
||||
- expr: liveTurnTimeoutMs(env, 30000)
|
||||
- expr: liveTurnTimeoutMs(env, 90000)
|
||||
- set: offMessages
|
||||
value:
|
||||
expr: "state.getSnapshot().messages.slice(offCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId)"
|
||||
@@ -136,27 +140,34 @@ steps:
|
||||
expr: "String(offRequest?.model ?? '').includes('gpt-5.5')"
|
||||
message:
|
||||
expr: "`expected GPT-5.5 off mock request, got ${String(offRequest?.model ?? '')}`"
|
||||
detailsExpr: "`reasoning ack=${reasoningAck.text}; off answer=${offAnswer.text}`"
|
||||
- assert:
|
||||
expr: "offRequest?.body && !Object.prototype.hasOwnProperty.call(offRequest.body, 'reasoning')"
|
||||
message:
|
||||
expr: "`disabled thinking should omit OpenAI reasoning payload, got ${JSON.stringify(offRequest?.body?.reasoning ?? null)}`"
|
||||
detailsExpr: "`reasoning ack=${reasoningAck.text}; thinking off=${thinkOffAck.text}; off answer=${offAnswer.text}`"
|
||||
- name: switches to medium thinking
|
||||
actions:
|
||||
- call: patchConfig
|
||||
- set: thinkMediumCursor
|
||||
value:
|
||||
expr: state.getSnapshot().messages.length
|
||||
- call: state.addInboundMessage
|
||||
args:
|
||||
- env:
|
||||
ref: env
|
||||
patch:
|
||||
agents:
|
||||
defaults:
|
||||
thinkingDefault: "medium"
|
||||
- call: waitForGatewayHealthy
|
||||
- conversation:
|
||||
id:
|
||||
expr: config.conversationId
|
||||
kind: direct
|
||||
senderId: qa-operator
|
||||
senderName: QA Operator
|
||||
text:
|
||||
expr: config.maxDirective
|
||||
- call: waitForCondition
|
||||
saveAs: thinkMediumAck
|
||||
args:
|
||||
- ref: env
|
||||
- 60000
|
||||
- call: waitForQaChannelReady
|
||||
args:
|
||||
- ref: env
|
||||
- 60000
|
||||
detailsExpr: "`thinking default patched to medium`"
|
||||
- name: verifies medium thinking emits visible reasoning
|
||||
- lambda:
|
||||
expr: "state.getSnapshot().messages.slice(thinkMediumCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking level set to medium/i.test(candidate.text)).at(-1)"
|
||||
- expr: liveTurnTimeoutMs(env, 20000)
|
||||
detailsExpr: "`thinking medium=${thinkMediumAck.text}`"
|
||||
- name: verifies medium thinking reaches the provider
|
||||
actions:
|
||||
- set: maxCursor
|
||||
value:
|
||||
@@ -171,19 +182,6 @@ steps:
|
||||
senderName: QA Operator
|
||||
text:
|
||||
expr: config.maxPrompt
|
||||
- call: waitForCondition
|
||||
saveAs: maxReasoning
|
||||
args:
|
||||
- lambda:
|
||||
expr: "state.getSnapshot().messages.slice(maxCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && candidate.text.trimStart().startsWith('Reasoning:')).at(-1)"
|
||||
- expr: liveTurnTimeoutMs(env, 120000)
|
||||
- assert:
|
||||
expr: "maxReasoning.text.trimStart().startsWith('Reasoning:')"
|
||||
message:
|
||||
expr: "`missing max reasoning message near answer: ${recentOutboundSummary(state, 6)}`"
|
||||
detailsExpr: "`reasoning=${maxReasoning.text}`"
|
||||
- name: verifies medium thinking completes the answer
|
||||
actions:
|
||||
- call: waitForCondition
|
||||
saveAs: maxAnswer
|
||||
args:
|
||||
@@ -193,7 +191,7 @@ steps:
|
||||
- assert:
|
||||
expr: "maxAnswer.text.includes(config.maxMarker)"
|
||||
message:
|
||||
expr: "`missing max marker: ${maxAnswer.text}`"
|
||||
expr: "`missing max marker near answer: ${recentOutboundSummary(state, 6)}`"
|
||||
- if:
|
||||
expr: "Boolean(env.mock)"
|
||||
then:
|
||||
@@ -207,5 +205,22 @@ steps:
|
||||
expr: "String(maxRequest?.model ?? '').includes('gpt-5.5')"
|
||||
message:
|
||||
expr: "`expected GPT-5.5 mock request, got ${String(maxRequest?.model ?? '')}`"
|
||||
detailsExpr: "`answer=${maxAnswer.text}`"
|
||||
- assert:
|
||||
expr: "maxRequest?.body?.reasoning?.effort === 'medium'"
|
||||
message:
|
||||
expr: "`expected medium OpenAI reasoning payload, got ${JSON.stringify(maxRequest?.body?.reasoning ?? null)}`"
|
||||
- if:
|
||||
expr: "env.transport.id !== 'qa-channel'"
|
||||
then:
|
||||
- call: waitForCondition
|
||||
saveAs: maxReasoning
|
||||
args:
|
||||
- lambda:
|
||||
expr: "state.getSnapshot().messages.slice(maxCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && candidate.text.trimStart().startsWith('Reasoning:')).at(-1)"
|
||||
- expr: liveTurnTimeoutMs(env, 120000)
|
||||
- assert:
|
||||
expr: "maxReasoning.text.trimStart().startsWith('Reasoning:')"
|
||||
message:
|
||||
expr: "`missing max reasoning message near answer: ${recentOutboundSummary(state, 6)}`"
|
||||
detailsExpr: "env.transport.id === 'qa-channel' ? `answer=${maxAnswer.text}; medium reasoning=${env.mock ? String(maxRequest?.body?.reasoning?.effort ?? '') : 'live'}; qa-channel suppresses reasoning delivery` : `answer=${maxAnswer.text}; reasoning=${maxReasoning.text}`"
|
||||
```
|
||||
|
||||
@@ -14,6 +14,9 @@ objective: Verify an explicit pending skill suggestion queues for review, then a
|
||||
plugins:
|
||||
- skill-workshop
|
||||
gatewayConfigPatch:
|
||||
tools:
|
||||
alsoAllow:
|
||||
- skill_workshop
|
||||
plugins:
|
||||
entries:
|
||||
skill-workshop:
|
||||
|
||||
@@ -38,6 +38,9 @@ execution:
|
||||
steps:
|
||||
- name: keeps tools.effective and skills.status aligned after config changes
|
||||
actions:
|
||||
- call: ensureImageGenerationConfigured
|
||||
args:
|
||||
- ref: env
|
||||
- call: writeWorkspaceSkill
|
||||
args:
|
||||
- env:
|
||||
|
||||
@@ -106,6 +106,10 @@ type HookOutcome =
|
||||
}
|
||||
| { blocked: false; params: unknown };
|
||||
type PluginApprovalRequest = NonNullable<PluginHookBeforeToolCallResult["requireApproval"]>;
|
||||
type BeforeToolCallWrapperOptions = {
|
||||
approvalMode?: "request" | "report";
|
||||
emitDiagnostics: boolean;
|
||||
};
|
||||
|
||||
export type BeforeToolCallPolicyDiagnosticState = {
|
||||
hasBeforeToolCallHook: boolean;
|
||||
@@ -910,7 +914,7 @@ export async function runBeforeToolCallHook(args: {
|
||||
export function wrapToolWithBeforeToolCallHook(
|
||||
tool: AnyAgentTool,
|
||||
ctx?: HookContext,
|
||||
options: { emitDiagnostics?: boolean } = {},
|
||||
options: { approvalMode?: "request" | "report"; emitDiagnostics?: boolean } = {},
|
||||
): AnyAgentTool {
|
||||
const execute = tool.execute;
|
||||
if (!execute) {
|
||||
@@ -918,7 +922,10 @@ export function wrapToolWithBeforeToolCallHook(
|
||||
}
|
||||
const toolName = tool.name || "tool";
|
||||
const diagnosticIdentity = resolveToolDiagnosticIdentity(tool);
|
||||
const diagnosticOptions = { emitDiagnostics: options.emitDiagnostics !== false };
|
||||
const hookOptions: BeforeToolCallWrapperOptions = {
|
||||
...(options.approvalMode ? { approvalMode: options.approvalMode } : {}),
|
||||
emitDiagnostics: options.emitDiagnostics !== false,
|
||||
};
|
||||
const wrappedTool: AnyAgentTool = {
|
||||
...tool,
|
||||
execute: async (toolCallId, params, signal, onUpdate) => {
|
||||
@@ -931,6 +938,7 @@ export function wrapToolWithBeforeToolCallHook(
|
||||
toolCallId,
|
||||
ctx,
|
||||
signal,
|
||||
approvalMode: hookOptions.approvalMode,
|
||||
});
|
||||
if (outcome.blocked) {
|
||||
if (outcome.kind !== "veto") {
|
||||
@@ -950,7 +958,7 @@ export function wrapToolWithBeforeToolCallHook(
|
||||
...(toolCallId && { toolCallId }),
|
||||
paramsSummary: summarizeToolParams(outcome.params ?? hookParams),
|
||||
};
|
||||
if (diagnosticOptions.emitDiagnostics) {
|
||||
if (hookOptions.emitDiagnostics) {
|
||||
emitTrustedDiagnosticEvent({
|
||||
type: "tool.execution.blocked",
|
||||
...eventBase,
|
||||
@@ -992,7 +1000,7 @@ export function wrapToolWithBeforeToolCallHook(
|
||||
...(toolCallId && { toolCallId }),
|
||||
paramsSummary: summarizeToolParams(executeParams),
|
||||
};
|
||||
if (diagnosticOptions.emitDiagnostics) {
|
||||
if (hookOptions.emitDiagnostics) {
|
||||
emitTrustedDiagnosticEvent({
|
||||
type: "tool.execution.started",
|
||||
...eventBase,
|
||||
@@ -1014,7 +1022,7 @@ export function wrapToolWithBeforeToolCallHook(
|
||||
toolParams: executeParams,
|
||||
ctx,
|
||||
});
|
||||
if (diagnosticOptions.emitDiagnostics) {
|
||||
if (hookOptions.emitDiagnostics) {
|
||||
if (skillMatch) {
|
||||
emitSkillUsedDiagnostic({
|
||||
ctx,
|
||||
@@ -1033,7 +1041,7 @@ export function wrapToolWithBeforeToolCallHook(
|
||||
} catch (err) {
|
||||
const cause = unwrapErrorCause(err);
|
||||
const errorCode = diagnosticHttpStatusCode(cause);
|
||||
if (diagnosticOptions.emitDiagnostics) {
|
||||
if (hookOptions.emitDiagnostics) {
|
||||
emitTrustedDiagnosticEvent({
|
||||
type: "tool.execution.error",
|
||||
...eventBase,
|
||||
@@ -1060,7 +1068,7 @@ export function wrapToolWithBeforeToolCallHook(
|
||||
enumerable: true,
|
||||
});
|
||||
Object.defineProperty(wrappedTool, BEFORE_TOOL_CALL_DIAGNOSTIC_OPTIONS, {
|
||||
value: diagnosticOptions,
|
||||
value: hookOptions,
|
||||
enumerable: false,
|
||||
});
|
||||
return wrappedTool;
|
||||
@@ -1079,6 +1087,17 @@ export function setBeforeToolCallDiagnosticsEnabled(tool: AnyAgentTool, enabled:
|
||||
}
|
||||
}
|
||||
|
||||
export function setBeforeToolCallApprovalMode(
|
||||
tool: AnyAgentTool,
|
||||
approvalMode: "request" | "report" | undefined,
|
||||
): void {
|
||||
const taggedTool = tool as unknown as Record<symbol, unknown>;
|
||||
const options = taggedTool[BEFORE_TOOL_CALL_DIAGNOSTIC_OPTIONS];
|
||||
if (options && typeof options === "object") {
|
||||
(options as BeforeToolCallWrapperOptions).approvalMode = approvalMode;
|
||||
}
|
||||
}
|
||||
|
||||
export function copyBeforeToolCallHookMarker(source: AnyAgentTool, target: AnyAgentTool): void {
|
||||
if (!isToolWrappedWithBeforeToolCallHook(source)) {
|
||||
return;
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import {
|
||||
isToolWrappedWithBeforeToolCallHook,
|
||||
setBeforeToolCallApprovalMode,
|
||||
wrapToolWithBeforeToolCallHook,
|
||||
} from "../agents/pi-tools.before-tool-call.js";
|
||||
import type { AnyAgentTool } from "../agents/tools/common.js";
|
||||
@@ -22,11 +23,12 @@ function resolveJsonSchemaForTool(tool: AnyAgentTool): Record<string, unknown> {
|
||||
export function createPluginToolsMcpHandlers(tools: AnyAgentTool[]) {
|
||||
const wrappedTools = tools.map((tool) => {
|
||||
if (isToolWrappedWithBeforeToolCallHook(tool)) {
|
||||
setBeforeToolCallApprovalMode(tool, "report");
|
||||
return tool;
|
||||
}
|
||||
// The ACPX MCP bridge should enforce the same pre-execution hook boundary
|
||||
// as the agent and HTTP tool execution paths.
|
||||
return wrapToolWithBeforeToolCallHook(tool);
|
||||
return wrapToolWithBeforeToolCallHook(tool, undefined, { approvalMode: "report" });
|
||||
});
|
||||
const toolMap = new Map<string, AnyAgentTool>();
|
||||
for (const tool of wrappedTools) {
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
import { wrapToolWithBeforeToolCallHook } from "../agents/pi-tools.before-tool-call.js";
|
||||
import type { AnyAgentTool } from "../agents/tools/common.js";
|
||||
import {
|
||||
initializeGlobalHookRunner,
|
||||
@@ -226,7 +227,7 @@ describe("plugin tools MCP server", () => {
|
||||
expect(failed.content).toEqual([{ type: "text", text: "Tool error: boom" }]);
|
||||
});
|
||||
|
||||
it("blocks tool execution when before_tool_call requires approval on the MCP bridge", async () => {
|
||||
it("reports approval requirements without opening plugin approvals on the MCP bridge", async () => {
|
||||
let hookCalls = 0;
|
||||
const execute = vi.fn().mockResolvedValue({
|
||||
content: "Stored.",
|
||||
@@ -248,7 +249,6 @@ describe("plugin tools MCP server", () => {
|
||||
},
|
||||
]),
|
||||
);
|
||||
callGatewayTool.mockRejectedValueOnce(new Error("gateway unavailable"));
|
||||
const tool = {
|
||||
name: "memory_store",
|
||||
description: "Store memory",
|
||||
@@ -262,10 +262,46 @@ describe("plugin tools MCP server", () => {
|
||||
arguments: { text: "remember this" },
|
||||
});
|
||||
expect(hookCalls).toBe(1);
|
||||
expect(callGatewayTool).not.toHaveBeenCalled();
|
||||
expect(execute).not.toHaveBeenCalled();
|
||||
expect(result.isError).toBe(true);
|
||||
expect(result.content).toEqual([
|
||||
{ type: "text", text: "Tool error: Plugin approval required (gateway unavailable)" },
|
||||
]);
|
||||
expect(result.content).toEqual([{ type: "text", text: "Tool error: Approval required" }]);
|
||||
});
|
||||
|
||||
it("switches pre-wrapped plugin tools to approval report mode on the MCP bridge", async () => {
|
||||
const execute = vi.fn().mockResolvedValue({
|
||||
content: "Stored.",
|
||||
});
|
||||
initializeGlobalHookRunner(
|
||||
createMockPluginRegistry([
|
||||
{
|
||||
hookName: "before_tool_call",
|
||||
handler: async () => ({
|
||||
requireApproval: {
|
||||
pluginId: "test-plugin",
|
||||
title: "Approval required",
|
||||
description: "Approval required",
|
||||
},
|
||||
}),
|
||||
},
|
||||
]),
|
||||
);
|
||||
callGatewayTool.mockRejectedValue(new Error("gateway unavailable"));
|
||||
const tool = wrapToolWithBeforeToolCallHook({
|
||||
name: "memory_store",
|
||||
description: "Store memory",
|
||||
parameters: { type: "object", properties: {} },
|
||||
execute,
|
||||
} as unknown as AnyAgentTool);
|
||||
|
||||
const handlers = createPluginToolsMcpHandlers([tool]);
|
||||
const result = await handlers.callTool({
|
||||
name: "memory_store",
|
||||
arguments: { text: "remember this" },
|
||||
});
|
||||
expect(callGatewayTool).not.toHaveBeenCalled();
|
||||
expect(execute).not.toHaveBeenCalled();
|
||||
expect(result.isError).toBe(true);
|
||||
expect(result.content).toEqual([{ type: "text", text: "Tool error: Approval required" }]);
|
||||
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user