diff --git a/CHANGELOG.md b/CHANGELOG.md index 3bbdffcf1f3e..e3a2fb9526db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -67,6 +67,7 @@ Docs: https://docs.openclaw.ai - Release/CI/E2E: require the Kitchen Sink RPC walk to prove every expected plugin tool is cataloged and effective before invoking tool fixtures. - Release/CI/E2E: stop tracked Docker build commands when centralized build wrappers receive shutdown signals. - Release/CI/E2E: cover MCP channel pairing reconnects by asserting the same temporary client state is reused across reconnects. +- Release/CI/E2E: require QA channel baseline and reconnect scenarios to assert their scenario markers instead of accepting any outbound reply. - Release/CI/E2E: fail secret-provider proof runs when temporary state cleanup still fails after retries instead of hiding the cleanup error. - Release/CI/E2E: fail package-candidate ref proofs when temporary source worktree cleanup fails instead of leaving stale worktrees behind. - Release/CI/E2E: remove package tarball extract directories when tar extraction fails before validation can continue. 
diff --git a/extensions/qa-lab/src/scenario-flow-runner.test.ts b/extensions/qa-lab/src/scenario-flow-runner.test.ts index 727c14393ce4..60227e85911f 100644 --- a/extensions/qa-lab/src/scenario-flow-runner.test.ts +++ b/extensions/qa-lab/src/scenario-flow-runner.test.ts @@ -1,7 +1,98 @@ import { describe, expect, it } from "vitest"; import { createQaBusState } from "./bus-state.js"; +import { readQaScenarioById } from "./scenario-catalog.js"; import { runScenarioFlow } from "./scenario-flow-runner.js"; +type QaFlowStep = { + name: string; + run: () => Promise<unknown>; +}; + +function formatTestTranscript(state: ReturnType<typeof createQaBusState>) { + return state + .getSnapshot() + .messages.map((message) => `${message.direction}:${message.conversation.id}:${message.text}`) + .join("\n"); +} + +async function runLoadedScenarioFlow( + scenarioId: string, + params: { + onWaitForOutboundMessage?: (params: { + waitCount: number; + state: ReturnType<typeof createQaBusState>; + }) => void; + } = {}, +) { + const scenario = readQaScenarioById(scenarioId); + const flow = scenario.execution.flow; + if (!flow) { + throw new Error(`scenario has no flow: ${scenarioId}`); + } + + const state = createQaBusState(); + let waitCount = 0; + const api = { + env: {}, + state, + scenario, + config: scenario.execution.config ?? 
{}, + randomUUID: () => "00000000-0000-4000-8000-000000000000", + liveTurnTimeoutMs: (_env: unknown, timeoutMs: number) => timeoutMs, + waitForGatewayHealthy: async () => undefined, + waitForQaChannelReady: async () => undefined, + waitForNoOutbound: async () => undefined, + sleep: async () => undefined, + reset: async () => { + state.reset(); + }, + resetBus: async () => { + state.reset(); + }, + runAgentPrompt: async () => undefined, + formatTransportTranscript: formatTestTranscript, + waitForOutboundMessage: async ( + stateLocal: ReturnType<typeof createQaBusState>, + predicate: (candidate: unknown) => boolean, + timeoutMs: number, + options?: { sinceIndex?: number }, + ) => { + waitCount += 1; + params.onWaitForOutboundMessage?.({ waitCount, state: stateLocal }); + const match = stateLocal + .getSnapshot() + .messages.slice(options?.sinceIndex ?? 0) + .find((candidate) => predicate(candidate)); + if (match) { + return match; + } + throw new Error(`timed out after ${timeoutMs}ms waiting for outbound marker`); + }, + runScenario: async (_name: string, steps: QaFlowStep[]) => { + const stepResults = []; + for (const step of steps) { + const details = await step.run(); + stepResults.push({ + name: step.name, + status: "pass" as const, + ...(details !== undefined ? 
{ details } : {}), + }); + } + return { + name: scenario.title, + status: "pass" as const, + steps: stepResults, + }; + }, + }; + + return await runScenarioFlow({ + api, + scenarioTitle: scenario.title, + flow, + }); +} + describe("scenario-flow-runner", () => { it("supports qaImport inside flow expressions", async () => { const result = await runScenarioFlow({ @@ -221,4 +312,78 @@ describe("scenario-flow-runner", () => { expect(result.status).toBe("pass"); expect(result.steps[0]?.details).toBe("QA_CODEX_PLUGIN_TURN_OK"); }); + + it.each([ + { + scenarioId: "channel-chat-baseline", + to: "channel:qa-room", + text: "generic shared-channel reply without the required marker", + }, + { + scenarioId: "dm-chat-baseline", + to: "dm:alice", + text: "generic DM reply without the required marker", + }, + ])("rejects unmarked outbound replies for $scenarioId", async ({ scenarioId, to, text }) => { + await expect( + runLoadedScenarioFlow(scenarioId, { + onWaitForOutboundMessage: ({ state }) => { + state.addOutboundMessage({ + accountId: "qa-channel", + to, + text, + }); + }, + }), + ).rejects.toThrow("waiting for outbound marker"); + }); + + it("rejects reconnect follow-up replies that replay the first marker", async () => { + await expect( + runLoadedScenarioFlow("qa-channel-reconnect-dedupe", { + onWaitForOutboundMessage: ({ waitCount, state }) => { + if (waitCount === 1) { + state.addOutboundMessage({ + accountId: "qa-channel", + to: "channel:qa-room", + text: "RECONNECT-FIRST-OK", + }); + return; + } + state.addOutboundMessage({ + accountId: "qa-channel", + to: "channel:qa-room", + text: "RECONNECT-FIRST-OK", + }); + }, + }), + ).rejects.toThrow("waiting for outbound marker"); + }); + + it("rejects reconnect follow-up turns with extra unmarked outbound replies", async () => { + await expect( + runLoadedScenarioFlow("qa-channel-reconnect-dedupe", { + onWaitForOutboundMessage: ({ waitCount, state }) => { + if (waitCount === 1) { + state.addOutboundMessage({ + accountId: 
"qa-channel", + to: "channel:qa-room", + text: "RECONNECT-FIRST-OK", + }); + return; + } + state.addOutboundMessage({ + accountId: "qa-channel", + to: "channel:qa-room", + text: "RECONNECT-SECOND-OK", + }); + state.addOutboundMessage({ + accountId: "qa-channel", + to: "channel:qa-room", + text: "unmarked duplicate delivery", + }); + }, + }), + ).rejects.toThrow("exactly one marked post-restart reply"); + }); }); diff --git a/qa/scenarios/channels/channel-chat-baseline.md b/qa/scenarios/channels/channel-chat-baseline.md index 87227a98edf8..614d15e07434 100644 --- a/qa/scenarios/channels/channel-chat-baseline.md +++ b/qa/scenarios/channels/channel-chat-baseline.md @@ -12,6 +12,7 @@ coverage: objective: Verify the QA agent can respond correctly in a shared channel and respect mention-driven group semantics. successCriteria: - Agent replies in the shared channel transcript. + - Agent visible reply contains the scenario marker. - Agent keeps the conversation scoped to the channel. - Agent respects mention-driven group routing semantics. docsRefs: @@ -24,7 +25,8 @@ execution: kind: flow summary: Verify the QA agent can respond correctly in a shared channel and respect mention-driven group semantics. config: - mentionPrompt: "@openclaw explain the QA lab" + expectedMarker: QA-CHANNEL-BASELINE-OK + mentionPrompt: "@openclaw qa channel baseline marker check. Reply exactly: QA-CHANNEL-BASELINE-OK" ``` ```yaml qa-flow @@ -78,7 +80,14 @@ steps: - ref: state - lambda: params: [candidate] - expr: "candidate.conversation.id === 'qa-room' && !candidate.threadId" + expr: "candidate.direction === 'outbound' && candidate.conversation.id === 'qa-room' && candidate.conversation.kind === 'channel' && !candidate.threadId && String(candidate.text ?? 
'').includes(config.expectedMarker)" - expr: liveTurnTimeoutMs(env, 180000) + - set: matchingOutbound + value: + expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-room' && candidate.conversation.kind === 'channel' && String(candidate.text ?? '').includes(config.expectedMarker))" + - assert: + expr: matchingOutbound.length === 1 + message: + expr: "`expected exactly one channel baseline marker reply, saw ${matchingOutbound.length}; transcript=${formatTransportTranscript(state, { conversationId: 'qa-room' })}`" detailsExpr: message.text ``` diff --git a/qa/scenarios/channels/dm-chat-baseline.md b/qa/scenarios/channels/dm-chat-baseline.md index 278525d7064e..4e7668e12e3c 100644 --- a/qa/scenarios/channels/dm-chat-baseline.md +++ b/qa/scenarios/channels/dm-chat-baseline.md @@ -12,6 +12,7 @@ coverage: objective: Verify the QA agent can chat coherently in a DM, explain the QA setup, and stay in character. successCriteria: - Agent replies in DM without channel routing mistakes. + - Agent visible reply contains the scenario marker. - Agent explains the QA lab and message bus correctly. - Agent keeps the dev C-3PO personality. docsRefs: @@ -24,7 +25,8 @@ execution: kind: flow summary: Verify the QA agent can chat coherently in a DM, explain the QA setup, and stay in character. config: - prompt: "Hello there, who are you?" + expectedMarker: QA-DM-BASELINE-OK + prompt: "DM baseline marker check. Include exact marker: `QA-DM-BASELINE-OK` and briefly identify the QA lab message bus." ``` ```yaml qa-flow @@ -47,7 +49,14 @@ steps: - ref: state - lambda: params: [candidate] - expr: "candidate.conversation.id === 'alice'" + expr: "candidate.direction === 'outbound' && candidate.conversation.id === 'alice' && candidate.conversation.kind === 'direct' && String(candidate.text ?? 
'').includes(config.expectedMarker)" - expr: liveTurnTimeoutMs(env, 45000) + - set: matchingOutbound + value: + expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'alice' && candidate.conversation.kind === 'direct' && String(candidate.text ?? '').includes(config.expectedMarker))" + - assert: + expr: matchingOutbound.length === 1 + message: + expr: "`expected exactly one DM baseline marker reply, saw ${matchingOutbound.length}; transcript=${formatTransportTranscript(state, { conversationId: 'alice' })}`" detailsExpr: outbound.text ``` diff --git a/qa/scenarios/channels/qa-channel-reconnect-dedupe.md b/qa/scenarios/channels/qa-channel-reconnect-dedupe.md index 15ab1dd8611d..710c233f8006 100644 --- a/qa/scenarios/channels/qa-channel-reconnect-dedupe.md +++ b/qa/scenarios/channels/qa-channel-reconnect-dedupe.md @@ -64,7 +64,7 @@ steps: - ref: state - lambda: params: [candidate] - expr: "candidate.conversation.id === 'qa-room' && candidate.direction === 'outbound'" + expr: "candidate.conversation.id === 'qa-room' && candidate.direction === 'outbound' && String(candidate.text ?? '').includes(config.firstMarker)" - expr: liveTurnTimeoutMs(env, 60000) - set: beforeRestartCursor value: @@ -80,9 +80,9 @@ steps: value: expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-room')" - assert: - expr: "firstMatchesBeforeFollowup.length === 1" + expr: "firstMatchesBeforeFollowup.length === 1 && String(firstMatchesBeforeFollowup[0]?.text ?? 
'').includes(config.firstMarker)" message: - expr: "`readiness cycle replayed first reply ${firstMatchesBeforeFollowup.length} times; transcript=${formatTransportTranscript(state, { conversationId: 'qa-room' })}`" + expr: "`readiness cycle should preserve exactly one marked first reply, saw ${firstMatchesBeforeFollowup.length}; transcript=${formatTransportTranscript(state, { conversationId: 'qa-room' })}`" - call: runAgentPrompt args: - ref: env @@ -99,7 +99,7 @@ steps: - ref: state - lambda: params: [candidate] - expr: "candidate.conversation.id === 'qa-room' && candidate.direction === 'outbound'" + expr: "candidate.conversation.id === 'qa-room' && candidate.direction === 'outbound' && String(candidate.text ?? '').includes(config.secondMarker)" - expr: liveTurnTimeoutMs(env, 60000) - sinceIndex: ref: beforeRestartCursor @@ -108,13 +108,16 @@ steps: expr: state.getSnapshot() - set: firstMatches value: - expr: "snapshot.messages.slice(0, beforeRestartCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-room')" + expr: "snapshot.messages.slice(0, beforeRestartCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-room' && String(candidate.text ?? '').includes(config.firstMarker))" - set: secondMatches + value: + expr: "snapshot.messages.slice(beforeRestartCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-room' && String(candidate.text ?? '').includes(config.secondMarker))" + - set: postRestartOutbounds value: expr: "snapshot.messages.slice(beforeRestartCursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-room')" - assert: - expr: "firstMatches.length === 1 && secondMatches.length === 1" + expr: "firstMatches.length === 1 && secondMatches.length === 1 && postRestartOutbounds.length === 1 && !postRestartOutbounds.some((candidate) => String(candidate.text ?? 
'').includes(config.firstMarker))" message: - expr: "`expected one pre-restart and one post-restart reply; first=${firstMatches.length} second=${secondMatches.length}; transcript=${formatTransportTranscript(state, { conversationId: 'qa-room' })}`" + expr: "`expected one marked pre-restart reply and exactly one marked post-restart reply without replaying the first marker; first=${firstMatches.length} second=${secondMatches.length} post=${postRestartOutbounds.length}; transcript=${formatTransportTranscript(state, { conversationId: 'qa-room' })}`" detailsExpr: "`before=${firstOutbound.text}\\nafter=${secondOutbound.text}`" ```