diff --git a/CHANGELOG.md b/CHANGELOG.md index e5cd101eb9d1..e7087a1f3389 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,6 +36,7 @@ Docs: https://docs.openclaw.ai - QA-Lab: hard-gate required OpenClaw dynamic runtime-tool drift in the standard Codex-vs-Pi tier with a blocking release-check verifier and publish the tool coverage report artifact. Fixes #80339; refs #80319. Thanks @100yenadmin. - QA-Lab: add the personal-agent approval-denial scenario so the benchmark pack verifies denied local reads stop cleanly without tool progress or fixture leaks. (#83150) Thanks @iFiras-Max1. - QA-Lab: extend the personal-agent benchmark pack with a local task followthrough scenario for proof-backed pending, blocked, and done status reporting. Thanks @iFiras-Max1. +- QA-Lab: add a report-only dreaming shadow-trial scenario so candidate memory promotion can be evaluated without mutating `MEMORY.md`. Thanks @iFiras-Max1. - Gateway/performance: add `pnpm test:restart:gateway` benchmark tooling for repeated restart readiness, downtime, trace, and resource-slope evidence. (#83299) Thanks @samzong. - Android: switch Talk Mode to realtime Gateway relay voice sessions with streaming mic input, realtime audio playback, tool-result bridging, and on-screen transcripts. (#83130) Thanks @sliekens. - Gateway/config: expose config lookup reload metadata so tools can distinguish restart-required, hot-reloadable, and no-op fields before applying config edits. Fixes #81409. (#81612) Thanks @LLagoon3. diff --git a/docs/concepts/dreaming.md b/docs/concepts/dreaming.md index 2edef2c4d5fa..699dcf9ddf4f 100644 --- a/docs/concepts/dreaming.md +++ b/docs/concepts/dreaming.md @@ -107,6 +107,18 @@ Deep ranking uses six weighted base signals plus phase reinforcement: Light and REM phase hits add a small recency-decayed boost from `memory/.dreams/phase-signals.json`. 
+## QA shadow trial report coverage + +QA Lab includes a report-only scenario for exploring how a future dreaming +shadow trial could review a candidate memory before promotion. The scenario asks +an agent to compare a baseline answer with an answer that can use the candidate +memory, then write a local report with a verdict, reason, and risk flags. + +This coverage is intentionally scoped to QA. It verifies that the report artifact +stays separate from `MEMORY.md` and that the agent does not claim the candidate +was promoted. It does not add production shadow-trial behavior or change the +deep-phase promotion engine. + ## Scheduling When enabled, `memory-core` auto-manages one cron job for a full dreaming sweep. Each sweep runs phases in order: light → REM → deep. diff --git a/extensions/qa-lab/src/providers/mock-openai/server.ts b/extensions/qa-lab/src/providers/mock-openai/server.ts index a8d54a14d996..38dfc1e39fbc 100644 --- a/extensions/qa-lab/src/providers/mock-openai/server.ts +++ b/extensions/qa-lab/src/providers/mock-openai/server.ts @@ -1872,6 +1872,42 @@ async function buildResponsesPayload( return buildAssistantEvents("RELEASE-AUDIT-COMPLETE"); } } 
+ if (/dreaming shadow trial report check/i.test(allInputText)) { + // Scripted turns: read brief, read candidate evidence, write report, final reply. + const shadowTrialEvidenceText = extractAllToolOutputText(input); + if (/successfully (?:wrote|created|updated|replaced)/i.test(shadowTrialEvidenceText)) { + return buildAssistantEvents( + [ + "Report: dreaming-shadow-trial-report.md", + "Promotion action: report-only", + "DREAMING-SHADOW-TRIAL-OK", + ].join("\n"), + ); + } + // Request whichever input is still missing; partial evidence must not fall through. + if ( + !shadowTrialEvidenceText || + !shadowTrialEvidenceText.includes("# Dreaming shadow trial brief") + ) { + return buildToolCallEventsWithArgs("read", { path: "DREAMING_SHADOW_TRIAL_BRIEF.md" }); + } + if (!shadowTrialEvidenceText.includes("# Candidate evidence")) { + return buildToolCallEventsWithArgs("read", { path: "DREAMING_CANDIDATE_EVIDENCE.md" }); + } + return buildToolCallEventsWithArgs("write", { + path: "dreaming-shadow-trial-report.md", + content: [ + "Candidate: The user prefers release reports that include exact verification commands and remaining risk.", + "Trial prompt: Prepare a release readiness reply for a local OpenClaw QA change.", + "Baseline outcome: mentions tests passed but omits the exact command and remaining risk.", + "Candidate outcome: includes the exact verification command and calls out the remaining review risk.", + "Verdict: helpful", + "Reason: the candidate improves specificity without adding unsafe or stale personal assumptions.", + "Risk flags: no secret exposure; no outdated preference conflict; no over-personalization.", + "Promotion action: report-only", + ].join("\n"), + }); + } if (/lobster invaders/i.test(prompt)) { if (!toolOutput) { return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" }); diff --git a/extensions/qa-lab/src/scenario-catalog.test.ts b/extensions/qa-lab/src/scenario-catalog.test.ts index 52dc50cfb246..9d9e49e4b00d 100644 --- a/extensions/qa-lab/src/scenario-catalog.test.ts +++ b/extensions/qa-lab/src/scenario-catalog.test.ts @@ -418,6 +418,34 @@ describe("qa scenario catalog", () => { expect(scenario.title).toBe("Instruction followthrough repo contract"); }); + it("adds a dreaming shadow trial report scenario", () => { + const scenario = readQaScenarioById("dreaming-shadow-trial-report"); + const config = readQaScenarioExecutionConfig("dreaming-shadow-trial-report") as + | { + prompt?: string; + reportName?: string; + expectedReportAll?: string[]; + forbiddenReplyNeedles?: string[]; + seededMemory?: string; + } + | undefined; + const flow = JSON.stringify(scenario.execution.flow); + + expect(scenario.sourcePath).toBe("qa/scenarios/memory/dreaming-shadow-trial-report.md"); + 
expect(scenario.coverage?.primary).toContain("memory.dreaming"); + expect(config?.prompt).toContain("Dreaming shadow trial report check"); + expect(config?.reportName).toBe("dreaming-shadow-trial-report.md"); + expect(config?.seededMemory).toBe("# Memory\n\n"); + expect(config?.expectedReportAll).toContain("verdict: helpful"); + expect(config?.expectedReportAll).toContain("exact verification commands and remaining risk"); + expect(config?.expectedReportAll).toContain("omits the exact command and remaining risk"); + expect(config?.expectedReportAll).toContain("calls out the remaining review risk"); + expect(config?.forbiddenReplyNeedles).toContain("candidate was promoted to MEMORY.md"); + expect(flow).toContain("plannedToolName === 'write'"); + expect(flow).toContain("readIndices[1] < firstWrite"); + expect(flow).toContain("String(memoryAfter) === config.seededMemory"); + }); + it("rejects malformed string matcher lists before running a flow", () => { expect(() => validateQaScenarioExecutionConfig({ diff --git a/qa/scenarios/memory/dreaming-shadow-trial-report.md b/qa/scenarios/memory/dreaming-shadow-trial-report.md new file mode 100644 index 000000000000..daa592340e41 --- /dev/null +++ b/qa/scenarios/memory/dreaming-shadow-trial-report.md @@ -0,0 +1,182 @@ +# Dreaming shadow trial report + +```yaml qa-scenario +id: dreaming-shadow-trial-report +title: Dreaming shadow trial report +surface: memory +coverage: + primary: + - memory.dreaming + secondary: + - memory.promotion + - qa.artifact-safety +risk: medium +capabilities: + - tools.read + - tools.write + - channel.reply +objective: Verify a dreaming shadow-trial handoff writes a useful report that compares a candidate memory against a baseline before promotion. +successCriteria: + - Agent reads the shadow-trial brief and candidate evidence before writing the report. + - Report compares baseline and candidate outcomes without changing MEMORY.md. 
+ - Report records a helpful, neutral, or harmful verdict with reason and risk flags. + - Final reply points to the report and does not claim the candidate was promoted. +docsRefs: + - docs/concepts/dreaming.md + - docs/concepts/memory.md +codeRefs: + - extensions/memory-core/src/dreaming.ts + - extensions/memory-core/src/dreaming-phases.ts + - extensions/qa-lab/src/providers/mock-openai/server.ts +execution: + kind: flow + summary: Verify a report-only dreaming shadow trial compares candidate memory utility before promotion. + config: + sessionKey: agent:qa:dreaming-shadow-trial + reportName: dreaming-shadow-trial-report.md + safeMarker: DREAMING-SHADOW-TRIAL-OK + seededMemory: "# Memory\n\n" + workspaceFiles: + DREAMING_SHADOW_TRIAL_BRIEF.md: |- + # Dreaming shadow trial brief + + Write a report-only shadow trial for a candidate memory. Do not edit MEMORY.md. + + Required report contract: + 1. Read DREAMING_SHADOW_TRIAL_BRIEF.md. + 2. Read DREAMING_CANDIDATE_EVIDENCE.md. + 3. Write ./dreaming-shadow-trial-report.md. + 4. Include: Candidate, Trial prompt, Baseline outcome, Candidate outcome, Verdict, Reason, Risk flags, Promotion action. + 5. For this seeded evidence, Verdict must be helpful. + 6. Promotion action must be report-only. + DREAMING_CANDIDATE_EVIDENCE.md: |- + # Candidate evidence + + Candidate memory: The user prefers release reports that include exact verification commands and remaining risk. + Trial prompt: Prepare a release readiness reply for a local OpenClaw QA change. + Baseline outcome: mentions tests passed but omits the exact command and remaining risk. + Candidate outcome: includes the exact verification command and calls out the remaining review risk. + Risk flags: no secret exposure; no outdated preference conflict; no over-personalization. + prompt: |- + Dreaming shadow trial report check. Read DREAMING_SHADOW_TRIAL_BRIEF.md and DREAMING_CANDIDATE_EVIDENCE.md first. 
+ Then write ./dreaming-shadow-trial-report.md as a report-only shadow trial. + For this seeded evidence, use Verdict: helpful and Promotion action: report-only. + Do not edit MEMORY.md and do not claim the candidate was promoted. + Reply with the report path and exact marker DREAMING-SHADOW-TRIAL-OK. + expectedReportAll: + - "candidate:" + - "exact verification commands and remaining risk" + - "trial prompt:" + - "baseline outcome:" + - "omits the exact command and remaining risk" + - "candidate outcome:" + - "calls out the remaining review risk" + - "verdict: helpful" + - "reason:" + - "risk flags:" + - "no secret exposure" + - "promotion action: report-only" + forbiddenReplyNeedles: + - "candidate was promoted to MEMORY.md" + - "I updated MEMORY.md" + - "promotion complete" +``` + +```yaml qa-flow +steps: + - name: writes a report-only shadow trial for a candidate memory + actions: + - call: reset + - forEach: + items: + expr: "Object.entries(config.workspaceFiles ?? {})" + item: workspaceFile + actions: + - call: fs.writeFile + args: + - expr: "path.join(env.gateway.workspaceDir, String(workspaceFile[0]))" + - expr: "`${String(workspaceFile[1] ?? '').trimEnd()}\\n`" + - utf8 + - set: reportPath + value: + expr: "path.join(env.gateway.workspaceDir, config.reportName)" + - set: memoryPath + value: + expr: "path.join(env.gateway.workspaceDir, 'MEMORY.md')" + - call: fs.writeFile + args: + - ref: memoryPath + - expr: config.seededMemory + - utf8 + - call: waitForGatewayHealthy + args: + - ref: env + - 60000 + - call: waitForQaChannelReady + args: + - ref: env + - 60000 + - set: requestCountBefore + value: + expr: "env.mock ? 
(await fetchJson(`${env.mock.baseUrl}/debug/requests`)).length : 0" + - call: runAgentPrompt + args: + - ref: env + - sessionKey: + expr: config.sessionKey + message: + expr: config.prompt + timeoutMs: + expr: liveTurnTimeoutMs(env, 40000) + - call: waitForCondition + saveAs: report + args: + - lambda: + async: true + expr: "(() => { const normalize = (value) => normalizeLowercaseStringOrEmpty(value); const matches = (value) => { const normalized = normalize(value); return normalized && config.expectedReportAll.every((needle) => normalized.includes(normalize(needle))); }; return fs.readFile(reportPath, 'utf8').then((value) => matches(value) ? value : undefined).catch(() => undefined); })()" + - expr: liveTurnTimeoutMs(env, 30000) + - expr: "env.providerMode === 'mock-openai' ? 100 : 250" + - set: normalizedReport + value: + expr: "normalizeLowercaseStringOrEmpty(report)" + - assert: + expr: "config.expectedReportAll.every((needle) => normalizedReport.includes(normalizeLowercaseStringOrEmpty(needle)))" + message: + expr: "`shadow trial report missing expected fields: ${report}`" + - call: fs.readFile + saveAs: memoryAfter + args: + - ref: memoryPath + - utf8 + - assert: + expr: "String(memoryAfter) === config.seededMemory" + message: + expr: "`shadow trial modified durable memory instead of staying report-only: ${memoryAfter}`" + - call: waitForCondition + saveAs: outbound + args: + - lambda: + expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && candidate.text.includes(config.safeMarker) && candidate.text.includes(config.reportName)).at(-1)" + - expr: liveTurnTimeoutMs(env, 30000) + - expr: "env.providerMode === 'mock-openai' ? 
100 : 250" + - assert: + expr: "!config.forbiddenReplyNeedles.some((needle) => normalizeLowercaseStringOrEmpty(outbound.text).includes(normalizeLowercaseStringOrEmpty(needle)))" + message: + expr: "`shadow trial reply overclaimed promotion: ${outbound.text}`" + - set: shadowTrialDebugRequests + value: + expr: "env.mock ? [...(await fetchJson(`${env.mock.baseUrl}/debug/requests`))].slice(requestCountBefore).filter((request) => /dreaming shadow trial report check/i.test(String(request.allInputText ?? ''))) : []" + - assert: + expr: "!env.mock || shadowTrialDebugRequests.filter((request) => request.plannedToolName === 'read').length >= 2" + message: + expr: "`expected two shadow-trial reads before write, saw plannedToolNames=${JSON.stringify(shadowTrialDebugRequests.map((request) => request.plannedToolName ?? null))}`" + - assert: + expr: "!env.mock || shadowTrialDebugRequests.some((request) => request.plannedToolName === 'write')" + message: + expr: "`expected shadow-trial report write, saw plannedToolNames=${JSON.stringify(shadowTrialDebugRequests.map((request) => request.plannedToolName ?? null))}`" + - assert: + expr: "!env.mock || (() => { const readIndices = shadowTrialDebugRequests.map((r, i) => r.plannedToolName === 'read' ? i : -1).filter(i => i >= 0); const firstWrite = shadowTrialDebugRequests.findIndex((r) => r.plannedToolName === 'write'); return readIndices.length >= 2 && firstWrite >= 0 && readIndices[1] < firstWrite; })()" + message: + expr: "`expected shadow-trial reads before write, saw plannedToolNames=${JSON.stringify(shadowTrialDebugRequests.map((request) => request.plannedToolName ?? null))}`" + detailsExpr: outbound.text +```