diff --git a/extensions/codex/src/app-server/run-attempt.test.ts b/extensions/codex/src/app-server/run-attempt.test.ts index 881c93d215df..373dc6ac3bd4 100644 --- a/extensions/codex/src/app-server/run-attempt.test.ts +++ b/extensions/codex/src/app-server/run-attempt.test.ts @@ -2844,11 +2844,17 @@ describe("runCodexAppServerAttempt", () => { expect(report?.provider).toBe("codex"); expect(report?.model).toBe("gpt-5.4-codex"); expect(report?.systemPrompt.chars).toBeGreaterThan(0); + expect(report?.systemPrompt.hash).toMatch(/^[a-f0-9]{64}$/u); + expect(report?.skills.hash).toMatch(/^[a-f0-9]{64}$/u); const message = report?.tools.entries.find((tool) => tool.name === "message"); const webSearch = report?.tools.entries.find((tool) => tool.name === "web_search"); expect(message?.schemaChars).toBeGreaterThan(0); + expect(message?.summaryHash).toMatch(/^[a-f0-9]{64}$/u); + expect(message?.schemaHash).toMatch(/^[a-f0-9]{64}$/u); expect(webSearch?.schemaChars).toBe(0); + expect(webSearch?.summaryHash).toMatch(/^[a-f0-9]{64}$/u); + expect(webSearch?.schemaHash).toMatch(/^[a-f0-9]{64}$/u); expect(report?.tools.schemaChars).toBe(message?.schemaChars); }); @@ -6574,7 +6580,8 @@ describe("runCodexAppServerAttempt", () => { input?: Array<{ text?: string }>; }; expect(turnStartParams.input?.[0]?.text).toBe(exactCommand); - expect(result.systemPromptReport?.skills).toEqual({ promptChars: 0, entries: [] }); + expect(result.systemPromptReport?.skills).toMatchObject({ promptChars: 0, entries: [] }); + expect(result.systemPromptReport?.skills.hash).toMatch(/^[a-f0-9]{64}$/u); }); it("fires llm_input, llm_output, and agent_end hooks for codex turns", async () => { diff --git a/extensions/codex/src/app-server/run-attempt.ts b/extensions/codex/src/app-server/run-attempt.ts index 968abe24fb66..8a782d736d5e 100644 --- a/extensions/codex/src/app-server/run-attempt.ts +++ b/extensions/codex/src/app-server/run-attempt.ts @@ -5095,6 +5095,7 @@ function buildCodexSystemPromptReport(params: { 
chars: params.developerInstructions.length, projectContextChars: 0, nonProjectContextChars: params.developerInstructions.length, + hash: sha256Text(params.developerInstructions), }, injectedWorkspaceFiles: buildCodexBootstrapInjectionStats({ bootstrapFiles: params.workspaceBootstrapContext.bootstrapFiles, @@ -5106,6 +5107,7 @@ function buildCodexSystemPromptReport(params: { }), skills: { promptChars: skillsPrompt.length, + hash: sha256Text(skillsPrompt), entries: buildCodexSkillReportEntries(skillsPrompt), }, tools: { @@ -5137,20 +5139,23 @@ function buildCodexToolReportEntry(tool: CodexDynamicToolSpec): CodexToolReportE return { name: tool.name, summaryChars: summary.length, + summaryHash: sha256Text(summary), schemaChars: 0, + schemaHash: stableJsonHash(null), propertiesCount: null, }; } return { name: tool.name, summaryChars: summary.length, + summaryHash: sha256Text(summary), ...buildCodexToolSchemaStats(tool.inputSchema), }; } function buildCodexToolSchemaStats( schema: JsonValue, -): Pick { +): Pick { const schemaChars = (() => { try { return JSON.stringify(schema).length; @@ -5162,10 +5167,34 @@ function buildCodexToolSchemaStats( isJsonObject(schema) && isJsonObject(schema.properties) ? schema.properties : null; return { schemaChars, + schemaHash: stableJsonHash(schema), propertiesCount: properties ? 
Object.keys(properties).length : null, }; } +function sha256Text(value: string): string { + return createHash("sha256").update(value).digest("hex"); +} + +function normalizeForStableHash(value: unknown): unknown { + if (Array.isArray(value)) { + return value.map((entry) => normalizeForStableHash(entry)); + } + if (value && typeof value === "object") { + const record = value as Record; + return Object.fromEntries( + Object.keys(record) + .toSorted((left, right) => left.localeCompare(right)) + .map((key) => [key, normalizeForStableHash(record[key])]), + ); + } + return value; +} + +function stableJsonHash(value: JsonValue): string { + return sha256Text(JSON.stringify(normalizeForStableHash(value)) ?? "null"); +} + function buildCodexBootstrapInjectionStats(params: { bootstrapFiles: CodexBootstrapFile[]; injectedFiles: EmbeddedContextFile[]; diff --git a/extensions/qa-lab/confidence-profiles/codex-100.json b/extensions/qa-lab/confidence-profiles/codex-100.json new file mode 100644 index 000000000000..05053ad13e70 --- /dev/null +++ b/extensions/qa-lab/confidence-profiles/codex-100.json @@ -0,0 +1,168 @@ +{ + "version": 1, + "profile": "codex-100", + "lanes": [ + { + "id": "tool-defaults-direct", + "title": "Tool-defaults direct runtime parity", + "kind": "qa-suite-summary", + "artifact": "tool-defaults-direct/qa-suite-summary.json", + "required": true, + "productImpact": "P2", + "qaImpact": "P0", + "issue": "https://github.com/openclaw/openclaw/issues/80319", + "ownerAction": "Fix product or harness before claiming the tool-defaults gate is trusted.", + "labels": ["qa-lab", "runtime-parity", "codex"] + }, + { + "id": "openclaw-dynamic-tools-direct", + "title": "OpenClaw dynamic integration tools direct runtime parity", + "kind": "qa-suite-summary", + "artifact": "openclaw-dynamic-tools-direct/qa-suite-summary.json", + "required": true, + "productImpact": "P1", + "qaImpact": "P0", + "issue": "https://github.com/openclaw/openclaw/issues/80319", + "ownerAction": 
"Investigate any hard failure as an OpenClaw dynamic integration or QA loading regression.", + "labels": ["qa-lab", "runtime-parity", "openclaw-dynamic-tools"] + }, + { + "id": "tool-defaults-searchable", + "title": "Tool-defaults searchable runtime parity", + "kind": "qa-suite-summary", + "artifact": "tool-defaults-searchable/qa-suite-summary.json", + "required": true, + "failureVerdict": "mock-limitation", + "skipBackfillLane": "openclaw-dynamic-tools-searchable-live", + "productImpact": "P4", + "qaImpact": "P2", + "issue": "https://github.com/openclaw/openclaw/issues/80319", + "ownerAction": "Keep as report-only until searchable/deferred tool modeling has no mock-only ambiguity.", + "labels": ["qa-lab", "runtime-parity", "searchable-tools"] + }, + { + "id": "first-hour-20-direct", + "title": "First-hour 20-turn direct runtime parity", + "kind": "qa-suite-summary", + "artifact": "first-hour-20-direct/qa-suite-summary.json", + "required": true, + "skipBackfillLane": "codex-native-live", + "productImpact": "P1", + "qaImpact": "P0", + "ownerAction": "Triage row-by-row; do not file product bugs unless live/native proof reproduces.", + "labels": ["qa-lab", "runtime-parity", "first-hour"] + }, + { + "id": "mock-token-efficiency", + "title": "Mock assistant-message token efficiency estimate", + "kind": "token-efficiency-summary", + "artifact": "first-hour-20-direct-report/qa-runtime-token-efficiency-summary.json", + "required": true, + "expectedTokenUsageSource": "mock-estimate", + "productImpact": "P4", + "qaImpact": "P1", + "ownerAction": "Fix labeling before trusting token-efficiency comparisons.", + "labels": ["qa-lab", "runtime-parity", "token-efficiency"] + }, + { + "id": "fault-injection-mock", + "title": "Mock fault-injection runtime parity", + "kind": "qa-suite-summary", + "artifact": "fault-injection-mock/qa-suite-summary.json", + "required": true, + "skipBackfillLane": "codex-native-live", + "productImpact": "P2", + "qaImpact": "P0", + "ownerAction": "Treat 
failures as retry/recovery regressions unless evidence shows fixture drift.", + "labels": ["qa-lab", "runtime-parity", "fault-injection"] + }, + { + "id": "jsonl-expanded", + "title": "Expanded curated JSONL replay", + "kind": "jsonl-replay-summary", + "artifact": "jsonl-expanded/qa-jsonl-replay-summary.json", + "required": true, + "productImpact": "P2", + "qaImpact": "P0", + "ownerAction": "Inspect first drift turn and transcript class before filing any product issue.", + "labels": ["qa-lab", "runtime-parity", "jsonl-replay"] + }, + { + "id": "confidence-self-test", + "title": "Seeded confidence negative controls", + "kind": "self-test-summary", + "artifact": "confidence-self-test/qa-confidence-self-test-summary.json", + "required": true, + "productImpact": "P4", + "qaImpact": "P0", + "ownerAction": "Fix the harness before trusting any green parity result.", + "labels": ["qa-lab", "confidence-gate", "negative-controls"] + }, + { + "id": "codex-native-live", + "title": "Codex-native live workspace capability proof", + "kind": "qa-suite-summary", + "artifact": "codex-native-live/qa-suite-summary.json", + "required": true, + "missingVerdict": "environment-blocked", + "missingReason": "Live/OAuth runner or OpenAI credentials were unavailable for this proof bundle.", + "productImpact": "P1", + "qaImpact": "P1", + "ownerAction": "Run with live-frontier OAuth before using this lane as product proof.", + "labels": ["qa-lab", "runtime-parity", "live-proof"] + }, + { + "id": "first-hour-live", + "title": "Live first-hour capability proof", + "kind": "qa-suite-summary", + "artifact": "first-hour-live/qa-suite-summary.json", + "required": true, + "missingVerdict": "environment-blocked", + "missingReason": "Live/OAuth runner or OpenAI credentials were unavailable for this proof bundle.", + "productImpact": "P1", + "qaImpact": "P1", + "ownerAction": "Run with live-frontier OAuth before claiming live first-hour coverage.", + "labels": ["qa-lab", "runtime-parity", "live-proof"] + 
}, + { + "id": "openclaw-dynamic-tools-searchable-live", + "title": "Live OpenClaw dynamic tools searchable proof", + "kind": "qa-suite-summary", + "artifact": "openclaw-dynamic-tools-searchable-live/qa-suite-summary.json", + "required": true, + "missingVerdict": "environment-blocked", + "missingReason": "Live/OAuth runner or OpenAI credentials were unavailable for this proof bundle.", + "productImpact": "P1", + "qaImpact": "P1", + "ownerAction": "Run with live-frontier OAuth before claiming production-shaped searchable OpenClaw dynamic tool coverage.", + "labels": ["qa-lab", "runtime-parity", "searchable-tools", "live-proof"] + }, + { + "id": "live-token-efficiency", + "title": "Live assistant-message token efficiency", + "kind": "token-efficiency-summary", + "artifact": "live-token-efficiency/qa-runtime-token-efficiency-summary.json", + "required": true, + "expectedTokenUsageSource": "live-usage", + "missingVerdict": "environment-blocked", + "missingReason": "Live/OAuth runner or OpenAI credentials were unavailable for this proof bundle.", + "productImpact": "P3", + "qaImpact": "P1", + "ownerAction": "Run a live-frontier runtime parity summary and regenerate token efficiency.", + "labels": ["qa-lab", "runtime-parity", "token-efficiency"] + }, + { + "id": "soak-100", + "title": "Optional 100-turn soak", + "kind": "qa-suite-summary", + "artifact": "soak-100/qa-suite-summary.json", + "required": true, + "missingVerdict": "environment-blocked", + "missingReason": "Scheduled/Testbox soak runner did not upload artifacts for this proof bundle.", + "productImpact": "P3", + "qaImpact": "P2", + "ownerAction": "Run remotely with a long timeout or record the runner budget blocker.", + "labels": ["qa-lab", "runtime-parity", "soak"] + } + ] +} diff --git a/extensions/qa-lab/src/cli.runtime.ts b/extensions/qa-lab/src/cli.runtime.ts index b1ed377da6ae..db6bc12f3ac9 100644 --- a/extensions/qa-lab/src/cli.runtime.ts +++ b/extensions/qa-lab/src/cli.runtime.ts @@ -13,6 +13,12 @@ 
import { import { resolveQaParityPackScenarioIds } from "./agentic-parity.js"; import { runQaCharacterEval, type QaCharacterModelOptions } from "./character-eval.js"; import { resolveRepoRelativeOutputDir } from "./cli-paths.js"; +import { + buildQaConfidenceReport, + readQaConfidenceManifestFile, + renderQaConfidenceMarkdownReport, + writeQaConfidenceSelfTestArtifacts, +} from "./confidence-report.js"; import { buildQaCoverageInventory, findQaScenarioMatches, @@ -786,6 +792,60 @@ export async function runQaParityReportCommand(opts: { } } +export async function runQaConfidenceReportCommand(opts: { + repoRoot?: string; + manifest: string; + artifactRoot?: string; + outputDir?: string; + strictZeroUnknowns?: boolean; + strictGlobalPass?: boolean; +}) { + const repoRoot = path.resolve(opts.repoRoot ?? process.cwd()); + const manifestPath = path.resolve(repoRoot, opts.manifest); + const artifactRoot = path.resolve(repoRoot, opts.artifactRoot ?? "."); + const outputDir = + resolveRepoRelativeOutputDir(repoRoot, opts.outputDir) ?? + path.join(repoRoot, ".artifacts", "qa-e2e", `confidence-${Date.now().toString(36)}`); + await fs.mkdir(outputDir, { recursive: true }); + const manifest = await readQaConfidenceManifestFile(manifestPath); + const reportPayload = await buildQaConfidenceReport({ + manifest, + artifactRoot, + strictZeroUnknowns: opts.strictZeroUnknowns === true, + strictGlobalPass: opts.strictGlobalPass === true, + }); + const report = renderQaConfidenceMarkdownReport(reportPayload); + const reportPath = path.join(outputDir, "qa-confidence-report.md"); + const summaryPath = path.join(outputDir, "qa-confidence-summary.json"); + await fs.writeFile(reportPath, report, "utf8"); + await fs.writeFile(summaryPath, `${JSON.stringify(reportPayload, null, 2)}\n`, "utf8"); + process.stdout.write(`QA confidence report: ${reportPath}\n`); + process.stdout.write(`QA confidence summary: ${summaryPath}\n`); + process.stdout.write(`QA confidence verdict: ${reportPayload.pass ? 
"pass" : "fail"}\n`); + if (!reportPayload.pass) { + process.exitCode = 1; + } +} + +export async function runQaConfidenceSelfTestCommand(opts: { + repoRoot?: string; + outputDir?: string; +}) { + const repoRoot = path.resolve(opts.repoRoot ?? process.cwd()); + const outputDir = + resolveRepoRelativeOutputDir(repoRoot, opts.outputDir) ?? + path.join(repoRoot, ".artifacts", "qa-e2e", `confidence-self-test-${Date.now().toString(36)}`); + const result = await writeQaConfidenceSelfTestArtifacts({ outputDir }); + process.stdout.write(`QA confidence self-test report: ${result.reportPath}\n`); + process.stdout.write(`QA confidence self-test summary: ${result.summaryPath}\n`); + process.stdout.write( + `QA confidence self-test verdict: ${result.summary.pass ? "pass" : "fail"}\n`, + ); + if (!result.summary.pass) { + process.exitCode = 1; + } +} + export async function runQaCoverageReportCommand(opts: { repoRoot?: string; output?: string; diff --git a/extensions/qa-lab/src/cli.ts b/extensions/qa-lab/src/cli.ts index bc156636bfd9..d34c5460831d 100644 --- a/extensions/qa-lab/src/cli.ts +++ b/extensions/qa-lab/src/cli.ts @@ -72,6 +72,23 @@ async function runQaParityReport(opts: { await runtime.runQaParityReportCommand(opts); } +async function runQaConfidenceReport(opts: { + repoRoot?: string; + manifest: string; + artifactRoot?: string; + outputDir?: string; + strictZeroUnknowns?: boolean; + strictGlobalPass?: boolean; +}) { + const runtime = await loadQaLabCliRuntime(); + await runtime.runQaConfidenceReportCommand(opts); +} + +async function runQaConfidenceSelfTest(opts: { repoRoot?: string; outputDir?: string }) { + const runtime = await loadQaLabCliRuntime(); + await runtime.runQaConfidenceSelfTestCommand(opts); +} + async function runQaCoverageReport(opts: { repoRoot?: string; output?: string; @@ -424,6 +441,43 @@ export function registerQaLabCli(program: Command) { }, ); + qa.command("confidence-report") + .description("Classify QA proof artifacts into a zero-unknown 
confidence report") + .requiredOption("--manifest ", "Confidence profile manifest JSON") + .option("--repo-root ", "Repository root to target when running from a neutral cwd") + .option("--artifact-root ", "Root directory for relative artifact paths", ".") + .option("--output-dir ", "Artifact directory for the confidence report") + .option( + "--strict-zero-unknowns", + "Fail unless every lane passes or has an explicit non-unknown verdict", + false, + ) + .option( + "--strict-global-pass", + "Fail unless every lane passes with no blocked, missing, unknown, classified-fail, or unbackfilled skipped rows", + false, + ) + .action( + async (opts: { + repoRoot?: string; + manifest: string; + artifactRoot?: string; + outputDir?: string; + strictZeroUnknowns?: boolean; + strictGlobalPass?: boolean; + }) => { + await runQaConfidenceReport(opts); + }, + ); + + qa.command("confidence-self-test") + .description("Write seeded negative-control canaries proving the confidence gate detects drift") + .option("--repo-root ", "Repository root to target when running from a neutral cwd") + .option("--output-dir ", "Artifact directory for the confidence self-test") + .action(async (opts: { repoRoot?: string; outputDir?: string }) => { + await runQaConfidenceSelfTest(opts); + }); + qa.command("jsonl-replay") .description("Replay curated JSONL transcripts through the runtime parity replay harness") .option("--repo-root ", "Repository root to target when running from a neutral cwd") diff --git a/extensions/qa-lab/src/confidence-report.test.ts b/extensions/qa-lab/src/confidence-report.test.ts new file mode 100644 index 000000000000..47cbcdadc9f5 --- /dev/null +++ b/extensions/qa-lab/src/confidence-report.test.ts @@ -0,0 +1,881 @@ +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { afterEach, beforeEach, describe, expect, it } from "vitest"; +import { + buildQaConfidenceReport, + buildQaConfidenceSelfTestSummary, + 
renderQaConfidenceMarkdownReport, + writeQaConfidenceSelfTestArtifacts, + type QaConfidenceManifest, +} from "./confidence-report.js"; + +describe("qa confidence report", () => { + let tempRoot: string; + + beforeEach(async () => { + tempRoot = await fs.mkdtemp(path.join(os.tmpdir(), "qa-confidence-")); + }); + + afterEach(async () => { + await fs.rm(tempRoot, { recursive: true, force: true }); + }); + + async function writeJson(relativePath: string, payload: unknown) { + const filePath = path.join(tempRoot, relativePath); + await fs.mkdir(path.dirname(filePath), { recursive: true }); + await fs.writeFile(filePath, `${JSON.stringify(payload, null, 2)}\n`, "utf8"); + return filePath; + } + + it("passes strict zero-unknowns when every lane passes or has an allowed blocked verdict", async () => { + await writeJson("tool-defaults/qa-suite-summary.json", { + counts: { total: 20, passed: 18, skipped: 2, failed: 0 }, + scenarios: [], + }); + await writeJson("token/qa-runtime-token-efficiency-summary.json", { + status: "estimated", + pass: true, + rows: [{ scenarioId: "one", usageSource: "mock-estimate" }], + }); + + const manifest: QaConfidenceManifest = { + version: 1, + profile: "codex-100", + lanes: [ + { + id: "tool-defaults-direct", + title: "Tool defaults direct", + kind: "qa-suite-summary", + artifact: "tool-defaults/qa-suite-summary.json", + required: true, + }, + { + id: "mock-token-efficiency", + title: "Mock token efficiency", + kind: "token-efficiency-summary", + artifact: "token/qa-runtime-token-efficiency-summary.json", + required: true, + expectedTokenUsageSource: "mock-estimate", + }, + { + id: "live-token-efficiency", + title: "Live token efficiency", + kind: "token-efficiency-summary", + artifact: "live/qa-runtime-token-efficiency-summary.json", + required: true, + missingVerdict: "environment-blocked", + missingReason: "OPENAI OAuth credentials are not available in this runner.", + }, + ], + }; + + const report = await buildQaConfidenceReport({ + 
manifest, + artifactRoot: tempRoot, + strictZeroUnknowns: true, + generatedAt: "2026-05-12T00:00:00.000Z", + }); + + expect(report.pass).toBe(true); + expect(report.globalPass).toBe(false); + expect(report.counts).toMatchObject({ passed: 2, blocked: 1, unknown: 0, failed: 0 }); + expect(report.lanes.map((lane) => lane.verdict)).toEqual([ + "pass", + "pass", + "environment-blocked", + ]); + expect(report.lanes[0]?.artifactPath).toBe("tool-defaults/qa-suite-summary.json"); + expect(report.lanes[0]?.artifactPath).not.toContain(tempRoot); + expect(report.lanes[0]?.details).toContain("counts.skipped=2"); + expect(renderQaConfidenceMarkdownReport(report)).toContain("Zero unknowns: yes"); + expect(renderQaConfidenceMarkdownReport(report)).toContain("Global pass: no"); + }); + + it("does not let optional lanes block strict gates", async () => { + await writeJson("required/qa-suite-summary.json", { + counts: { total: 1, passed: 1, skipped: 0, failed: 0 }, + scenarios: [], + }); + + const report = await buildQaConfidenceReport({ + manifest: { + version: 1, + profile: "codex-100", + lanes: [ + { + id: "required", + title: "Required", + kind: "qa-suite-summary", + artifact: "required/qa-suite-summary.json", + required: true, + }, + { + id: "optional-missing", + title: "Optional missing", + kind: "qa-suite-summary", + artifact: "optional/qa-suite-summary.json", + required: false, + }, + ], + }, + artifactRoot: tempRoot, + strictZeroUnknowns: true, + strictGlobalPass: true, + generatedAt: "2026-05-13T00:00:00.000Z", + }); + + expect(report.pass).toBe(true); + expect(report.counts).toMatchObject({ total: 1, passed: 1, unknown: 0 }); + expect(report.failures).toEqual([]); + expect(report.lanes[1]).toMatchObject({ id: "optional-missing", status: "missing" }); + }); + + it("fails strict global pass when any lane is blocked, missing, unknown, or classified failed", async () => { + await writeJson("classified/qa-suite-summary.json", { + counts: { total: 1, passed: 0, skipped: 0, 
failed: 1 }, + scenarios: [{ name: "classified", status: "fail" }], + }); + await writeJson("unknown/qa-suite-summary.json", { + counts: { total: 1, passed: 0, skipped: 0, failed: 1 }, + scenarios: [{ name: "unknown", status: "fail" }], + }); + + const report = await buildQaConfidenceReport({ + manifest: { + version: 1, + profile: "codex-100", + lanes: [ + { + id: "blocked-live", + title: "Blocked live", + kind: "qa-suite-summary", + artifact: "live/qa-suite-summary.json", + required: true, + missingVerdict: "environment-blocked", + missingReason: "OPENAI_API_KEY missing.", + }, + { + id: "missing-soak", + title: "Missing soak", + kind: "qa-suite-summary", + artifact: "soak/qa-suite-summary.json", + required: true, + }, + { + id: "classified-fixture", + title: "Classified fixture", + kind: "qa-suite-summary", + artifact: "classified/qa-suite-summary.json", + required: true, + failureVerdict: "fixture-bug", + }, + { + id: "unknown-failure", + title: "Unknown failure", + kind: "qa-suite-summary", + artifact: "unknown/qa-suite-summary.json", + required: true, + }, + ], + }, + artifactRoot: tempRoot, + strictZeroUnknowns: true, + strictGlobalPass: true, + generatedAt: "2026-05-12T00:00:00.000Z", + }); + + expect(report.pass).toBe(false); + expect(report.zeroUnknowns).toBe(false); + expect(report.globalPass).toBe(false); + expect(report.counts).toMatchObject({ + blocked: 1, + missing: 1, + failed: 1, + unknown: 2, + }); + expect(report.failures).toEqual([ + "blocked-live is blocked: OPENAI_API_KEY missing.", + "missing-soak is missing: artifact missing and no missingVerdict was configured", + "classified-fixture is classified fixture-bug: qa-suite-summary counts.failed=1 counts.total=1 counts.skipped=0", + "unknown-failure is unclassified: qa-suite-summary counts.failed=1 counts.total=1 counts.skipped=0", + ]); + }); + + it("fails strict global pass for skipped suite rows until a backfill lane passes", async () => { + await writeJson("report-only/qa-suite-summary.json", 
{ + counts: { total: 3, passed: 2, skipped: 1, failed: 0 }, + scenarios: [], + }); + + const report = await buildQaConfidenceReport({ + manifest: { + version: 1, + profile: "codex-100", + lanes: [ + { + id: "report-only", + title: "Report-only", + kind: "qa-suite-summary", + artifact: "report-only/qa-suite-summary.json", + required: true, + }, + ], + }, + artifactRoot: tempRoot, + strictZeroUnknowns: true, + strictGlobalPass: true, + generatedAt: "2026-05-12T00:00:00.000Z", + }); + + expect(report.zeroUnknowns).toBe(true); + expect(report.globalPass).toBe(false); + expect(report.failures).toEqual([ + "report-only has 1 skipped row(s) with no passing backfill lane", + ]); + }); + + it("infers skipped suite rows from totals and scenario status", async () => { + for (const [artifact, expectedDetail] of [ + [{ counts: { total: 3, passed: 2, failed: 0 }, scenarios: [] }, "counts.skipped=1"], + [ + { + counts: { total: 2, passed: 2, failed: 0 }, + scenarios: [ + { name: "passing", status: "pass" }, + { name: "skipped", status: "skip" }, + ], + }, + "counts.skipped=1", + ], + ] as const) { + await writeJson("report-only/qa-suite-summary.json", artifact); + + const report = await buildQaConfidenceReport({ + manifest: { + version: 1, + profile: "codex-100", + lanes: [ + { + id: "report-only", + title: "Report-only", + kind: "qa-suite-summary", + artifact: "report-only/qa-suite-summary.json", + required: true, + }, + ], + }, + artifactRoot: tempRoot, + strictZeroUnknowns: true, + strictGlobalPass: true, + generatedAt: "2026-05-12T00:00:00.000Z", + }); + + expect(report.globalPass).toBe(false); + expect(report.failures).toEqual([ + "report-only has 1 skipped row(s) with no passing backfill lane", + ]); + expect(report.lanes[0]).toMatchObject({ skippedCount: 1 }); + expect(report.lanes[0]?.details).toContain(expectedDetail); + } + }); + + it("rejects skipped token reports when a live usage source is required", async () => { + await 
writeJson("live-token/qa-runtime-token-efficiency-summary.json", { + status: "skipped", + pass: true, + rows: [], + }); + + const report = await buildQaConfidenceReport({ + manifest: { + version: 1, + profile: "codex-100", + lanes: [ + { + id: "live-token-efficiency", + title: "Live token efficiency", + kind: "token-efficiency-summary", + artifact: "live-token/qa-runtime-token-efficiency-summary.json", + required: true, + expectedTokenUsageSource: "live-usage", + }, + ], + }, + artifactRoot: tempRoot, + strictZeroUnknowns: true, + generatedAt: "2026-05-12T00:00:00.000Z", + }); + + expect(report.pass).toBe(false); + expect(report.lanes[0]).toMatchObject({ + status: "unknown", + details: "token summary has no live-usage rows", + }); + }); + + it("preserves partial zero-unknown mode for classified failing lanes", async () => { + await writeJson("classified/qa-suite-summary.json", { + counts: { total: 1, passed: 0, skipped: 0, failed: 1 }, + scenarios: [{ name: "classified", status: "fail" }], + }); + + const report = await buildQaConfidenceReport({ + manifest: { + version: 1, + profile: "codex-100", + lanes: [ + { + id: "classified-fixture", + title: "Classified fixture", + kind: "qa-suite-summary", + artifact: "classified/qa-suite-summary.json", + required: true, + failureVerdict: "fixture-bug", + }, + ], + }, + artifactRoot: tempRoot, + strictZeroUnknowns: true, + generatedAt: "2026-05-12T00:00:00.000Z", + }); + + expect(report.pass).toBe(true); + expect(report.zeroUnknowns).toBe(true); + expect(report.globalPass).toBe(false); + expect(report.counts.failed).toBe(1); + }); + + it("passes strict global pass when skipped suite rows are backfilled by a passing lane", async () => { + await writeJson("report-only/qa-suite-summary.json", { + counts: { total: 3, passed: 2, skipped: 1, failed: 0 }, + scenarios: [], + }); + await writeJson("live-backfill/qa-suite-summary.json", { + counts: { total: 1, passed: 1, skipped: 0, failed: 0 }, + scenarios: [], + }); + + const report 
= await buildQaConfidenceReport({ + manifest: { + version: 1, + profile: "codex-100", + lanes: [ + { + id: "report-only", + title: "Report-only", + kind: "qa-suite-summary", + artifact: "report-only/qa-suite-summary.json", + required: true, + skipBackfillLane: "live-backfill", + }, + { + id: "live-backfill", + title: "Live backfill", + kind: "qa-suite-summary", + artifact: "live-backfill/qa-suite-summary.json", + required: true, + }, + ], + }, + artifactRoot: tempRoot, + strictZeroUnknowns: true, + strictGlobalPass: true, + generatedAt: "2026-05-12T00:00:00.000Z", + }); + + expect(report.pass).toBe(true); + expect(report.zeroUnknowns).toBe(true); + expect(report.globalPass).toBe(true); + expect(report.lanes[0]).toMatchObject({ + skippedCount: 1, + skipBackfillLane: "live-backfill", + skipBackfilled: true, + }); + }); + + it("classifies environment-blocking gateway sentinels without turning them into unknowns", async () => { + await writeJson("live/qa-suite-summary.json", { + counts: { total: 1, passed: 1, skipped: 0, failed: 0 }, + gatewayLogSentinels: [ + { + kind: "live-quota-or-subscription", + verdict: "environment-blocked", + owner: "environment", + productImpact: "P4", + qaImpact: "P0", + line: 12, + text: "OpenAI quota exceeded", + }, + ], + scenarios: [], + }); + + const report = await buildQaConfidenceReport({ + manifest: { + version: 1, + profile: "codex-100", + lanes: [ + { + id: "first-hour-live", + title: "First hour live", + kind: "qa-suite-summary", + artifact: "live/qa-suite-summary.json", + required: true, + }, + ], + }, + artifactRoot: tempRoot, + strictZeroUnknowns: true, + generatedAt: "2026-05-13T00:00:00.000Z", + }); + + expect(report.pass).toBe(true); + expect(report.globalPass).toBe(false); + expect(report.counts).toMatchObject({ blocked: 1, unknown: 0 }); + expect(report.lanes[0]).toMatchObject({ + status: "blocked", + verdict: "environment-blocked", + }); + }); + + it("does not let environment sentinels hide separate suite failures", async 
() => { + await writeJson("live/qa-suite-summary.json", { + counts: { total: 2, passed: 1, skipped: 0, failed: 1 }, + gatewayLogSentinels: [ + { + kind: "live-quota-or-subscription", + verdict: "environment-blocked", + owner: "environment", + line: 12, + text: "OpenAI quota exceeded", + }, + ], + scenarios: [ + { name: "quota", status: "pass" }, + { name: "unrelated-drift", status: "fail" }, + ], + }); + + const report = await buildQaConfidenceReport({ + manifest: { + version: 1, + profile: "codex-100", + lanes: [ + { + id: "first-hour-live", + title: "First hour live", + kind: "qa-suite-summary", + artifact: "live/qa-suite-summary.json", + required: true, + missingVerdict: "environment-blocked", + }, + ], + }, + artifactRoot: tempRoot, + strictZeroUnknowns: true, + generatedAt: "2026-05-13T00:00:00.000Z", + }); + + expect(report.pass).toBe(false); + expect(report.counts).toMatchObject({ blocked: 0, unknown: 1 }); + expect(report.lanes[0]).toMatchObject({ status: "unknown" }); + expect(report.lanes[0]?.details).toContain("suite also reports failures"); + }); + + it("classifies product and plugin gateway sentinels as known failing lanes", async () => { + await writeJson("live/qa-suite-summary.json", { + counts: { total: 1, passed: 1, skipped: 0, failed: 0 }, + scenarios: [ + { + name: "plugin hook health sentinel", + status: "pass", + steps: [], + runtimeParity: { + scenarioId: "plugin-hook-health-sentinel", + drift: "none", + cells: { + pi: { sentinelFindings: [] }, + codex: { + sentinelFindings: [ + { + kind: "plugin-hook-failure", + verdict: "qa-harness-bug", + owner: "plugin", + productImpact: "P1", + qaImpact: "P0", + line: 4, + text: "before_prompt_build hook failed", + }, + ], + }, + }, + }, + }, + ], + }); + + const report = await buildQaConfidenceReport({ + manifest: { + version: 1, + profile: "codex-100", + lanes: [ + { + id: "first-hour-live", + title: "First hour live", + kind: "qa-suite-summary", + artifact: "live/qa-suite-summary.json", + required: 
true, + }, + ], + }, + artifactRoot: tempRoot, + strictZeroUnknowns: true, + generatedAt: "2026-05-13T00:00:00.000Z", + }); + + expect(report.pass).toBe(true); + expect(report.globalPass).toBe(false); + expect(report.counts).toMatchObject({ failed: 1, unknown: 0 }); + expect(report.lanes[0]).toMatchObject({ + status: "fail", + verdict: "qa-harness-bug", + }); + }); + + it("treats corrupt artifacts as unknown instead of allowed missing lanes", async () => { + const artifactPath = path.join(tempRoot, "live", "qa-suite-summary.json"); + await fs.mkdir(path.dirname(artifactPath), { recursive: true }); + await fs.writeFile(artifactPath, "{not-json", "utf8"); + + const report = await buildQaConfidenceReport({ + manifest: { + version: 1, + profile: "codex-100", + lanes: [ + { + id: "first-hour-live", + title: "First hour live", + kind: "qa-suite-summary", + artifact: "live/qa-suite-summary.json", + required: true, + missingVerdict: "environment-blocked", + }, + ], + }, + artifactRoot: tempRoot, + strictZeroUnknowns: true, + generatedAt: "2026-05-13T00:00:00.000Z", + }); + + expect(report.pass).toBe(false); + expect(report.counts).toMatchObject({ blocked: 0, unknown: 1 }); + expect(report.lanes[0]).toMatchObject({ + status: "unknown", + }); + expect(report.lanes[0]?.details).toContain("artifact unreadable"); + }); + + it("treats schema-invalid suite artifacts as unknown", async () => { + await writeJson("live/qa-suite-summary.json", {}); + + const report = await buildQaConfidenceReport({ + manifest: { + version: 1, + profile: "codex-100", + lanes: [ + { + id: "first-hour-live", + title: "First hour live", + kind: "qa-suite-summary", + artifact: "live/qa-suite-summary.json", + required: true, + }, + ], + }, + artifactRoot: tempRoot, + strictZeroUnknowns: true, + generatedAt: "2026-05-13T00:00:00.000Z", + }); + + expect(report.pass).toBe(false); + expect(report.counts.unknown).toBe(1); + expect(report.lanes[0]?.details).toContain("missing counts.failed and scenarios[]"); + 
}); + + it("treats empty suite artifacts as unknown", async () => { + await writeJson("live/qa-suite-summary.json", { + counts: { total: 0, passed: 0, skipped: 0, failed: 0 }, + scenarios: [], + }); + + const report = await buildQaConfidenceReport({ + manifest: { + version: 1, + profile: "codex-100", + lanes: [ + { + id: "first-hour-live", + title: "First hour live", + kind: "qa-suite-summary", + artifact: "live/qa-suite-summary.json", + required: true, + failureVerdict: "qa-harness-bug", + }, + ], + }, + artifactRoot: tempRoot, + strictZeroUnknowns: true, + generatedAt: "2026-05-13T00:00:00.000Z", + }); + + expect(report.pass).toBe(false); + expect(report.counts).toMatchObject({ failed: 0, unknown: 1 }); + expect(report.lanes[0]).toMatchObject({ status: "unknown" }); + expect(report.lanes[0]?.details).toContain("no executed scenarios"); + }); + + it("treats suite count and scenario mismatches as unknown", async () => { + await writeJson("live/qa-suite-summary.json", { + counts: { total: 2, passed: 2, skipped: 0, failed: 0 }, + scenarios: [ + { name: "passing", status: "pass" }, + { name: "stale-failure", status: "fail" }, + ], + }); + + const report = await buildQaConfidenceReport({ + manifest: { + version: 1, + profile: "codex-100", + lanes: [ + { + id: "first-hour-live", + title: "First hour live", + kind: "qa-suite-summary", + artifact: "live/qa-suite-summary.json", + required: true, + failureVerdict: "qa-harness-bug", + }, + ], + }, + artifactRoot: tempRoot, + strictZeroUnknowns: true, + generatedAt: "2026-05-13T00:00:00.000Z", + }); + + expect(report.pass).toBe(false); + expect(report.counts).toMatchObject({ failed: 0, unknown: 1 }); + expect(report.lanes[0]).toMatchObject({ status: "unknown" }); + expect(report.lanes[0]?.details).toContain("count/scenario mismatch"); + }); + + it("requires generic summary lanes to expose an explicit pass signal", async () => { + await writeJson("runtime/qa-runtime-parity-summary.json", {}); + + const report = await 
buildQaConfidenceReport({ + manifest: { + version: 1, + profile: "codex-100", + lanes: [ + { + id: "runtime-parity", + title: "Runtime parity", + kind: "runtime-parity-summary", + artifact: "runtime/qa-runtime-parity-summary.json", + required: true, + }, + ], + }, + artifactRoot: tempRoot, + strictZeroUnknowns: true, + generatedAt: "2026-05-13T00:00:00.000Z", + }); + + expect(report.pass).toBe(false); + expect(report.counts.unknown).toBe(1); + expect(report.lanes[0]?.details).toContain("explicit pass signal"); + }); + + it("requires JSONL replay summaries to contain replayed user turns", async () => { + for (const [artifact, expectedDetail] of [ + [{ transcripts: [] }, "no transcripts"], + [ + { transcripts: [{ transcriptPath: "empty.jsonl", userTurnCount: 0, drift: [] }] }, + "no replayed user turns", + ], + [ + { transcripts: [{ transcriptPath: "missing-drift.jsonl", userTurnCount: 1 }] }, + "missing drift array", + ], + ] as const) { + await writeJson("jsonl/qa-jsonl-replay-summary.json", artifact); + + const report = await buildQaConfidenceReport({ + manifest: { + version: 1, + profile: "codex-100", + lanes: [ + { + id: "jsonl-expanded", + title: "Expanded JSONL replay", + kind: "jsonl-replay-summary", + artifact: "jsonl/qa-jsonl-replay-summary.json", + required: true, + failureVerdict: "fixture-bug", + }, + ], + }, + artifactRoot: tempRoot, + strictZeroUnknowns: true, + generatedAt: "2026-05-13T00:00:00.000Z", + }); + + expect(report.pass).toBe(false); + expect(report.counts).toMatchObject({ failed: 0, unknown: 1 }); + expect(report.lanes[0]).toMatchObject({ status: "unknown" }); + expect(report.lanes[0]?.details).toContain(expectedDetail); + } + }); + + it("requires confidence self-test summaries to contain every seeded canary", async () => { + for (const [artifact, expectedDetail] of [ + [{ pass: true, canaries: [] }, "no canaries"], + [ + { pass: true, canaries: [{ id: "prompt-drift", detected: true }] }, + "missing expected canaries", + ], + ] as const) { 
+ await writeJson("confidence-self-test/qa-confidence-self-test-summary.json", artifact); + + const report = await buildQaConfidenceReport({ + manifest: { + version: 1, + profile: "codex-100", + lanes: [ + { + id: "confidence-self-test", + title: "Confidence self-test", + kind: "self-test-summary", + artifact: "confidence-self-test/qa-confidence-self-test-summary.json", + required: true, + failureVerdict: "qa-harness-bug", + }, + ], + }, + artifactRoot: tempRoot, + strictZeroUnknowns: true, + generatedAt: "2026-05-13T00:00:00.000Z", + }); + + expect(report.pass).toBe(false); + expect(report.counts).toMatchObject({ failed: 0, unknown: 1 }); + expect(report.lanes[0]).toMatchObject({ status: "unknown" }); + expect(report.lanes[0]?.details).toContain(expectedDetail); + } + }); + + it("fails strict zero-unknowns for an unclassified failing lane", async () => { + await writeJson("first-hour/qa-suite-summary.json", { + counts: { total: 18, passed: 17, failed: 1 }, + scenarios: [{ name: "approval-turn-tool-followthrough", status: "fail", steps: [] }], + }); + + const report = await buildQaConfidenceReport({ + manifest: { + version: 1, + profile: "codex-100", + lanes: [ + { + id: "first-hour-20-direct", + title: "First-hour 20 direct", + kind: "qa-suite-summary", + artifact: "first-hour/qa-suite-summary.json", + required: true, + }, + ], + }, + artifactRoot: tempRoot, + strictZeroUnknowns: true, + generatedAt: "2026-05-12T00:00:00.000Z", + }); + + expect(report.pass).toBe(false); + expect(report.counts.unknown).toBe(1); + expect(report.failures[0]).toContain("first-hour-20-direct is unclassified"); + }); + + it("accepts a classified failing lane without treating it as unknown", async () => { + await writeJson("jsonl/qa-jsonl-replay-summary.json", { + transcripts: [ + { + transcriptPath: "curated.jsonl", + userTurnCount: 2, + drift: ["none", "tool-result-shape"], + firstDriftAtTurn: 2, + }, + ], + }); + + const report = await buildQaConfidenceReport({ + manifest: { + 
version: 1, + profile: "codex-100", + lanes: [ + { + id: "jsonl-expanded", + title: "Expanded JSONL replay", + kind: "jsonl-replay-summary", + artifact: "jsonl/qa-jsonl-replay-summary.json", + required: true, + failureVerdict: "fixture-bug", + productImpact: "P4", + qaImpact: "P1", + }, + ], + }, + artifactRoot: tempRoot, + strictZeroUnknowns: true, + generatedAt: "2026-05-12T00:00:00.000Z", + }); + + expect(report.pass).toBe(true); + expect(report.globalPass).toBe(false); + expect(report.counts.failed).toBe(1); + expect(report.counts.unknown).toBe(0); + expect(report.lanes[0]).toMatchObject({ + status: "fail", + verdict: "fixture-bug", + productImpact: "P4", + qaImpact: "P1", + }); + }); + + it("emits confidence self-test canaries for every drift class we need to catch", async () => { + const summary = await buildQaConfidenceSelfTestSummary("2026-05-12T00:00:00.000Z"); + + expect(summary.pass).toBe(true); + expect(summary.canaries.map((canary) => canary.id)).toEqual([ + "prompt-drift", + "tool-description-schema-drift", + "runtime-tool-call-drop", + "tool-result-mismatch", + "failure-mode-drift", + "token-efficiency-regression", + "jsonl-replay-ordering-drift", + ]); + expect(summary.canaries.every((canary) => canary.detected)).toBe(true); + }); + + it("writes confidence self-test artifacts", async () => { + const result = await writeQaConfidenceSelfTestArtifacts({ + outputDir: tempRoot, + generatedAt: "2026-05-12T00:00:00.000Z", + }); + + await expect(fs.stat(result.summaryPath)).resolves.toBeTruthy(); + await expect(fs.stat(result.reportPath)).resolves.toBeTruthy(); + const summary = JSON.parse(await fs.readFile(result.summaryPath, "utf8")) as { pass: boolean }; + expect(summary.pass).toBe(true); + }); +}); diff --git a/extensions/qa-lab/src/confidence-report.ts b/extensions/qa-lab/src/confidence-report.ts new file mode 100644 index 000000000000..285a6606baa8 --- /dev/null +++ b/extensions/qa-lab/src/confidence-report.ts @@ -0,0 +1,1238 @@ +import fs from 
"node:fs/promises"; +import path from "node:path"; +import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime"; +import { + formatGatewayLogSentinelSummary, + type GatewayLogSentinelFinding, +} from "./gateway-log-sentinel.js"; +import { + buildHarnessParityCell, + buildHarnessParityResult, + type HarnessParityDrift, + type HarnessRuntimeParityCell, + type RuntimeParitySystemPromptReport, +} from "./harness-parity.js"; +import { + runRuntimeParityScenario, + type RuntimeParityCell, + type RuntimeParityDrift, + type RuntimeParityResult, + type RuntimeParityToolCall, +} from "./runtime-parity.js"; +import { buildTokenEfficiencyReport } from "./token-efficiency-report.js"; + +export const QA_CONFIDENCE_VERDICTS = [ + "pass", + "product-bug", + "qa-harness-bug", + "fixture-bug", + "optional-gap", + "mock-limitation", + "environment-blocked", +] as const; + +export type QaConfidenceVerdict = (typeof QA_CONFIDENCE_VERDICTS)[number]; + +export type QaConfidenceLaneKind = + | "qa-suite-summary" + | "runtime-parity-summary" + | "harness-parity-summary" + | "token-efficiency-summary" + | "jsonl-replay-summary" + | "self-test-summary" + | "generic-pass-summary"; + +export type QaConfidenceManifestLane = { + id: string; + title: string; + kind: QaConfidenceLaneKind; + artifact: string; + required: boolean; + failureVerdict?: Exclude; + missingVerdict?: "environment-blocked" | "optional-gap"; + missingReason?: string; + expectedTokenUsageSource?: "mock-estimate" | "live-usage"; + skipBackfillLane?: string; + productImpact?: string; + qaImpact?: string; + issue?: string; + ownerAction?: string; + labels?: string[]; +}; + +export type QaConfidenceManifest = { + version: 1; + profile: string; + lanes: QaConfidenceManifestLane[]; +}; + +export type QaConfidenceLaneStatus = "pass" | "fail" | "blocked" | "missing" | "unknown"; + +export type QaConfidenceLaneResult = { + id: string; + title: string; + kind: QaConfidenceLaneKind; + artifact: string; + artifactPath: string; 
/**
 * Reads a non-blank string from an untyped value.
 *
 * @param value - Any value, typically a field of a parsed JSON object.
 * @returns The value with surrounding whitespace removed, or `undefined` when
 *   it is not a string or contains only whitespace.
 */
function readString(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : undefined;
}
/**
 * Reads an array whose entries are all strings.
 *
 * @param value - Any value, typically a field of a parsed JSON object.
 * @returns A new array holding the same strings, or `undefined` when the value
 *   is not an array or any entry (including a hole) is not a string.
 */
function readStringArray(value: unknown): string[] | undefined {
  if (!Array.isArray(value)) {
    return undefined;
  }
  const strings: string[] = [];
  // Index-based walk so that holes in a sparse array read as `undefined` and
  // disqualify the array, rather than being silently skipped.
  for (let index = 0; index < value.length; index += 1) {
    const entry: unknown = value[index];
    if (typeof entry !== "string") {
      return undefined;
    }
    strings.push(entry);
  }
  return strings;
}
/**
 * Validates the `kind` field of a manifest lane.
 *
 * @param value - Raw `kind` value taken from the manifest lane object.
 * @returns The value narrowed to one of the supported lane kinds.
 * @throws Error when the value is not a non-blank string or names a kind that
 *   is not supported; the message includes the offending text, or `missing`.
 */
function readLaneKind(value: unknown): QaConfidenceLaneKind {
  const supportedKinds: readonly QaConfidenceLaneKind[] = [
    "qa-suite-summary",
    "runtime-parity-summary",
    "harness-parity-summary",
    "token-efficiency-summary",
    "jsonl-replay-summary",
    "self-test-summary",
    "generic-pass-summary",
  ];
  const text = readString(value);
  const kind = supportedKinds.find((candidate) => candidate === text);
  if (kind === undefined) {
    throw new Error(`unknown confidence manifest lane kind: ${text ?? "missing"}`);
  }
  return kind;
}
/**
 * Validates an untyped value as a confidence manifest.
 *
 * Checks run in a fixed order: object shape, `version === 1`, non-blank
 * `profile`, non-empty `lanes` array, per-lane normalization, and finally
 * lane-id uniqueness.
 *
 * @param value - Parsed manifest payload.
 * @returns A manifest containing only the version, trimmed profile, and
 *   normalized lanes.
 * @throws Error describing the first check that fails.
 */
export function normalizeQaConfidenceManifest(value: unknown): QaConfidenceManifest {
  if (!isRecord(value)) {
    throw new Error("confidence manifest must be an object");
  }
  if (value.version !== 1) {
    throw new Error("confidence manifest version must be 1");
  }
  const profile = readString(value.profile);
  if (!profile) {
    throw new Error("confidence manifest missing profile");
  }
  const rawLanes = value.lanes;
  if (!Array.isArray(rawLanes) || rawLanes.length === 0) {
    throw new Error("confidence manifest must include at least one lane");
  }
  // Normalize every lane before looking for duplicates so that a malformed
  // lane is reported ahead of a duplicate id.
  const lanes = rawLanes.map((entry: unknown) => normalizeManifestLane(entry));
  const seenIds = new Set<string>();
  const duplicate = lanes.find((lane) => {
    if (seenIds.has(lane.id)) {
      return true;
    }
    seenIds.add(lane.id);
    return false;
  });
  if (duplicate !== undefined) {
    throw new Error(`confidence manifest duplicate lane id: ${duplicate.id}`);
  }
  return { version: 1, profile, lanes };
}
/**
 * Resolves a lane's artifact reference to a filesystem path.
 *
 * @param artifactRoot - Directory that relative artifact references are
 *   resolved against.
 * @param artifact - Artifact reference from the manifest lane.
 * @returns `artifact` unchanged when it is already absolute; otherwise the
 *   absolute path obtained by resolving it against `artifactRoot`.
 */
function resolveArtifactPath(artifactRoot: string, artifact: string): string {
  if (path.isAbsolute(artifact)) {
    return artifact;
  }
  return path.resolve(artifactRoot, artifact);
}
0) > 0; + if (allEnvironmentBlocked && suiteHasFailures) { + return { + passed: false, + status: "unknown", + details: `gateway log sentinel(s): ${formatGatewayLogSentinelSummary( + gatewayLogSentinels, + )}; suite also reports failures`, + }; + } + const firstBlockingSentinel = + gatewayLogSentinels.find((finding) => finding.verdict !== "environment-blocked") ?? + gatewayLogSentinels[0]; + return { + passed: false, + status: allEnvironmentBlocked ? "blocked" : "fail", + verdict: allEnvironmentBlocked + ? "environment-blocked" + : (firstBlockingSentinel?.verdict ?? "product-bug"), + details: `gateway log sentinel(s): ${formatGatewayLogSentinelSummary(gatewayLogSentinels)}`, + }; + } + if (failedCount !== undefined) { + if (failedCount === 0 && !(totalCount !== undefined && totalCount > 0) && !hasScenarioRows) { + return { + passed: false, + status: "unknown", + details: "qa-suite-summary has no executed scenarios", + }; + } + if (failedScenarios !== undefined && Math.floor(failedCount) !== failedScenarios.length) { + return { + passed: false, + status: "unknown", + details: `qa-suite-summary count/scenario mismatch: counts.failed=${Math.max( + 0, + Math.floor(failedCount), + )}, failed scenarios=${failedScenarios.length}`, + }; + } + const explicitSkippedCount = readNumber(counts?.skipped); + const inferredSkippedCount = + totalCount === undefined || passedCount === undefined + ? undefined + : Math.max(0, Math.floor(totalCount) - Math.floor(passedCount) - Math.floor(failedCount)); + const skippedCount = Math.max( + 0, + ...[explicitSkippedCount, inferredSkippedCount, skippedScenarioCount].filter( + (count): count is number => count !== undefined, + ), + ); + const shouldReportSkippedCount = explicitSkippedCount !== undefined || skippedCount > 0; + const skippedDetails = shouldReportSkippedCount + ? ` counts.skipped=${Math.max(0, Math.floor(skippedCount))}` + : ""; + const totalDetails = + totalCount === undefined ? 
/**
 * Evaluates a generic summary artifact by looking for an explicit pass signal.
 *
 * Signals are consulted in priority order and the first one present wins:
 * 1. boolean `pass`
 * 2. non-blank string `verdict` (passes only when it equals `"pass"`)
 * 3. non-blank string `status` (`pass`/`passed`/`success`/`succeeded` pass;
 *    `fail`/`failed`/`error` fail; anything else is unknown)
 *
 * @param payload - Parsed artifact JSON.
 * @returns The evaluation. `status: "unknown"` is set whenever the artifact
 *   gives no usable signal: it is not an object, exposes none of the fields
 *   above, or carries an unrecognized `status` string.
 */
function evaluatePassSummary(payload: unknown): QaConfidenceLaneEvaluation {
  if (!isRecord(payload)) {
    // A non-object artifact carries no pass signal at all. Mark it unknown, as
    // the other evaluators do for malformed payloads, so that a lane's
    // configured failureVerdict cannot present a corrupt artifact as a
    // classified failure.
    return {
      passed: false,
      status: "unknown",
      details: "summary payload was not an object",
    };
  }
  const pass = readBoolean(payload.pass);
  if (pass !== undefined) {
    return { passed: pass, details: `summary pass=${String(pass)}` };
  }
  const verdict = readString(payload.verdict);
  if (verdict) {
    return { passed: verdict === "pass", details: `summary verdict=${verdict}` };
  }
  const status = readString(payload.status);
  if (status) {
    if (
      status === "pass" ||
      status === "passed" ||
      status === "success" ||
      status === "succeeded"
    ) {
      return { passed: true, details: `summary status=${status}` };
    }
    if (status === "fail" || status === "failed" || status === "error") {
      return { passed: false, details: `summary status=${status}` };
    }
    // Present but unrecognized status text is neither a pass nor a known failure.
    return {
      passed: false,
      status: "unknown",
      details: `summary status=${status}`,
    };
  }
  return {
    passed: false,
    status: "unknown",
    details: "summary did not expose an explicit pass signal",
  };
}
/**
 * Evaluates a JSONL replay summary artifact.
 *
 * The artifact must hold a non-empty `transcripts` array. Each transcript must
 * be an object with a `drift` array whose length equals `userTurnCount` when
 * that count is a finite number. A transcript counts as drifted when it
 * defines `firstDriftAtTurn` or any `drift` entry differs from `"none"`.
 *
 * @param payload - Parsed artifact JSON.
 * @returns An unknown-status evaluation for any structural problem or when no
 *   user turns were replayed; otherwise passes only if no transcript drifted.
 */
function evaluateJsonlReplaySummary(payload: unknown): QaConfidenceLaneEvaluation {
  const unknownOutcome = (details: string): QaConfidenceLaneEvaluation => ({
    passed: false,
    status: "unknown",
    details,
  });

  const transcripts: unknown = isRecord(payload) ? payload.transcripts : undefined;
  if (!Array.isArray(transcripts)) {
    return unknownOutcome("jsonl replay summary missing transcripts array");
  }
  if (transcripts.length === 0) {
    return unknownOutcome("jsonl replay summary has no transcripts");
  }

  let totalUserTurns = 0;
  let driftedTranscripts = 0;
  for (const row of transcripts) {
    if (!isRecord(row)) {
      return unknownOutcome("jsonl replay summary has an invalid transcript row");
    }
    const turnEntries: unknown = row.drift;
    if (!Array.isArray(turnEntries)) {
      return unknownOutcome("jsonl replay transcript missing drift array");
    }
    const declaredTurns = readNumber(row.userTurnCount);
    if (declaredTurns !== undefined && turnEntries.length !== declaredTurns) {
      return unknownOutcome("jsonl replay transcript drift count does not match userTurnCount");
    }
    // Only positive counts contribute; a missing count leaves the total unchanged.
    if (declaredTurns !== undefined && declaredTurns > 0) {
      totalUserTurns += declaredTurns;
    }
    const reportsDrift =
      row.firstDriftAtTurn !== undefined || turnEntries.some((entry) => entry !== "none");
    if (reportsDrift) {
      driftedTranscripts += 1;
    }
  }

  if (totalUserTurns === 0) {
    return unknownOutcome("jsonl replay summary has no replayed user turns");
  }
  return {
    passed: driftedTranscripts === 0,
    details: `jsonl replay turns=${totalUserTurns}, drifted transcripts=${driftedTranscripts}`,
  };
}
/**
 * Builds the lane result used when the lane's artifact file does not exist.
 *
 * @param lane - Manifest lane whose artifact is missing.
 * @param artifactPath - Resolved artifact path, forwarded to `baseLaneResult`.
 * @returns Status `missing` when the lane declares no `missingVerdict`.
 *   Otherwise the declared verdict is reported with status `blocked` for
 *   `environment-blocked` and `fail` for any other verdict, using
 *   `missingReason` as the details when one is provided.
 */
function resultForMissingLane(
  lane: QaConfidenceManifestLane,
  artifactPath: string,
): QaConfidenceLaneResult {
  const base = baseLaneResult(lane, artifactPath);
  const { missingVerdict } = lane;
  if (!missingVerdict) {
    return {
      ...base,
      status: "missing",
      details: "artifact missing and no missingVerdict was configured",
    };
  }
  return {
    ...base,
    status: missingVerdict === "environment-blocked" ? "blocked" : "fail",
    verdict: missingVerdict,
    details: lane.missingReason ?? "artifact missing with explicit missing verdict",
  };
}
/**
 * Builds the lane result for a failure that the evaluator did not classify.
 *
 * @param lane - Manifest lane that failed.
 * @param artifactPath - Resolved artifact path, forwarded to `baseLaneResult`.
 * @param details - Human-readable explanation from the evaluator.
 * @returns Status `fail` carrying the lane's `failureVerdict` when one is
 *   configured; otherwise status `unknown` with no verdict.
 */
function classifiedFailureResult(
  lane: QaConfidenceManifestLane,
  artifactPath: string,
  details: string,
): QaConfidenceLaneResult {
  const { failureVerdict } = lane;
  return {
    ...baseLaneResult(lane, artifactPath),
    status: failureVerdict ? "fail" : "unknown",
    ...(failureVerdict ? { verdict: failureVerdict } : {}),
    details,
  };
}
/**
 * Tallies lane results by status.
 *
 * @param lanes - Lane results to count.
 * @returns Per-status counts plus the total. `unknown` counts lanes whose
 *   status is `unknown` or `missing`, so a missing lane appears in both the
 *   `missing` and `unknown` figures.
 */
function countLaneResults(lanes: readonly QaConfidenceLaneResult[]): QaConfidenceReport["counts"] {
  const tally: Record<QaConfidenceLaneStatus, number> = {
    pass: 0,
    fail: 0,
    blocked: 0,
    missing: 0,
    unknown: 0,
  };
  for (const lane of lanes) {
    tally[lane.status] += 1;
  }
  return {
    total: lanes.length,
    passed: tally.pass,
    failed: tally.fail,
    blocked: tally.blocked,
    missing: tally.missing,
    unknown: tally.unknown + tally.missing,
  };
}
/**
 * Evaluates every lane in the manifest and assembles the QA confidence report.
 *
 * Only lanes marked `required` feed the counts, the failure lists and the
 * pass decision; the returned `lanes` array still contains every lane.
 *
 * Pass decision:
 * - `strictGlobalPass`: pass only when there are no unknown/missing lanes and
 *   no global failures (blocked, missing, unknown, classified fail, or
 *   skipped rows without a passing backfill lane).
 * - `strictZeroUnknowns`: pass only when no required lane is unknown/missing.
 * - otherwise: pass when there are no unclassified failures.
 *
 * @param params.manifest - Manifest whose `lanes` are evaluated.
 * @param params.artifactRoot - Root directory lane artifacts are resolved against.
 * @param params.strictZeroUnknowns - Enables the zero-unknowns gate (only `true` enables it).
 * @param params.strictGlobalPass - Enables the global gate (only `true` enables it).
 * @param params.generatedAt - Timestamp override; defaults to the current ISO time.
 * @returns The assembled report.
 */
export async function buildQaConfidenceReport(params: {
  manifest: QaConfidenceManifest;
  artifactRoot: string;
  strictZeroUnknowns?: boolean;
  strictGlobalPass?: boolean;
  generatedAt?: string;
}): Promise<QaConfidenceReport> {
  // Lane evaluations are independent of each other, so run them concurrently.
  // Promise.all preserves manifest order in the resulting array.
  const evaluatedLanes = await Promise.all(
    params.manifest.lanes.map((lane) => evaluateLane(lane, params.artifactRoot)),
  );
  const lanes = applySkipBackfillState(evaluatedLanes);
  const requiredLanes = lanes.filter((lane) => lane.required);
  const counts = countLaneResults(requiredLanes);
  const unclassifiedFailures = failuresForLaneResults(requiredLanes);
  const globalFailures = globalFailuresForLaneResults(requiredLanes);
  const zeroUnknowns = counts.unknown === 0;
  const globalPass = zeroUnknowns && globalFailures.length === 0;
  const strictZeroUnknowns = params.strictZeroUnknowns === true;
  const strictGlobalPass = params.strictGlobalPass === true;

  // The strictest enabled gate decides the overall verdict.
  let pass: boolean;
  if (strictGlobalPass) {
    pass = globalPass;
  } else if (strictZeroUnknowns) {
    pass = zeroUnknowns;
  } else {
    pass = unclassifiedFailures.length === 0;
  }

  return {
    generatedAt: params.generatedAt ?? new Date().toISOString(),
    profile: params.manifest.profile,
    strictZeroUnknowns,
    strictGlobalPass,
    pass,
    zeroUnknowns,
    globalPass,
    counts,
    failures: strictGlobalPass ? globalFailures : unclassifiedFailures,
    lanes,
  };
}
"unclassified"; +} + +function escapeTableCell(value: string): string { + return value.replace(/\|/gu, "\\|").replace(/\s+/gu, " ").trim(); +} + +export function renderQaConfidenceMarkdownReport(report: QaConfidenceReport): string { + const lines = [ + `# OpenClaw QA Confidence Report - ${report.profile}`, + "", + `- Generated at: ${report.generatedAt}`, + `- Verdict: ${report.pass ? "pass" : "fail"}`, + `- Strict zero unknowns: ${report.strictZeroUnknowns ? "yes" : "no"}`, + `- Strict global pass: ${report.strictGlobalPass ? "yes" : "no"}`, + `- Zero unknowns: ${report.zeroUnknowns ? "yes" : "no"}`, + `- Global pass: ${report.globalPass ? "yes" : "no"}`, + `- Counts: ${report.counts.passed} pass, ${report.counts.failed} classified fail, ${report.counts.blocked} blocked, ${report.counts.unknown} unknown`, + "", + "| Lane | Status | Verdict | Product impact | QA impact | Details |", + "| --- | --- | --- | --- | --- | --- |", + ]; + for (const lane of report.lanes) { + lines.push( + `| ${escapeTableCell(lane.id)} | ${lane.status} | ${formatVerdict(lane)} | ${lane.productImpact ?? ""} | ${lane.qaImpact ?? ""} | ${escapeTableCell(lane.details)} |`, + ); + } + if (report.failures.length > 0) { + lines.push( + "", + report.strictGlobalPass ? 
"## Global Gate Failures" : "## Unclassified Failures", + "", + ); + for (const failure of report.failures) { + lines.push(`- ${failure}`); + } + } + return `${lines.join("\n")}\n`; +} + +function syntheticRuntimeCell( + runtime: RuntimeParityCell["runtime"], + overrides: Partial = {}, +): HarnessRuntimeParityCell { + return { + runtime, + transcriptBytes: JSON.stringify({ message: { role: "assistant", content: "ok" } }), + toolCalls: [], + finalText: "ok", + usage: { + inputTokens: 10, + outputTokens: 5, + totalTokens: 15, + }, + wallClockMs: 10, + bootStateLines: [], + ...overrides, + }; +} + +function syntheticToolCall(overrides: Partial = {}): RuntimeParityToolCall { + return { + tool: "openclaw.synthetic", + argsHash: "args-a", + resultHash: "result-a", + ...overrides, + }; +} + +async function detectRuntimeDrift(params: { + scenarioId: string; + pi: RuntimeParityCell; + codex: RuntimeParityCell; + expectedDrift: RuntimeParityDrift; +}): Promise { + const result = await runRuntimeParityScenario({ + scenarioId: params.scenarioId, + runCell: async (runtime) => ({ + scenarioStatus: "pass", + cell: runtime === "pi" ? 
params.pi : params.codex, + }), + }); + return result.drift === params.expectedDrift; +} + +function syntheticPromptReport( + overrides: Partial = {}, +): RuntimeParitySystemPromptReport { + return { + systemPrompt: { + chars: 100, + projectContextChars: 10, + nonProjectContextChars: 90, + hash: "system-prompt-a", + }, + skills: { + promptChars: 20, + hash: "skills-a", + }, + tools: { + listChars: 30, + schemaChars: 40, + entries: [ + { + name: "openclaw.synthetic", + summaryChars: 12, + summaryHash: "summary-a", + schemaChars: 18, + schemaHash: "schema-a", + propertiesCount: 2, + }, + ], + }, + ...overrides, + }; +} + +function detectHarnessDrift(params: { + leftReport: RuntimeParitySystemPromptReport; + rightReport: RuntimeParitySystemPromptReport; + expectedDrift: HarnessParityDrift; +}): boolean { + const left = buildHarnessParityCell({ + variant: { id: "left", label: "Left" }, + cell: syntheticRuntimeCell("pi", { systemPromptReport: params.leftReport }), + tokenUsageSource: "mock-estimate", + }); + const right = buildHarnessParityCell({ + variant: { id: "right", label: "Right" }, + cell: syntheticRuntimeCell("codex", { systemPromptReport: params.rightReport }), + tokenUsageSource: "mock-estimate", + }); + return ( + buildHarnessParityResult({ + scenarioId: "confidence-self-test", + left, + right, + }).drift === params.expectedDrift + ); +} + +function detectTokenEfficiencyRegression(): boolean { + const pi = syntheticRuntimeCell("pi", { + usage: { inputTokens: 100, outputTokens: 20, totalTokens: 120 }, + }); + const codex = syntheticRuntimeCell("codex", { + usage: { inputTokens: 200, outputTokens: 40, totalTokens: 240 }, + }); + const runtimeParity: RuntimeParityResult = { + scenarioId: "token-efficiency-regression", + cells: { pi, codex }, + drift: "none", + }; + const report = buildTokenEfficiencyReport({ + summary: { + run: { + providerMode: "live-frontier", + runtimePair: ["pi", "codex"], + }, + scenarios: [ + { + name: "token-efficiency-regression", + 
status: "pass", + runtimeParity, + }, + ], + }, + thresholdPercent: 15, + generatedAt: "2026-05-12T00:00:00.000Z", + }); + return !report.pass && report.failures.length === 1; +} + +function detectJsonlReplayDrift(): boolean { + return !evaluateJsonlReplaySummary({ + transcripts: [ + { + transcriptPath: "synthetic.jsonl", + userTurnCount: 2, + drift: ["none", "tool-result-shape"], + firstDriftAtTurn: 2, + }, + ], + }).passed; +} + +export async function buildQaConfidenceSelfTestSummary( + generatedAt = new Date().toISOString(), +): Promise { + const promptDriftDetected = detectHarnessDrift({ + leftReport: syntheticPromptReport(), + rightReport: syntheticPromptReport({ + systemPrompt: { + chars: 100, + projectContextChars: 10, + nonProjectContextChars: 90, + hash: "system-prompt-b", + }, + }), + expectedDrift: "system-prompt", + }); + const toolDescriptionDetected = detectHarnessDrift({ + leftReport: syntheticPromptReport(), + rightReport: syntheticPromptReport({ + tools: { + listChars: 30, + schemaChars: 40, + entries: [ + { + name: "openclaw.synthetic", + summaryChars: 12, + summaryHash: "summary-b", + schemaChars: 18, + schemaHash: "schema-a", + propertiesCount: 2, + }, + ], + }, + }), + expectedDrift: "tool-description", + }); + const toolSchemaDetected = detectHarnessDrift({ + leftReport: syntheticPromptReport(), + rightReport: syntheticPromptReport({ + tools: { + listChars: 30, + schemaChars: 40, + entries: [ + { + name: "openclaw.synthetic", + summaryChars: 12, + summaryHash: "summary-a", + schemaChars: 18, + schemaHash: "schema-b", + propertiesCount: 2, + }, + ], + }, + }), + expectedDrift: "tool-schema", + }); + const runtimeToolCallDropDetected = await detectRuntimeDrift({ + scenarioId: "runtime-tool-call-drop", + pi: syntheticRuntimeCell("pi", { toolCalls: [syntheticToolCall()] }), + codex: syntheticRuntimeCell("codex", { toolCalls: [] }), + expectedDrift: "tool-call-shape", + }); + const toolResultMismatchDetected = await detectRuntimeDrift({ + 
scenarioId: "tool-result-mismatch", + pi: syntheticRuntimeCell("pi", { toolCalls: [syntheticToolCall()] }), + codex: syntheticRuntimeCell("codex", { + toolCalls: [syntheticToolCall({ resultHash: "result-b" })], + }), + expectedDrift: "tool-result-shape", + }); + const failureModeDriftDetected = await detectRuntimeDrift({ + scenarioId: "failure-mode-drift", + pi: syntheticRuntimeCell("pi"), + codex: syntheticRuntimeCell("codex", { transportErrorClass: "synthetic-transport" }), + expectedDrift: "failure-mode", + }); + const canaries: QaConfidenceSelfTestCanary[] = [ + { + id: "prompt-drift", + category: "prompt", + detected: promptDriftDetected, + expectedVerdict: "qa-harness-bug", + details: "synthetic harness prompt hash changed", + }, + { + id: "tool-description-schema-drift", + category: "tool-schema", + detected: toolDescriptionDetected && toolSchemaDetected, + expectedVerdict: "qa-harness-bug", + details: "synthetic tool description/schema hash changed", + }, + { + id: "runtime-tool-call-drop", + category: "tool-call", + detected: runtimeToolCallDropDetected, + expectedVerdict: "product-bug", + details: "synthetic runtime transcript omitted a required tool call", + }, + { + id: "tool-result-mismatch", + category: "tool-result", + detected: toolResultMismatchDetected, + expectedVerdict: "product-bug", + details: "synthetic runtime transcript returned a mismatched tool result", + }, + { + id: "failure-mode-drift", + category: "failure-mode", + detected: failureModeDriftDetected, + expectedVerdict: "product-bug", + details: "synthetic runtime failed with a different failure mode", + }, + { + id: "token-efficiency-regression", + category: "token-efficiency", + detected: detectTokenEfficiencyRegression(), + expectedVerdict: "qa-harness-bug", + details: "synthetic token row exceeded the configured efficiency threshold", + }, + { + id: "jsonl-replay-ordering-drift", + category: "jsonl-replay", + detected: detectJsonlReplayDrift(), + expectedVerdict: "fixture-bug", + 
/**
 * Renders the self-test summary as a Markdown document: a title, the
 * generation time and overall verdict, then one table row per canary.
 *
 * Only the canary `details` column is passed through `escapeTableCell`.
 *
 * @param summary - Self-test summary to render.
 * @returns Markdown text terminated by a single trailing newline.
 */
export function renderQaConfidenceSelfTestMarkdownReport(
  summary: QaConfidenceSelfTestSummary,
): string {
  const verdict = summary.pass ? "pass" : "fail";
  const header = [
    "# OpenClaw QA Confidence Self-Test",
    "",
    `- Generated at: ${summary.generatedAt}`,
    `- Verdict: ${verdict}`,
    "",
    "| Canary | Category | Detected | Expected verdict | Details |",
    "| --- | --- | --- | --- | --- |",
  ];
  const rows = summary.canaries.map((canary) => {
    const detected = canary.detected ? "yes" : "no";
    const details = escapeTableCell(canary.details);
    return `| ${canary.id} | ${canary.category} | ${detected} | ${canary.expectedVerdict} | ${details} |`;
  });
  return `${[...header, ...rows].join("\n")}\n`;
}
HarnessVariant, +} from "./harness-parity.js"; +import type { RuntimeId } from "./runtime-parity.js"; +import type { RuntimeParityComparisonMode } from "./runtime-tool-metadata.js"; + +const LEFT: HarnessVariant = { id: "left", label: "Left", runtime: "pi" }; +const RIGHT: HarnessVariant = { id: "right", label: "Right", runtime: "pi" }; + +const BASE_PROMPT_REPORT = { + systemPrompt: { + chars: 100, + projectContextChars: 40, + nonProjectContextChars: 60, + hash: "system-a", + }, + skills: { + promptChars: 12, + hash: "skills-a", + }, + tools: { + schemaChars: 20, + entries: [ + { + name: "read", + summaryChars: 8, + summaryHash: "summary-a", + schemaChars: 20, + schemaHash: "schema-a", + propertiesCount: 1, + }, + ], + }, +}; + +function makeCell( + runtime: RuntimeId, + overrides: Partial = {}, +): HarnessRuntimeParityCell { + return { + runtime, + transcriptBytes: '{"message":{"role":"assistant","content":"same"}}\n', + toolCalls: [], + finalText: "same", + usage: { inputTokens: 10, outputTokens: 5, totalTokens: 15 }, + wallClockMs: 1, + bootStateLines: [], + systemPromptReport: BASE_PROMPT_REPORT, + ...overrides, + }; +} + +function classify( + left: Partial, + right: Partial, + comparisonMode?: RuntimeParityComparisonMode, +) { + return buildHarnessParityResult({ + scenarioId: "scenario", + left: buildHarnessParityCell({ + variant: LEFT, + cell: makeCell("pi", left), + tokenUsageSource: "live-usage", + }), + right: buildHarnessParityCell({ + variant: RIGHT, + cell: makeCell("pi", right), + tokenUsageSource: "live-usage", + }), + ...(comparisonMode ? 
{ comparisonMode } : {}), + }).drift; +} + +describe("harness parity", () => { + it("classifies prompt and tool surface drift before behavioral drift", () => { + expect( + classify( + {}, + { + systemPromptReport: { + ...BASE_PROMPT_REPORT, + systemPrompt: { chars: 101, projectContextChars: 40, nonProjectContextChars: 61 }, + }, + }, + ), + ).toBe("system-prompt"); + expect( + classify( + {}, + { + systemPromptReport: { + ...BASE_PROMPT_REPORT, + systemPrompt: { + chars: 100, + projectContextChars: 40, + nonProjectContextChars: 60, + hash: "system-b", + }, + }, + }, + ), + ).toBe("system-prompt"); + expect( + classify( + {}, + { + systemPromptReport: { + ...BASE_PROMPT_REPORT, + skills: { promptChars: 12, hash: "skills-b" }, + }, + }, + ), + ).toBe("system-prompt"); + expect( + classify( + {}, + { + systemPromptReport: { + ...BASE_PROMPT_REPORT, + tools: { + schemaChars: 20, + entries: [ + { + name: "read", + summaryChars: 8, + summaryHash: "summary-b", + schemaChars: 20, + schemaHash: "schema-a", + propertiesCount: 1, + }, + ], + }, + }, + }, + ), + ).toBe("tool-description"); + expect( + classify( + {}, + { + systemPromptReport: { + ...BASE_PROMPT_REPORT, + tools: { + schemaChars: 20, + entries: [ + { + name: "read", + summaryChars: 8, + summaryHash: "summary-a", + schemaChars: 20, + schemaHash: "schema-b", + propertiesCount: 1, + }, + ], + }, + }, + }, + ), + ).toBe("tool-schema"); + }); + + it("classifies behavioral harness drift", () => { + expect( + classify( + { toolCalls: [{ tool: "read", argsHash: "a", resultHash: "r" }] }, + { toolCalls: [{ tool: "read", argsHash: "b", resultHash: "r" }] }, + ), + ).toBe("tool-call-shape"); + expect( + classify( + { toolCalls: [{ tool: "read", argsHash: "a", resultHash: "r1" }] }, + { toolCalls: [{ tool: "read", argsHash: "a", resultHash: "r2" }] }, + ), + ).toBe("tool-result-shape"); + expect(classify({ finalText: "same text" }, { finalText: "different text" })).toBe("text-only"); + expect( + classify( + { + 
transcriptBytes: + '{"type":"model_change","modelId":"gpt-5.5"}\n' + + '{"type":"thinking_level_change","thinkingLevel":"off"}\n' + + '{"type":"custom","customType":"model-snapshot"}\n' + + '{"message":{"role":"assistant","content":"same"}}\n', + }, + { transcriptBytes: '{"message":{"role":"assistant","content":"same"}}\n' }, + ), + ).toBe("none"); + expect( + classify( + { transcriptBytes: '{"message":{"role":"assistant"}}\n' }, + { transcriptBytes: '{"message":{"role":"assistant"}}\n{"message":{"role":"tool"}}\n' }, + ), + ).toBe("structural"); + expect( + classify( + { transcriptBytes: '{"role":"assistant","content":"same"}\n' }, + { + transcriptBytes: + '{"role":"assistant","content":"same"}\n{"role":"tool","content":"same"}\n', + }, + ), + ).toBe("structural"); + expect(classify({ runtimeErrorClass: "timeout" }, {})).toBe("failure-mode"); + }); + + it("honors native workspace comparison mode for outcome-only harness proofs", () => { + expect( + classify( + { + transcriptBytes: + '{"message":{"role":"assistant","content":"same"}}\n' + + '{"message":{"role":"tool","content":"same result"}}\n', + toolCalls: [{ tool: "bash", argsHash: "sed-160", resultHash: "same-result" }], + }, + { + transcriptBytes: '{"message":{"role":"assistant","content":"same"}}\n', + toolCalls: [{ tool: "bash", argsHash: "sed-200", resultHash: "same-result" }], + }, + "codex-native-workspace", + ), + ).toBe("none"); + + expect( + classify( + { toolCalls: [{ tool: "bash", argsHash: "a", resultHash: "r1" }] }, + { toolCalls: [{ tool: "bash", argsHash: "b", resultHash: "r2" }] }, + "outcome-only", + ), + ).toBe("none"); + }); + + it("keeps prompt and tool surface checks strict under native workspace comparison mode", () => { + expect( + classify( + {}, + { + systemPromptReport: { + ...BASE_PROMPT_REPORT, + systemPrompt: { chars: 101, projectContextChars: 40, nonProjectContextChars: 61 }, + }, + toolCalls: [{ tool: "bash", argsHash: "changed", resultHash: "changed" }], + }, + 
"codex-native-workspace", + ), + ).toBe("system-prompt"); + expect( + classify( + {}, + { + systemPromptReport: { + ...BASE_PROMPT_REPORT, + tools: { + schemaChars: 20, + entries: [{ name: "read", summaryChars: 9, schemaChars: 20, propertiesCount: 1 }], + }, + }, + toolCalls: [{ tool: "bash", argsHash: "changed", resultHash: "changed" }], + }, + "outcome-only", + ), + ).toBe("tool-description"); + }); + + it("labels mock token estimates separately from live usage", () => { + const sourceCell = makeCell("pi", { + usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }, + }); + const cell = buildHarnessParityCell({ + variant: LEFT, + cell: sourceCell, + tokenUsageSource: "mock-estimate", + }); + const inputChars = 100 + 12 + 8 + 20 + sourceCell.transcriptBytes.length; + + expect(cell.tokenUsageSource).toBe("mock-estimate"); + expect(cell.tokenUsage.totalTokens).toBeGreaterThan(0); + expect(cell.tokenUsage.inputTokens).toBe(Math.ceil(inputChars / 4)); + expect(cell.promptStats.toolCount).toBe(1); + }); +}); diff --git a/extensions/qa-lab/src/harness-parity.ts b/extensions/qa-lab/src/harness-parity.ts new file mode 100644 index 000000000000..bc55ae4f1853 --- /dev/null +++ b/extensions/qa-lab/src/harness-parity.ts @@ -0,0 +1,491 @@ +import { createHash } from "node:crypto"; +import type { + RuntimeId, + RuntimeParityCell, + RuntimeParityDrift, + RuntimeParityToolCall, + RuntimeParityUsage, +} from "./runtime-parity.js"; +import type { RuntimeParityComparisonMode } from "./runtime-tool-metadata.js"; + +export type HarnessVariant = { + id: string; + label: string; + runtime?: RuntimeId; + model?: string; + configPatch?: Record; + systemPromptOverlay?: string; + toolDescriptionOverlay?: Record; +}; + +export type HarnessParityDrift = + | RuntimeParityDrift + | "system-prompt" + | "tool-description" + | "tool-schema"; + +export type HarnessParityPromptStats = { + systemPromptChars: number; + projectContextChars: number; + nonProjectContextChars: number; + 
/**
 * Returns the SHA-256 digest of `value` as a lowercase hexadecimal string.
 *
 * @param value - Text to hash.
 */
function sha256(value: string) {
  const hasher = createHash("sha256");
  hasher.update(value);
  return hasher.digest("hex");
}
/**
 * Recursively rewrites a JSON-like value so that object keys appear in a
 * canonical order, making the serialized form independent of the order in
 * which keys were inserted.
 *
 * Arrays keep their element order (each element is normalized); primitives
 * and `null` are returned unchanged.
 *
 * @param value - Any JSON-like value.
 * @returns A structurally equal value whose objects have sorted keys.
 */
function normalizeForStableHash(value: unknown): unknown {
  if (Array.isArray(value)) {
    return value.map((entry) => normalizeForStableHash(entry));
  }
  if (value && typeof value === "object") {
    const record = value as Record<string, unknown>;
    // Default sort() orders by UTF-16 code units. Unlike localeCompare, that
    // ordering does not depend on the runtime's default locale, so the
    // resulting hash is the same on every machine. Object.keys returns a
    // fresh array, so sorting in place does not touch the input.
    const orderedKeys = Object.keys(record).sort();
    return Object.fromEntries(
      orderedKeys.map((key) => [key, normalizeForStableHash(record[key])]),
    );
  }
  return value;
}
/**
 * Compares two tool-call sequences by shape: same number of calls, and at
 * each position the same tool name and the same argument hash. Result hashes
 * are not looked at here.
 *
 * @param left - Tool calls from the left variant.
 * @param right - Tool calls from the right variant.
 * @returns A description of the first difference, or `undefined` when the
 *   shapes match.
 */
function compareToolCallShape(left: RuntimeParityToolCall[], right: RuntimeParityToolCall[]) {
  if (left.length !== right.length) {
    return `tool call count differs (${left.length} vs ${right.length})`;
  }
  for (const [position, leftCall] of left.entries()) {
    const rightCall = right[position];
    const row = position + 1;
    if (!(leftCall && rightCall)) {
      return `tool call row ${row} missing`;
    }
    const sameTool = leftCall.tool === rightCall.tool;
    const sameArgs = leftCall.argsHash === rightCall.argsHash;
    if (!(sameTool && sameArgs)) {
      return `tool call ${row} differs (${leftCall.tool}/${leftCall.argsHash} vs ${rightCall.tool}/${rightCall.argsHash})`;
    }
  }
  return undefined;
}
/**
 * Finds the first transcript line at which two transcripts differ.
 *
 * Each transcript is trimmed and split on LF or CRLF; a transcript that is
 * empty after trimming has no lines. A line missing on one side is compared
 * as the empty string.
 *
 * @param leftTranscript - Raw transcript text of the left variant.
 * @param rightTranscript - Raw transcript text of the right variant.
 * @returns The 1-based index of the first differing line, or `undefined`
 *   when no line differs.
 */
function firstDriftTurn(leftTranscript: string, rightTranscript: string): number | undefined {
  const toRows = (transcript: string): string[] => {
    const body = transcript.trim();
    return body === "" ? [] : body.split(/\r?\n/u);
  };
  const leftRows = toRows(leftTranscript);
  const rightRows = toRows(rightTranscript);
  // Walk the longer side so trailing extra lines are compared against "".
  const longest = leftRows.length >= rightRows.length ? leftRows : rightRows;
  const driftIndex = longest.findIndex(
    (_row, position) => (leftRows[position] ?? "") !== (rightRows[position] ?? ""),
  );
  return driftIndex === -1 ? undefined : driftIndex + 1;
}
null, + }), + toolDescriptionHash: stableHash( + toolEntries.map((entry) => { + return { + name: entry.name, + summary: entry.summary, + summaryHash: entry.summaryHash, + summaryChars: entry.summaryChars, + }; + }), + ), + toolSchemaHash: stableHash({ + listChars: report?.tools?.listChars, + schemaChars: report?.tools?.schemaChars, + entries: toolEntries.map((entry) => { + return { + name: entry.name, + schema: entry.schema, + schemaHash: entry.schemaHash, + schemaChars: entry.schemaChars, + propertiesCount: entry.propertiesCount, + }; + }), + }), + tokenUsage, + tokenUsageSource: params.tokenUsageSource, + }; +} + +export function buildHarnessParityResult(params: { + scenarioId: string; + left: HarnessParityCell; + right: HarnessParityCell; + comparisonMode?: RuntimeParityComparisonMode; +}): HarnessParityResult { + const promptDelta = { + systemPromptChars: + params.right.promptStats.systemPromptChars - params.left.promptStats.systemPromptChars, + projectContextChars: + params.right.promptStats.projectContextChars - params.left.promptStats.projectContextChars, + skillPromptChars: + params.right.promptStats.skillPromptChars - params.left.promptStats.skillPromptChars, + toolSummaryChars: + params.right.promptStats.toolSummaryChars - params.left.promptStats.toolSummaryChars, + toolSchemaChars: + params.right.promptStats.toolSchemaChars - params.left.promptStats.toolSchemaChars, + toolCount: params.right.promptStats.toolCount - params.left.promptStats.toolCount, + }; + const tokenDeltaPercent = + params.left.tokenUsage.totalTokens === 0 + ? params.right.tokenUsage.totalTokens === 0 + ? 0 + : 100 + : ((params.right.tokenUsage.totalTokens - params.left.tokenUsage.totalTokens) / + params.left.tokenUsage.totalTokens) * + 100; + const failDetails = + params.left.transportErrorClass || params.right.transportErrorClass + ? "at least one harness variant hit a transport failure" + : params.left.runtimeErrorClass || params.right.runtimeErrorClass + ? 
"at least one harness variant hit a runtime failure" + : undefined; + if (failDetails) { + return { + scenarioId: params.scenarioId, + left: params.left, + right: params.right, + drift: "failure-mode", + driftDetails: failDetails, + promptDelta, + tokenDeltaPercent, + firstDriftTurn: firstDriftTurn(params.left.transcriptBytes, params.right.transcriptBytes), + }; + } + if (params.left.systemPromptHash !== params.right.systemPromptHash) { + return { + scenarioId: params.scenarioId, + left: params.left, + right: params.right, + drift: "system-prompt", + driftDetails: "system prompt report differs", + promptDelta, + tokenDeltaPercent, + firstDriftTurn: firstDriftTurn(params.left.transcriptBytes, params.right.transcriptBytes), + }; + } + if (params.left.toolDescriptionHash !== params.right.toolDescriptionHash) { + return { + scenarioId: params.scenarioId, + left: params.left, + right: params.right, + drift: "tool-description", + driftDetails: "tool description summary shape differs", + promptDelta, + tokenDeltaPercent, + firstDriftTurn: firstDriftTurn(params.left.transcriptBytes, params.right.transcriptBytes), + }; + } + if (params.left.toolSchemaHash !== params.right.toolSchemaHash) { + return { + scenarioId: params.scenarioId, + left: params.left, + right: params.right, + drift: "tool-schema", + driftDetails: "tool schema shape differs", + promptDelta, + tokenDeltaPercent, + firstDriftTurn: firstDriftTurn(params.left.transcriptBytes, params.right.transcriptBytes), + }; + } + const compareToolShapes = + params.comparisonMode !== "codex-native-workspace" && params.comparisonMode !== "outcome-only"; + const compareTranscriptStructure = + params.comparisonMode !== "codex-native-workspace" && params.comparisonMode !== "outcome-only"; + + if (compareToolShapes) { + const toolCallDrift = compareToolCallShape(params.left.toolCalls, params.right.toolCalls); + if (toolCallDrift) { + return { + scenarioId: params.scenarioId, + left: params.left, + right: params.right, + drift: 
"tool-call-shape", + driftDetails: toolCallDrift, + promptDelta, + tokenDeltaPercent, + firstDriftTurn: firstDriftTurn(params.left.transcriptBytes, params.right.transcriptBytes), + }; + } + const toolResultDrift = compareToolResultShape(params.left.toolCalls, params.right.toolCalls); + if (toolResultDrift) { + return { + scenarioId: params.scenarioId, + left: params.left, + right: params.right, + drift: "tool-result-shape", + driftDetails: toolResultDrift, + promptDelta, + tokenDeltaPercent, + firstDriftTurn: firstDriftTurn(params.left.transcriptBytes, params.right.transcriptBytes), + }; + } + } + const leftTranscriptRecords = countComparableTranscriptRecords(params.left.transcriptBytes); + const rightTranscriptRecords = countComparableTranscriptRecords(params.right.transcriptBytes); + if ( + compareTranscriptStructure && + (leftTranscriptRecords !== rightTranscriptRecords || + (!params.left.finalText && !!params.right.finalText) || + (!!params.left.finalText && !params.right.finalText)) + ) { + return { + scenarioId: params.scenarioId, + left: params.left, + right: params.right, + drift: "structural", + driftDetails: `transcript/final-text structure differs (${leftTranscriptRecords} message records vs ${rightTranscriptRecords} message records)`, + promptDelta, + tokenDeltaPercent, + firstDriftTurn: firstDriftTurn(params.left.transcriptBytes, params.right.transcriptBytes), + }; + } + if ( + normalizeTextForParity(params.left.finalText) !== normalizeTextForParity(params.right.finalText) + ) { + return { + scenarioId: params.scenarioId, + left: params.left, + right: params.right, + drift: "text-only", + driftDetails: "final text differs after whitespace normalization", + promptDelta, + tokenDeltaPercent, + firstDriftTurn: firstDriftTurn(params.left.transcriptBytes, params.right.transcriptBytes), + }; + } + return { + scenarioId: params.scenarioId, + left: params.left, + right: params.right, + drift: "none", + promptDelta, + tokenDeltaPercent, + }; +} + +function 
formatPercent(value: number) { + const normalized = Math.abs(value) < 0.05 ? 0 : value; + const prefix = normalized > 0 ? "+" : ""; + return `${prefix}${normalized.toFixed(1)}%`; +} + +export function renderHarnessParityMarkdownReport(report: HarnessParityReport): string { + const lines = [ + `# OpenClaw Harness Parity - ${report.left.label} vs ${report.right.label}`, + "", + `- Generated at: ${report.generatedAt}`, + `- Provider mode: ${report.providerMode}`, + `- Verdict: ${report.pass ? "pass" : "fail"}`, + "", + "| Scenario | Drift | First drift turn | Token delta | Prompt chars delta | Tool count delta | Details |", + "| --- | --- | ---: | ---: | ---: | ---: | --- |", + ]; + + for (const result of report.results) { + lines.push( + `| ${result.scenarioId} | ${result.drift} | ${result.firstDriftTurn ?? ""} | ${formatPercent( + result.tokenDeltaPercent, + )} | ${result.promptDelta.systemPromptChars} | ${result.promptDelta.toolCount} | ${ + result.driftDetails ?? "" + } |`, + ); + } + + if (report.failures.length > 0) { + lines.push("", "## Gate Failures", ""); + for (const failure of report.failures) { + lines.push(`- ${failure}`); + } + } + + return `${lines.join("\n").trimEnd()}\n`; +} diff --git a/src/agents/system-prompt-report.test.ts b/src/agents/system-prompt-report.test.ts index 0a98a032a9b7..4f5095c6503d 100644 --- a/src/agents/system-prompt-report.test.ts +++ b/src/agents/system-prompt-report.test.ts @@ -144,4 +144,76 @@ describe("buildSystemPromptReport", () => { expect(report.systemPrompt.projectContextChars).toBe(0); expect(report.systemPrompt.nonProjectContextChars).toBe("custom override".length); }); + + it("emits content hashes for prompt and tool parity checks", () => { + const file = makeBootstrapFile({ path: "/tmp/workspace/AGENTS.md" }); + const report = buildSystemPromptReport({ + source: "run", + generatedAt: 0, + bootstrapMaxChars: 20_000, + systemPrompt: "system", + bootstrapFiles: [file], + injectedFiles: [], + skillsPrompt: "docs", + 
tools: [ + { + name: "read", + description: "Read files", + parameters: { + type: "object", + properties: { path: { type: "string" } }, + }, + }, + ] as never, + }); + const sameLengthChangedPrompt = buildSystemPromptReport({ + source: "run", + generatedAt: 0, + bootstrapMaxChars: 20_000, + systemPrompt: "systen", + bootstrapFiles: [file], + injectedFiles: [], + skillsPrompt: "docs", + tools: [], + }); + + expect(report.systemPrompt.hash).toMatch(/^[a-f0-9]{64}$/u); + expect(report.skills.hash).toMatch(/^[a-f0-9]{64}$/u); + expect(report.tools.entries[0]?.summaryHash).toMatch(/^[a-f0-9]{64}$/u); + expect(report.tools.entries[0]?.schemaHash).toMatch(/^[a-f0-9]{64}$/u); + expect(sameLengthChangedPrompt.systemPrompt.hash).not.toBe(report.systemPrompt.hash); + }); + + it("keeps reporting when a tool schema cannot be stringified", () => { + const file = makeBootstrapFile({ path: "/tmp/workspace/AGENTS.md" }); + const circularSchema: Record = { + type: "object", + properties: { count: { type: "integer" } }, + }; + circularSchema.self = circularSchema; + + const report = buildSystemPromptReport({ + source: "run", + generatedAt: 0, + bootstrapMaxChars: 20_000, + systemPrompt: "system", + bootstrapFiles: [file], + injectedFiles: [], + skillsPrompt: "", + tools: [ + { + name: "broken", + description: "Broken schema", + parameters: circularSchema, + }, + ] as never, + }); + + expect(report.tools.entries[0]).toMatchObject({ + name: "broken", + schemaChars: 0, + propertiesCount: 1, + }); + expect(report.tools.entries[0]?.schemaHash).toMatch(/^[a-f0-9]{64}$/u); + }); }); diff --git a/src/agents/system-prompt-report.ts b/src/agents/system-prompt-report.ts index 2792ec34db57..d888c74c9602 100644 --- a/src/agents/system-prompt-report.ts +++ b/src/agents/system-prompt-report.ts @@ -1,3 +1,4 @@ +import { createHash } from "node:crypto"; import type { AgentTool } from "@earendil-works/pi-agent-core"; import type { SessionSystemPromptReport } from "../config/sessions/types.js"; import 
{ buildBootstrapInjectionStats } from "./bootstrap-budget.js"; @@ -9,9 +10,47 @@ type ToolReportEntry = SessionSystemPromptReport["tools"]["entries"][number]; const toolReportEntryCache = new WeakMap(); const toolSchemaStatsCache = new WeakMap< object, - Pick + Pick >(); +function sha256(value: string): string { + return createHash("sha256").update(value).digest("hex"); +} + +function normalizeForStableHash(value: unknown, seen = new WeakSet()): unknown { + if (typeof value === "bigint") { + return `${value.toString()}n`; + } + if (value && typeof value === "object") { + if (seen.has(value)) { + return "[Circular]"; + } + seen.add(value); + if (Array.isArray(value)) { + const normalized = value.map((entry) => normalizeForStableHash(entry, seen)); + seen.delete(value); + return normalized; + } + const record = value as Record; + const normalized = Object.fromEntries( + Object.keys(record) + .toSorted((left, right) => left.localeCompare(right)) + .map((key) => [key, normalizeForStableHash(record[key], seen)]), + ); + seen.delete(value); + return normalized; + } + return value; +} + +function stableJsonHash(value: unknown): string { + try { + return sha256(JSON.stringify(normalizeForStableHash(value)) ?? 
"null"); + } catch { + return sha256("[unserializable]"); + } +} + function extractBetween(input: string, startMarker: string, endMarker: string): string { const start = input.indexOf(startMarker); if (start === -1) { @@ -39,9 +78,9 @@ function parseSkillBlocks(skillsPrompt: string): Array<{ name: string; blockChar function buildToolSchemaStats( parameters: AgentTool["parameters"], -): Pick { +): Pick { if (!parameters || typeof parameters !== "object") { - return { schemaChars: 0, propertiesCount: null }; + return { schemaChars: 0, schemaHash: stableJsonHash(null), propertiesCount: null }; } const cached = toolSchemaStatsCache.get(parameters); if (cached) { @@ -55,6 +94,7 @@ function buildToolSchemaStats( return 0; } })(), + schemaHash: stableJsonHash(parameters), propertiesCount: (() => { const schema = parameters as Record; const props = typeof schema.properties === "object" ? schema.properties : null; @@ -78,7 +118,7 @@ function buildToolsEntries(tools: AgentTool[]): SessionSystemPromptReport["tools const summary = tool.description?.trim() || tool.label?.trim() || ""; const summaryChars = summary.length; const schemaStats = buildToolSchemaStats(tool.parameters); - const entry = { name, summaryChars, ...schemaStats }; + const entry = { name, summaryChars, summaryHash: sha256(summary), ...schemaStats }; toolReportEntryCache.set(tool, entry); return entry; }); @@ -129,6 +169,7 @@ export function buildSystemPromptReport(params: { chars: systemPromptChars, projectContextChars, nonProjectContextChars: Math.max(0, systemPromptChars - projectContextChars), + hash: sha256(params.systemPrompt), }, ...(params.currentTurn ? 
{ currentTurn: params.currentTurn } : {}), injectedWorkspaceFiles: buildBootstrapInjectionStats({ @@ -137,6 +178,7 @@ export function buildSystemPromptReport(params: { }), skills: { promptChars: params.skillsPrompt.length, + hash: sha256(params.skillsPrompt), entries: skillsEntries, }, tools: { diff --git a/src/config/sessions/types.ts b/src/config/sessions/types.ts index cacaddbb7880..53b93121f90d 100644 --- a/src/config/sessions/types.ts +++ b/src/config/sessions/types.ts @@ -644,6 +644,7 @@ export type SessionSystemPromptReport = { chars: number; projectContextChars: number; nonProjectContextChars: number; + hash?: string; }; currentTurn?: { kind?: "user_request" | "room_event"; @@ -660,6 +661,7 @@ export type SessionSystemPromptReport = { }>; skills: { promptChars: number; + hash?: string; entries: Array<{ name: string; blockChars: number }>; }; tools: { @@ -668,7 +670,9 @@ export type SessionSystemPromptReport = { entries: Array<{ name: string; summaryChars: number; + summaryHash?: string; schemaChars: number; + schemaHash?: string; propertiesCount?: number | null; }>; };