test(qa-lab): add runtime confidence reports

This commit is contained in:
Vincent Koc
2026-05-25 20:31:03 +01:00
parent cda7c30150
commit f6a49a4e8a
12 changed files with 3336 additions and 6 deletions

View File

@@ -2844,11 +2844,17 @@ describe("runCodexAppServerAttempt", () => {
expect(report?.provider).toBe("codex");
expect(report?.model).toBe("gpt-5.4-codex");
expect(report?.systemPrompt.chars).toBeGreaterThan(0);
expect(report?.systemPrompt.hash).toMatch(/^[a-f0-9]{64}$/u);
expect(report?.skills.hash).toMatch(/^[a-f0-9]{64}$/u);
const message = report?.tools.entries.find((tool) => tool.name === "message");
const webSearch = report?.tools.entries.find((tool) => tool.name === "web_search");
expect(message?.schemaChars).toBeGreaterThan(0);
expect(message?.summaryHash).toMatch(/^[a-f0-9]{64}$/u);
expect(message?.schemaHash).toMatch(/^[a-f0-9]{64}$/u);
expect(webSearch?.schemaChars).toBe(0);
expect(webSearch?.summaryHash).toMatch(/^[a-f0-9]{64}$/u);
expect(webSearch?.schemaHash).toMatch(/^[a-f0-9]{64}$/u);
expect(report?.tools.schemaChars).toBe(message?.schemaChars);
});
@@ -6574,7 +6580,8 @@ describe("runCodexAppServerAttempt", () => {
input?: Array<{ text?: string }>;
};
expect(turnStartParams.input?.[0]?.text).toBe(exactCommand);
expect(result.systemPromptReport?.skills).toEqual({ promptChars: 0, entries: [] });
expect(result.systemPromptReport?.skills).toMatchObject({ promptChars: 0, entries: [] });
expect(result.systemPromptReport?.skills.hash).toMatch(/^[a-f0-9]{64}$/u);
});
it("fires llm_input, llm_output, and agent_end hooks for codex turns", async () => {

View File

@@ -5095,6 +5095,7 @@ function buildCodexSystemPromptReport(params: {
chars: params.developerInstructions.length,
projectContextChars: 0,
nonProjectContextChars: params.developerInstructions.length,
hash: sha256Text(params.developerInstructions),
},
injectedWorkspaceFiles: buildCodexBootstrapInjectionStats({
bootstrapFiles: params.workspaceBootstrapContext.bootstrapFiles,
@@ -5106,6 +5107,7 @@ function buildCodexSystemPromptReport(params: {
}),
skills: {
promptChars: skillsPrompt.length,
hash: sha256Text(skillsPrompt),
entries: buildCodexSkillReportEntries(skillsPrompt),
},
tools: {
@@ -5137,20 +5139,23 @@ function buildCodexToolReportEntry(tool: CodexDynamicToolSpec): CodexToolReportE
return {
name: tool.name,
summaryChars: summary.length,
summaryHash: sha256Text(summary),
schemaChars: 0,
schemaHash: stableJsonHash(null),
propertiesCount: null,
};
}
return {
name: tool.name,
summaryChars: summary.length,
summaryHash: sha256Text(summary),
...buildCodexToolSchemaStats(tool.inputSchema),
};
}
function buildCodexToolSchemaStats(
schema: JsonValue,
): Pick<CodexToolReportEntry, "schemaChars" | "propertiesCount"> {
): Pick<CodexToolReportEntry, "schemaChars" | "schemaHash" | "propertiesCount"> {
const schemaChars = (() => {
try {
return JSON.stringify(schema).length;
@@ -5162,10 +5167,34 @@ function buildCodexToolSchemaStats(
isJsonObject(schema) && isJsonObject(schema.properties) ? schema.properties : null;
return {
schemaChars,
schemaHash: stableJsonHash(schema),
propertiesCount: properties ? Object.keys(properties).length : null,
};
}
/**
 * Computes the SHA-256 digest of a string.
 *
 * @param value Text to hash (Node hashes string input as UTF-8 by default).
 * @returns The digest as a 64-character lowercase hex string.
 */
function sha256Text(value: string): string {
  const hasher = createHash("sha256");
  hasher.update(value);
  return hasher.digest("hex");
}
/**
 * Recursively rebuilds `value` so that every object level lists its keys in a
 * deterministic order, which makes the `JSON.stringify` output independent of
 * the insertion order of the input.
 *
 * - Arrays keep their element order; only their elements are normalized.
 * - Objects are rebuilt with their own enumerable string keys sorted.
 * - Primitives, `null`, and `undefined` are returned unchanged.
 *
 * Keys are ordered by UTF-16 code unit (the default `Array.prototype.sort`
 * comparison) instead of `localeCompare`: locale collation depends on the
 * host's default locale, so the same schema could otherwise produce different
 * hashes on different machines.
 *
 * @param value Any JSON-like value.
 * @returns A structurally equal value with deterministically ordered keys.
 */
function normalizeForStableHash(value: unknown): unknown {
  if (Array.isArray(value)) {
    return value.map((entry) => normalizeForStableHash(entry));
  }
  if (value !== null && typeof value === "object") {
    const record = value as Record<string, unknown>;
    // Object.keys returns a fresh array, so sorting it in place is safe.
    const sortedKeys = Object.keys(record).sort();
    return Object.fromEntries(
      sortedKeys.map((key): [string, unknown] => [key, normalizeForStableHash(record[key])]),
    );
  }
  return value;
}
/**
 * Hashes a JSON value so that objects differing only in key insertion order
 * produce the same digest.
 *
 * @param value JSON value to fingerprint.
 * @returns SHA-256 hex digest of the key-normalized JSON serialization.
 */
function stableJsonHash(value: JsonValue): string {
  const canonicalJson = JSON.stringify(normalizeForStableHash(value));
  // JSON.stringify yields undefined for inputs without a JSON form; hash the
  // literal text "null" in that case so a digest is always produced.
  return sha256Text(canonicalJson ?? "null");
}
function buildCodexBootstrapInjectionStats(params: {
bootstrapFiles: CodexBootstrapFile[];
injectedFiles: EmbeddedContextFile[];

View File

@@ -0,0 +1,168 @@
{
"version": 1,
"profile": "codex-100",
"lanes": [
{
"id": "tool-defaults-direct",
"title": "Tool-defaults direct runtime parity",
"kind": "qa-suite-summary",
"artifact": "tool-defaults-direct/qa-suite-summary.json",
"required": true,
"productImpact": "P2",
"qaImpact": "P0",
"issue": "https://github.com/openclaw/openclaw/issues/80319",
"ownerAction": "Fix product or harness before claiming the tool-defaults gate is trusted.",
"labels": ["qa-lab", "runtime-parity", "codex"]
},
{
"id": "openclaw-dynamic-tools-direct",
"title": "OpenClaw dynamic integration tools direct runtime parity",
"kind": "qa-suite-summary",
"artifact": "openclaw-dynamic-tools-direct/qa-suite-summary.json",
"required": true,
"productImpact": "P1",
"qaImpact": "P0",
"issue": "https://github.com/openclaw/openclaw/issues/80319",
"ownerAction": "Investigate any hard failure as an OpenClaw dynamic integration or QA loading regression.",
"labels": ["qa-lab", "runtime-parity", "openclaw-dynamic-tools"]
},
{
"id": "tool-defaults-searchable",
"title": "Tool-defaults searchable runtime parity",
"kind": "qa-suite-summary",
"artifact": "tool-defaults-searchable/qa-suite-summary.json",
"required": true,
"failureVerdict": "mock-limitation",
"skipBackfillLane": "openclaw-dynamic-tools-searchable-live",
"productImpact": "P4",
"qaImpact": "P2",
"issue": "https://github.com/openclaw/openclaw/issues/80319",
"ownerAction": "Keep as report-only until searchable/deferred tool modeling has no mock-only ambiguity.",
"labels": ["qa-lab", "runtime-parity", "searchable-tools"]
},
{
"id": "first-hour-20-direct",
"title": "First-hour 20-turn direct runtime parity",
"kind": "qa-suite-summary",
"artifact": "first-hour-20-direct/qa-suite-summary.json",
"required": true,
"skipBackfillLane": "codex-native-live",
"productImpact": "P1",
"qaImpact": "P0",
"ownerAction": "Triage row-by-row; do not file product bugs unless live/native proof reproduces.",
"labels": ["qa-lab", "runtime-parity", "first-hour"]
},
{
"id": "mock-token-efficiency",
"title": "Mock assistant-message token efficiency estimate",
"kind": "token-efficiency-summary",
"artifact": "first-hour-20-direct-report/qa-runtime-token-efficiency-summary.json",
"required": true,
"expectedTokenUsageSource": "mock-estimate",
"productImpact": "P4",
"qaImpact": "P1",
"ownerAction": "Fix labeling before trusting token-efficiency comparisons.",
"labels": ["qa-lab", "runtime-parity", "token-efficiency"]
},
{
"id": "fault-injection-mock",
"title": "Mock fault-injection runtime parity",
"kind": "qa-suite-summary",
"artifact": "fault-injection-mock/qa-suite-summary.json",
"required": true,
"skipBackfillLane": "codex-native-live",
"productImpact": "P2",
"qaImpact": "P0",
"ownerAction": "Treat failures as retry/recovery regressions unless evidence shows fixture drift.",
"labels": ["qa-lab", "runtime-parity", "fault-injection"]
},
{
"id": "jsonl-expanded",
"title": "Expanded curated JSONL replay",
"kind": "jsonl-replay-summary",
"artifact": "jsonl-expanded/qa-jsonl-replay-summary.json",
"required": true,
"productImpact": "P2",
"qaImpact": "P0",
"ownerAction": "Inspect first drift turn and transcript class before filing any product issue.",
"labels": ["qa-lab", "runtime-parity", "jsonl-replay"]
},
{
"id": "confidence-self-test",
"title": "Seeded confidence negative controls",
"kind": "self-test-summary",
"artifact": "confidence-self-test/qa-confidence-self-test-summary.json",
"required": true,
"productImpact": "P4",
"qaImpact": "P0",
"ownerAction": "Fix the harness before trusting any green parity result.",
"labels": ["qa-lab", "confidence-gate", "negative-controls"]
},
{
"id": "codex-native-live",
"title": "Codex-native live workspace capability proof",
"kind": "qa-suite-summary",
"artifact": "codex-native-live/qa-suite-summary.json",
"required": true,
"missingVerdict": "environment-blocked",
"missingReason": "Live/OAuth runner or OpenAI credentials were unavailable for this proof bundle.",
"productImpact": "P1",
"qaImpact": "P1",
"ownerAction": "Run with live-frontier OAuth before using this lane as product proof.",
"labels": ["qa-lab", "runtime-parity", "live-proof"]
},
{
"id": "first-hour-live",
"title": "Live first-hour capability proof",
"kind": "qa-suite-summary",
"artifact": "first-hour-live/qa-suite-summary.json",
"required": true,
"missingVerdict": "environment-blocked",
"missingReason": "Live/OAuth runner or OpenAI credentials were unavailable for this proof bundle.",
"productImpact": "P1",
"qaImpact": "P1",
"ownerAction": "Run with live-frontier OAuth before claiming live first-hour coverage.",
"labels": ["qa-lab", "runtime-parity", "live-proof"]
},
{
"id": "openclaw-dynamic-tools-searchable-live",
"title": "Live OpenClaw dynamic tools searchable proof",
"kind": "qa-suite-summary",
"artifact": "openclaw-dynamic-tools-searchable-live/qa-suite-summary.json",
"required": true,
"missingVerdict": "environment-blocked",
"missingReason": "Live/OAuth runner or OpenAI credentials were unavailable for this proof bundle.",
"productImpact": "P1",
"qaImpact": "P1",
"ownerAction": "Run with live-frontier OAuth before claiming production-shaped searchable OpenClaw dynamic tool coverage.",
"labels": ["qa-lab", "runtime-parity", "searchable-tools", "live-proof"]
},
{
"id": "live-token-efficiency",
"title": "Live assistant-message token efficiency",
"kind": "token-efficiency-summary",
"artifact": "live-token-efficiency/qa-runtime-token-efficiency-summary.json",
"required": true,
"expectedTokenUsageSource": "live-usage",
"missingVerdict": "environment-blocked",
"missingReason": "Live/OAuth runner or OpenAI credentials were unavailable for this proof bundle.",
"productImpact": "P3",
"qaImpact": "P1",
"ownerAction": "Run a live-frontier runtime parity summary and regenerate token efficiency.",
"labels": ["qa-lab", "runtime-parity", "token-efficiency"]
},
{
"id": "soak-100",
"title": "Optional 100-turn soak",
"kind": "qa-suite-summary",
"artifact": "soak-100/qa-suite-summary.json",
"required": true,
"missingVerdict": "environment-blocked",
"missingReason": "Scheduled/Testbox soak runner did not upload artifacts for this proof bundle.",
"productImpact": "P3",
"qaImpact": "P2",
"ownerAction": "Run remotely with a long timeout or record the runner budget blocker.",
"labels": ["qa-lab", "runtime-parity", "soak"]
}
]
}

View File

@@ -13,6 +13,12 @@ import {
import { resolveQaParityPackScenarioIds } from "./agentic-parity.js";
import { runQaCharacterEval, type QaCharacterModelOptions } from "./character-eval.js";
import { resolveRepoRelativeOutputDir } from "./cli-paths.js";
import {
buildQaConfidenceReport,
readQaConfidenceManifestFile,
renderQaConfidenceMarkdownReport,
writeQaConfidenceSelfTestArtifacts,
} from "./confidence-report.js";
import {
buildQaCoverageInventory,
findQaScenarioMatches,
@@ -786,6 +792,60 @@ export async function runQaParityReportCommand(opts: {
}
}
/**
 * Builds a QA confidence report from a manifest and writes it to disk.
 *
 * Resolves `manifest` and `artifactRoot` against the repo root (defaulting to
 * the current working directory), creates the output directory, then writes a
 * markdown report (`qa-confidence-report.md`) and a JSON summary
 * (`qa-confidence-summary.json`). Both paths and the verdict are printed to
 * stdout, and `process.exitCode` is set to 1 when the report does not pass.
 *
 * @param opts.repoRoot Repository root; defaults to `process.cwd()`.
 * @param opts.manifest Manifest path, resolved relative to the repo root.
 * @param opts.artifactRoot Base directory for artifacts; defaults to the repo root.
 * @param opts.outputDir Output directory; a timestamped directory under
 *   `.artifacts/qa-e2e` is used when none is resolved.
 * @param opts.strictZeroUnknowns Enabled only when exactly `true`.
 * @param opts.strictGlobalPass Enabled only when exactly `true`.
 */
export async function runQaConfidenceReportCommand(opts: {
  repoRoot?: string;
  manifest: string;
  artifactRoot?: string;
  outputDir?: string;
  strictZeroUnknowns?: boolean;
  strictGlobalPass?: boolean;
}) {
  const root = path.resolve(opts.repoRoot ?? process.cwd());
  const manifestFile = path.resolve(root, opts.manifest);
  const artifactsBase = path.resolve(root, opts.artifactRoot ?? ".");
  const requestedDir = resolveRepoRelativeOutputDir(root, opts.outputDir);
  // Fall back to a unique, timestamp-suffixed directory when no output dir was given.
  const targetDir =
    requestedDir ??
    path.join(root, ".artifacts", "qa-e2e", `confidence-${Date.now().toString(36)}`);
  await fs.mkdir(targetDir, { recursive: true });

  const loadedManifest = await readQaConfidenceManifestFile(manifestFile);
  const payload = await buildQaConfidenceReport({
    manifest: loadedManifest,
    artifactRoot: artifactsBase,
    strictZeroUnknowns: opts.strictZeroUnknowns === true,
    strictGlobalPass: opts.strictGlobalPass === true,
  });
  const markdown = renderQaConfidenceMarkdownReport(payload);

  const markdownFile = path.join(targetDir, "qa-confidence-report.md");
  const summaryFile = path.join(targetDir, "qa-confidence-summary.json");
  await fs.writeFile(markdownFile, markdown, "utf8");
  await fs.writeFile(summaryFile, `${JSON.stringify(payload, null, 2)}\n`, "utf8");

  const verdict = payload.pass ? "pass" : "fail";
  const statusLines = [
    `QA confidence report: ${markdownFile}\n`,
    `QA confidence summary: ${summaryFile}\n`,
    `QA confidence verdict: ${verdict}\n`,
  ];
  for (const line of statusLines) {
    process.stdout.write(line);
  }
  if (verdict === "fail") {
    process.exitCode = 1;
  }
}
/**
 * Writes the confidence self-test artifacts and reports the outcome.
 *
 * Prints the report path, summary path, and verdict to stdout, and sets
 * `process.exitCode` to 1 when the self-test summary does not pass.
 *
 * @param opts.repoRoot Repository root; defaults to `process.cwd()`.
 * @param opts.outputDir Output directory; a timestamped directory under
 *   `.artifacts/qa-e2e` is used when none is resolved.
 */
export async function runQaConfidenceSelfTestCommand(opts: {
  repoRoot?: string;
  outputDir?: string;
}) {
  const root = path.resolve(opts.repoRoot ?? process.cwd());
  const requestedDir = resolveRepoRelativeOutputDir(root, opts.outputDir);
  const outputDir =
    requestedDir ??
    path.join(root, ".artifacts", "qa-e2e", `confidence-self-test-${Date.now().toString(36)}`);

  const selfTest = await writeQaConfidenceSelfTestArtifacts({ outputDir });
  const verdict = selfTest.summary.pass ? "pass" : "fail";

  process.stdout.write(`QA confidence self-test report: ${selfTest.reportPath}\n`);
  process.stdout.write(`QA confidence self-test summary: ${selfTest.summaryPath}\n`);
  process.stdout.write(`QA confidence self-test verdict: ${verdict}\n`);
  if (verdict === "fail") {
    process.exitCode = 1;
  }
}
export async function runQaCoverageReportCommand(opts: {
repoRoot?: string;
output?: string;

View File

@@ -72,6 +72,23 @@ async function runQaParityReport(opts: {
await runtime.runQaParityReportCommand(opts);
}
/**
 * Loads the QA lab CLI runtime and forwards the options unchanged to its
 * `runQaConfidenceReportCommand`.
 */
async function runQaConfidenceReport(opts: {
  repoRoot?: string;
  manifest: string;
  artifactRoot?: string;
  outputDir?: string;
  strictZeroUnknowns?: boolean;
  strictGlobalPass?: boolean;
}) {
  const qaLabRuntime = await loadQaLabCliRuntime();
  await qaLabRuntime.runQaConfidenceReportCommand(opts);
}
/**
 * Loads the QA lab CLI runtime and forwards the options unchanged to its
 * `runQaConfidenceSelfTestCommand`.
 */
async function runQaConfidenceSelfTest(opts: { repoRoot?: string; outputDir?: string }) {
  const qaLabRuntime = await loadQaLabCliRuntime();
  await qaLabRuntime.runQaConfidenceSelfTestCommand(opts);
}
async function runQaCoverageReport(opts: {
repoRoot?: string;
output?: string;
@@ -424,6 +441,43 @@ export function registerQaLabCli(program: Command) {
},
);
// Registers the `confidence-report` subcommand on `qa`.
// `--manifest` is the only required option; `--artifact-root` defaults to "."
// and both strict flags default to false. The parsed options are handed
// unchanged to runQaConfidenceReport.
qa.command("confidence-report")
.description("Classify QA proof artifacts into a zero-unknown confidence report")
.requiredOption("--manifest <path>", "Confidence profile manifest JSON")
.option("--repo-root <path>", "Repository root to target when running from a neutral cwd")
.option("--artifact-root <path>", "Root directory for relative artifact paths", ".")
.option("--output-dir <path>", "Artifact directory for the confidence report")
.option(
"--strict-zero-unknowns",
"Fail unless every lane passes or has an explicit non-unknown verdict",
false,
)
.option(
"--strict-global-pass",
"Fail unless every lane passes with no blocked, missing, unknown, classified-fail, or unbackfilled skipped rows",
false,
)
.action(
async (opts: {
repoRoot?: string;
manifest: string;
artifactRoot?: string;
outputDir?: string;
strictZeroUnknowns?: boolean;
strictGlobalPass?: boolean;
}) => {
await runQaConfidenceReport(opts);
},
);
// Registers the `confidence-self-test` subcommand on `qa`.
// Both options are optional; the parsed options are handed unchanged to
// runQaConfidenceSelfTest.
qa.command("confidence-self-test")
.description("Write seeded negative-control canaries proving the confidence gate detects drift")
.option("--repo-root <path>", "Repository root to target when running from a neutral cwd")
.option("--output-dir <path>", "Artifact directory for the confidence self-test")
.action(async (opts: { repoRoot?: string; outputDir?: string }) => {
await runQaConfidenceSelfTest(opts);
});
qa.command("jsonl-replay")
.description("Replay curated JSONL transcripts through the runtime parity replay harness")
.option("--repo-root <path>", "Repository root to target when running from a neutral cwd")

View File

@@ -0,0 +1,881 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { afterEach, beforeEach, describe, expect, it } from "vitest";
import {
buildQaConfidenceReport,
buildQaConfidenceSelfTestSummary,
renderQaConfidenceMarkdownReport,
writeQaConfidenceSelfTestArtifacts,
type QaConfidenceManifest,
} from "./confidence-report.js";
describe("qa confidence report", () => {
let tempRoot: string;
beforeEach(async () => {
tempRoot = await fs.mkdtemp(path.join(os.tmpdir(), "qa-confidence-"));
});
afterEach(async () => {
await fs.rm(tempRoot, { recursive: true, force: true });
});
/**
 * Test helper: serializes `payload` as pretty-printed JSON (2-space indent,
 * trailing newline) to `relativePath` under the per-test temp root, creating
 * parent directories as needed.
 *
 * @returns The absolute path of the written file.
 */
async function writeJson(relativePath: string, payload: unknown) {
  const target = path.join(tempRoot, relativePath);
  const parentDir = path.dirname(target);
  await fs.mkdir(parentDir, { recursive: true });
  const serialized = `${JSON.stringify(payload, null, 2)}\n`;
  await fs.writeFile(target, serialized, "utf8");
  return target;
}
it("passes strict zero-unknowns when every lane passes or has an allowed blocked verdict", async () => {
await writeJson("tool-defaults/qa-suite-summary.json", {
counts: { total: 20, passed: 18, skipped: 2, failed: 0 },
scenarios: [],
});
await writeJson("token/qa-runtime-token-efficiency-summary.json", {
status: "estimated",
pass: true,
rows: [{ scenarioId: "one", usageSource: "mock-estimate" }],
});
const manifest: QaConfidenceManifest = {
version: 1,
profile: "codex-100",
lanes: [
{
id: "tool-defaults-direct",
title: "Tool defaults direct",
kind: "qa-suite-summary",
artifact: "tool-defaults/qa-suite-summary.json",
required: true,
},
{
id: "mock-token-efficiency",
title: "Mock token efficiency",
kind: "token-efficiency-summary",
artifact: "token/qa-runtime-token-efficiency-summary.json",
required: true,
expectedTokenUsageSource: "mock-estimate",
},
{
id: "live-token-efficiency",
title: "Live token efficiency",
kind: "token-efficiency-summary",
artifact: "live/qa-runtime-token-efficiency-summary.json",
required: true,
missingVerdict: "environment-blocked",
missingReason: "OPENAI OAuth credentials are not available in this runner.",
},
],
};
const report = await buildQaConfidenceReport({
manifest,
artifactRoot: tempRoot,
strictZeroUnknowns: true,
generatedAt: "2026-05-12T00:00:00.000Z",
});
expect(report.pass).toBe(true);
expect(report.globalPass).toBe(false);
expect(report.counts).toMatchObject({ passed: 2, blocked: 1, unknown: 0, failed: 0 });
expect(report.lanes.map((lane) => lane.verdict)).toEqual([
"pass",
"pass",
"environment-blocked",
]);
expect(report.lanes[0]?.artifactPath).toBe("tool-defaults/qa-suite-summary.json");
expect(report.lanes[0]?.artifactPath).not.toContain(tempRoot);
expect(report.lanes[0]?.details).toContain("counts.skipped=2");
expect(renderQaConfidenceMarkdownReport(report)).toContain("Zero unknowns: yes");
expect(renderQaConfidenceMarkdownReport(report)).toContain("Global pass: no");
});
it("does not let optional lanes block strict gates", async () => {
await writeJson("required/qa-suite-summary.json", {
counts: { total: 1, passed: 1, skipped: 0, failed: 0 },
scenarios: [],
});
const report = await buildQaConfidenceReport({
manifest: {
version: 1,
profile: "codex-100",
lanes: [
{
id: "required",
title: "Required",
kind: "qa-suite-summary",
artifact: "required/qa-suite-summary.json",
required: true,
},
{
id: "optional-missing",
title: "Optional missing",
kind: "qa-suite-summary",
artifact: "optional/qa-suite-summary.json",
required: false,
},
],
},
artifactRoot: tempRoot,
strictZeroUnknowns: true,
strictGlobalPass: true,
generatedAt: "2026-05-13T00:00:00.000Z",
});
expect(report.pass).toBe(true);
expect(report.counts).toMatchObject({ total: 1, passed: 1, unknown: 0 });
expect(report.failures).toEqual([]);
expect(report.lanes[1]).toMatchObject({ id: "optional-missing", status: "missing" });
});
it("fails strict global pass when any lane is blocked, missing, unknown, or classified failed", async () => {
await writeJson("classified/qa-suite-summary.json", {
counts: { total: 1, passed: 0, skipped: 0, failed: 1 },
scenarios: [{ name: "classified", status: "fail" }],
});
await writeJson("unknown/qa-suite-summary.json", {
counts: { total: 1, passed: 0, skipped: 0, failed: 1 },
scenarios: [{ name: "unknown", status: "fail" }],
});
const report = await buildQaConfidenceReport({
manifest: {
version: 1,
profile: "codex-100",
lanes: [
{
id: "blocked-live",
title: "Blocked live",
kind: "qa-suite-summary",
artifact: "live/qa-suite-summary.json",
required: true,
missingVerdict: "environment-blocked",
missingReason: "OPENAI_API_KEY missing.",
},
{
id: "missing-soak",
title: "Missing soak",
kind: "qa-suite-summary",
artifact: "soak/qa-suite-summary.json",
required: true,
},
{
id: "classified-fixture",
title: "Classified fixture",
kind: "qa-suite-summary",
artifact: "classified/qa-suite-summary.json",
required: true,
failureVerdict: "fixture-bug",
},
{
id: "unknown-failure",
title: "Unknown failure",
kind: "qa-suite-summary",
artifact: "unknown/qa-suite-summary.json",
required: true,
},
],
},
artifactRoot: tempRoot,
strictZeroUnknowns: true,
strictGlobalPass: true,
generatedAt: "2026-05-12T00:00:00.000Z",
});
expect(report.pass).toBe(false);
expect(report.zeroUnknowns).toBe(false);
expect(report.globalPass).toBe(false);
expect(report.counts).toMatchObject({
blocked: 1,
missing: 1,
failed: 1,
unknown: 2,
});
expect(report.failures).toEqual([
"blocked-live is blocked: OPENAI_API_KEY missing.",
"missing-soak is missing: artifact missing and no missingVerdict was configured",
"classified-fixture is classified fixture-bug: qa-suite-summary counts.failed=1 counts.total=1 counts.skipped=0",
"unknown-failure is unclassified: qa-suite-summary counts.failed=1 counts.total=1 counts.skipped=0",
]);
});
it("fails strict global pass for skipped suite rows until a backfill lane passes", async () => {
await writeJson("report-only/qa-suite-summary.json", {
counts: { total: 3, passed: 2, skipped: 1, failed: 0 },
scenarios: [],
});
const report = await buildQaConfidenceReport({
manifest: {
version: 1,
profile: "codex-100",
lanes: [
{
id: "report-only",
title: "Report-only",
kind: "qa-suite-summary",
artifact: "report-only/qa-suite-summary.json",
required: true,
},
],
},
artifactRoot: tempRoot,
strictZeroUnknowns: true,
strictGlobalPass: true,
generatedAt: "2026-05-12T00:00:00.000Z",
});
expect(report.zeroUnknowns).toBe(true);
expect(report.globalPass).toBe(false);
expect(report.failures).toEqual([
"report-only has 1 skipped row(s) with no passing backfill lane",
]);
});
it("infers skipped suite rows from totals and scenario status", async () => {
for (const [artifact, expectedDetail] of [
[{ counts: { total: 3, passed: 2, failed: 0 }, scenarios: [] }, "counts.skipped=1"],
[
{
counts: { total: 2, passed: 2, failed: 0 },
scenarios: [
{ name: "passing", status: "pass" },
{ name: "skipped", status: "skip" },
],
},
"counts.skipped=1",
],
] as const) {
await writeJson("report-only/qa-suite-summary.json", artifact);
const report = await buildQaConfidenceReport({
manifest: {
version: 1,
profile: "codex-100",
lanes: [
{
id: "report-only",
title: "Report-only",
kind: "qa-suite-summary",
artifact: "report-only/qa-suite-summary.json",
required: true,
},
],
},
artifactRoot: tempRoot,
strictZeroUnknowns: true,
strictGlobalPass: true,
generatedAt: "2026-05-12T00:00:00.000Z",
});
expect(report.globalPass).toBe(false);
expect(report.failures).toEqual([
"report-only has 1 skipped row(s) with no passing backfill lane",
]);
expect(report.lanes[0]).toMatchObject({ skippedCount: 1 });
expect(report.lanes[0]?.details).toContain(expectedDetail);
}
});
it("rejects skipped token reports when a live usage source is required", async () => {
await writeJson("live-token/qa-runtime-token-efficiency-summary.json", {
status: "skipped",
pass: true,
rows: [],
});
const report = await buildQaConfidenceReport({
manifest: {
version: 1,
profile: "codex-100",
lanes: [
{
id: "live-token-efficiency",
title: "Live token efficiency",
kind: "token-efficiency-summary",
artifact: "live-token/qa-runtime-token-efficiency-summary.json",
required: true,
expectedTokenUsageSource: "live-usage",
},
],
},
artifactRoot: tempRoot,
strictZeroUnknowns: true,
generatedAt: "2026-05-12T00:00:00.000Z",
});
expect(report.pass).toBe(false);
expect(report.lanes[0]).toMatchObject({
status: "unknown",
details: "token summary has no live-usage rows",
});
});
it("preserves partial zero-unknown mode for classified failing lanes", async () => {
await writeJson("classified/qa-suite-summary.json", {
counts: { total: 1, passed: 0, skipped: 0, failed: 1 },
scenarios: [{ name: "classified", status: "fail" }],
});
const report = await buildQaConfidenceReport({
manifest: {
version: 1,
profile: "codex-100",
lanes: [
{
id: "classified-fixture",
title: "Classified fixture",
kind: "qa-suite-summary",
artifact: "classified/qa-suite-summary.json",
required: true,
failureVerdict: "fixture-bug",
},
],
},
artifactRoot: tempRoot,
strictZeroUnknowns: true,
generatedAt: "2026-05-12T00:00:00.000Z",
});
expect(report.pass).toBe(true);
expect(report.zeroUnknowns).toBe(true);
expect(report.globalPass).toBe(false);
expect(report.counts.failed).toBe(1);
});
it("passes strict global pass when skipped suite rows are backfilled by a passing lane", async () => {
await writeJson("report-only/qa-suite-summary.json", {
counts: { total: 3, passed: 2, skipped: 1, failed: 0 },
scenarios: [],
});
await writeJson("live-backfill/qa-suite-summary.json", {
counts: { total: 1, passed: 1, skipped: 0, failed: 0 },
scenarios: [],
});
const report = await buildQaConfidenceReport({
manifest: {
version: 1,
profile: "codex-100",
lanes: [
{
id: "report-only",
title: "Report-only",
kind: "qa-suite-summary",
artifact: "report-only/qa-suite-summary.json",
required: true,
skipBackfillLane: "live-backfill",
},
{
id: "live-backfill",
title: "Live backfill",
kind: "qa-suite-summary",
artifact: "live-backfill/qa-suite-summary.json",
required: true,
},
],
},
artifactRoot: tempRoot,
strictZeroUnknowns: true,
strictGlobalPass: true,
generatedAt: "2026-05-12T00:00:00.000Z",
});
expect(report.pass).toBe(true);
expect(report.zeroUnknowns).toBe(true);
expect(report.globalPass).toBe(true);
expect(report.lanes[0]).toMatchObject({
skippedCount: 1,
skipBackfillLane: "live-backfill",
skipBackfilled: true,
});
});
it("classifies environment-blocking gateway sentinels without turning them into unknowns", async () => {
await writeJson("live/qa-suite-summary.json", {
counts: { total: 1, passed: 1, skipped: 0, failed: 0 },
gatewayLogSentinels: [
{
kind: "live-quota-or-subscription",
verdict: "environment-blocked",
owner: "environment",
productImpact: "P4",
qaImpact: "P0",
line: 12,
text: "OpenAI quota exceeded",
},
],
scenarios: [],
});
const report = await buildQaConfidenceReport({
manifest: {
version: 1,
profile: "codex-100",
lanes: [
{
id: "first-hour-live",
title: "First hour live",
kind: "qa-suite-summary",
artifact: "live/qa-suite-summary.json",
required: true,
},
],
},
artifactRoot: tempRoot,
strictZeroUnknowns: true,
generatedAt: "2026-05-13T00:00:00.000Z",
});
expect(report.pass).toBe(true);
expect(report.globalPass).toBe(false);
expect(report.counts).toMatchObject({ blocked: 1, unknown: 0 });
expect(report.lanes[0]).toMatchObject({
status: "blocked",
verdict: "environment-blocked",
});
});
it("does not let environment sentinels hide separate suite failures", async () => {
await writeJson("live/qa-suite-summary.json", {
counts: { total: 2, passed: 1, skipped: 0, failed: 1 },
gatewayLogSentinels: [
{
kind: "live-quota-or-subscription",
verdict: "environment-blocked",
owner: "environment",
line: 12,
text: "OpenAI quota exceeded",
},
],
scenarios: [
{ name: "quota", status: "pass" },
{ name: "unrelated-drift", status: "fail" },
],
});
const report = await buildQaConfidenceReport({
manifest: {
version: 1,
profile: "codex-100",
lanes: [
{
id: "first-hour-live",
title: "First hour live",
kind: "qa-suite-summary",
artifact: "live/qa-suite-summary.json",
required: true,
missingVerdict: "environment-blocked",
},
],
},
artifactRoot: tempRoot,
strictZeroUnknowns: true,
generatedAt: "2026-05-13T00:00:00.000Z",
});
expect(report.pass).toBe(false);
expect(report.counts).toMatchObject({ blocked: 0, unknown: 1 });
expect(report.lanes[0]).toMatchObject({ status: "unknown" });
expect(report.lanes[0]?.details).toContain("suite also reports failures");
});
it("classifies product and plugin gateway sentinels as known failing lanes", async () => {
await writeJson("live/qa-suite-summary.json", {
counts: { total: 1, passed: 1, skipped: 0, failed: 0 },
scenarios: [
{
name: "plugin hook health sentinel",
status: "pass",
steps: [],
runtimeParity: {
scenarioId: "plugin-hook-health-sentinel",
drift: "none",
cells: {
pi: { sentinelFindings: [] },
codex: {
sentinelFindings: [
{
kind: "plugin-hook-failure",
verdict: "qa-harness-bug",
owner: "plugin",
productImpact: "P1",
qaImpact: "P0",
line: 4,
text: "before_prompt_build hook failed",
},
],
},
},
},
},
],
});
const report = await buildQaConfidenceReport({
manifest: {
version: 1,
profile: "codex-100",
lanes: [
{
id: "first-hour-live",
title: "First hour live",
kind: "qa-suite-summary",
artifact: "live/qa-suite-summary.json",
required: true,
},
],
},
artifactRoot: tempRoot,
strictZeroUnknowns: true,
generatedAt: "2026-05-13T00:00:00.000Z",
});
expect(report.pass).toBe(true);
expect(report.globalPass).toBe(false);
expect(report.counts).toMatchObject({ failed: 1, unknown: 0 });
expect(report.lanes[0]).toMatchObject({
status: "fail",
verdict: "qa-harness-bug",
});
});
it("treats corrupt artifacts as unknown instead of allowed missing lanes", async () => {
const artifactPath = path.join(tempRoot, "live", "qa-suite-summary.json");
await fs.mkdir(path.dirname(artifactPath), { recursive: true });
await fs.writeFile(artifactPath, "{not-json", "utf8");
const report = await buildQaConfidenceReport({
manifest: {
version: 1,
profile: "codex-100",
lanes: [
{
id: "first-hour-live",
title: "First hour live",
kind: "qa-suite-summary",
artifact: "live/qa-suite-summary.json",
required: true,
missingVerdict: "environment-blocked",
},
],
},
artifactRoot: tempRoot,
strictZeroUnknowns: true,
generatedAt: "2026-05-13T00:00:00.000Z",
});
expect(report.pass).toBe(false);
expect(report.counts).toMatchObject({ blocked: 0, unknown: 1 });
expect(report.lanes[0]).toMatchObject({
status: "unknown",
});
expect(report.lanes[0]?.details).toContain("artifact unreadable");
});
it("treats schema-invalid suite artifacts as unknown", async () => {
await writeJson("live/qa-suite-summary.json", {});
const report = await buildQaConfidenceReport({
manifest: {
version: 1,
profile: "codex-100",
lanes: [
{
id: "first-hour-live",
title: "First hour live",
kind: "qa-suite-summary",
artifact: "live/qa-suite-summary.json",
required: true,
},
],
},
artifactRoot: tempRoot,
strictZeroUnknowns: true,
generatedAt: "2026-05-13T00:00:00.000Z",
});
expect(report.pass).toBe(false);
expect(report.counts.unknown).toBe(1);
expect(report.lanes[0]?.details).toContain("missing counts.failed and scenarios[]");
});
it("treats empty suite artifacts as unknown", async () => {
await writeJson("live/qa-suite-summary.json", {
counts: { total: 0, passed: 0, skipped: 0, failed: 0 },
scenarios: [],
});
const report = await buildQaConfidenceReport({
manifest: {
version: 1,
profile: "codex-100",
lanes: [
{
id: "first-hour-live",
title: "First hour live",
kind: "qa-suite-summary",
artifact: "live/qa-suite-summary.json",
required: true,
failureVerdict: "qa-harness-bug",
},
],
},
artifactRoot: tempRoot,
strictZeroUnknowns: true,
generatedAt: "2026-05-13T00:00:00.000Z",
});
expect(report.pass).toBe(false);
expect(report.counts).toMatchObject({ failed: 0, unknown: 1 });
expect(report.lanes[0]).toMatchObject({ status: "unknown" });
expect(report.lanes[0]?.details).toContain("no executed scenarios");
});
it("treats suite count and scenario mismatches as unknown", async () => {
await writeJson("live/qa-suite-summary.json", {
counts: { total: 2, passed: 2, skipped: 0, failed: 0 },
scenarios: [
{ name: "passing", status: "pass" },
{ name: "stale-failure", status: "fail" },
],
});
const report = await buildQaConfidenceReport({
manifest: {
version: 1,
profile: "codex-100",
lanes: [
{
id: "first-hour-live",
title: "First hour live",
kind: "qa-suite-summary",
artifact: "live/qa-suite-summary.json",
required: true,
failureVerdict: "qa-harness-bug",
},
],
},
artifactRoot: tempRoot,
strictZeroUnknowns: true,
generatedAt: "2026-05-13T00:00:00.000Z",
});
expect(report.pass).toBe(false);
expect(report.counts).toMatchObject({ failed: 0, unknown: 1 });
expect(report.lanes[0]).toMatchObject({ status: "unknown" });
expect(report.lanes[0]?.details).toContain("count/scenario mismatch");
});
// An empty JSON object carries no pass/fail information, so a runtime-parity-summary lane
// backed by it must be counted as unknown with a detail mentioning the missing pass signal.
it("requires generic summary lanes to expose an explicit pass signal", async () => {
await writeJson("runtime/qa-runtime-parity-summary.json", {});
const report = await buildQaConfidenceReport({
manifest: {
version: 1,
profile: "codex-100",
lanes: [
{
id: "runtime-parity",
title: "Runtime parity",
kind: "runtime-parity-summary",
artifact: "runtime/qa-runtime-parity-summary.json",
required: true,
},
],
},
artifactRoot: tempRoot,
strictZeroUnknowns: true,
generatedAt: "2026-05-13T00:00:00.000Z",
});
expect(report.pass).toBe(false);
expect(report.counts.unknown).toBe(1);
expect(report.lanes[0]?.details).toContain("explicit pass signal");
});
// Table-driven check: each [artifact, expectedDetail] pair is a replay summary that contains
// no usable evidence (no transcripts, zero user turns, or no drift array). Every one of them
// must be reported as an unknown lane whose details include the expected phrase.
it("requires JSONL replay summaries to contain replayed user turns", async () => {
for (const [artifact, expectedDetail] of [
[{ transcripts: [] }, "no transcripts"],
[
{ transcripts: [{ transcriptPath: "empty.jsonl", userTurnCount: 0, drift: [] }] },
"no replayed user turns",
],
[
{ transcripts: [{ transcriptPath: "missing-drift.jsonl", userTurnCount: 1 }] },
"missing drift array",
],
] as const) {
// The same artifact path is rewritten on every iteration before the report is rebuilt.
await writeJson("jsonl/qa-jsonl-replay-summary.json", artifact);
const report = await buildQaConfidenceReport({
manifest: {
version: 1,
profile: "codex-100",
lanes: [
{
id: "jsonl-expanded",
title: "Expanded JSONL replay",
kind: "jsonl-replay-summary",
artifact: "jsonl/qa-jsonl-replay-summary.json",
required: true,
failureVerdict: "fixture-bug",
},
],
},
artifactRoot: tempRoot,
strictZeroUnknowns: true,
generatedAt: "2026-05-13T00:00:00.000Z",
});
expect(report.pass).toBe(false);
expect(report.counts).toMatchObject({ failed: 0, unknown: 1 });
expect(report.lanes[0]).toMatchObject({ status: "unknown" });
expect(report.lanes[0]?.details).toContain(expectedDetail);
}
});
// Table-driven check: a self-test summary that claims pass: true but lists no canaries, or
// only a single canary, must still be reported as unknown with the matching detail text.
it("requires confidence self-test summaries to contain every seeded canary", async () => {
for (const [artifact, expectedDetail] of [
[{ pass: true, canaries: [] }, "no canaries"],
[
{ pass: true, canaries: [{ id: "prompt-drift", detected: true }] },
"missing expected canaries",
],
] as const) {
await writeJson("confidence-self-test/qa-confidence-self-test-summary.json", artifact);
const report = await buildQaConfidenceReport({
manifest: {
version: 1,
profile: "codex-100",
lanes: [
{
id: "confidence-self-test",
title: "Confidence self-test",
kind: "self-test-summary",
artifact: "confidence-self-test/qa-confidence-self-test-summary.json",
required: true,
failureVerdict: "qa-harness-bug",
},
],
},
artifactRoot: tempRoot,
strictZeroUnknowns: true,
generatedAt: "2026-05-13T00:00:00.000Z",
});
expect(report.pass).toBe(false);
expect(report.counts).toMatchObject({ failed: 0, unknown: 1 });
expect(report.lanes[0]).toMatchObject({ status: "unknown" });
expect(report.lanes[0]?.details).toContain(expectedDetail);
}
});
// The lane below has a failing suite artifact but declares no failureVerdict in the manifest.
// The report is expected to count it as unknown and to list it as "unclassified".
it("fails strict zero-unknowns for an unclassified failing lane", async () => {
await writeJson("first-hour/qa-suite-summary.json", {
counts: { total: 18, passed: 17, failed: 1 },
scenarios: [{ name: "approval-turn-tool-followthrough", status: "fail", steps: [] }],
});
const report = await buildQaConfidenceReport({
manifest: {
version: 1,
profile: "codex-100",
lanes: [
{
id: "first-hour-20-direct",
title: "First-hour 20 direct",
kind: "qa-suite-summary",
artifact: "first-hour/qa-suite-summary.json",
required: true,
},
],
},
artifactRoot: tempRoot,
strictZeroUnknowns: true,
generatedAt: "2026-05-12T00:00:00.000Z",
});
expect(report.pass).toBe(false);
expect(report.counts.unknown).toBe(1);
expect(report.failures[0]).toContain("first-hour-20-direct is unclassified");
});
// Counterpart to the unclassified case: the replay artifact contains drift, but the manifest
// lane carries a failureVerdict plus impact ratings. The lane is then a known failure
// (status "fail"), not an unknown.
it("accepts a classified failing lane without treating it as unknown", async () => {
await writeJson("jsonl/qa-jsonl-replay-summary.json", {
transcripts: [
{
transcriptPath: "curated.jsonl",
userTurnCount: 2,
drift: ["none", "tool-result-shape"],
firstDriftAtTurn: 2,
},
],
});
const report = await buildQaConfidenceReport({
manifest: {
version: 1,
profile: "codex-100",
lanes: [
{
id: "jsonl-expanded",
title: "Expanded JSONL replay",
kind: "jsonl-replay-summary",
artifact: "jsonl/qa-jsonl-replay-summary.json",
required: true,
failureVerdict: "fixture-bug",
productImpact: "P4",
qaImpact: "P1",
},
],
},
artifactRoot: tempRoot,
strictZeroUnknowns: true,
generatedAt: "2026-05-12T00:00:00.000Z",
});
// `pass` stays true because nothing is unknown, while `globalPass` is false
// because one lane is counted as failed.
expect(report.pass).toBe(true);
expect(report.globalPass).toBe(false);
expect(report.counts.failed).toBe(1);
expect(report.counts.unknown).toBe(0);
// The manifest's verdict and impact ratings are echoed on the lane result.
expect(report.lanes[0]).toMatchObject({
status: "fail",
verdict: "fixture-bug",
productImpact: "P4",
qaImpact: "P1",
});
});
// Pins the exact, ordered list of canary ids produced by the self-test summary and checks
// that every canary reports detected: true.
it("emits confidence self-test canaries for every drift class we need to catch", async () => {
const summary = await buildQaConfidenceSelfTestSummary("2026-05-12T00:00:00.000Z");
expect(summary.pass).toBe(true);
// toEqual on the mapped ids is order-sensitive, so adding, removing, or reordering
// a canary requires updating this list.
expect(summary.canaries.map((canary) => canary.id)).toEqual([
"prompt-drift",
"tool-description-schema-drift",
"runtime-tool-call-drop",
"tool-result-mismatch",
"failure-mode-drift",
"token-efficiency-regression",
"jsonl-replay-ordering-drift",
]);
expect(summary.canaries.every((canary) => canary.detected)).toBe(true);
});
// Verifies that the writer creates both the summary and the report file on disk and that
// the persisted summary JSON records pass: true.
it("writes confidence self-test artifacts", async () => {
const result = await writeQaConfidenceSelfTestArtifacts({
outputDir: tempRoot,
generatedAt: "2026-05-12T00:00:00.000Z",
});
// fs.stat rejects for a missing path, so a resolved value proves the file exists.
await expect(fs.stat(result.summaryPath)).resolves.toBeTruthy();
await expect(fs.stat(result.reportPath)).resolves.toBeTruthy();
const summary = JSON.parse(await fs.readFile(result.summaryPath, "utf8")) as { pass: boolean };
expect(summary.pass).toBe(true);
});
});

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,284 @@
import { describe, expect, it } from "vitest";
import {
buildHarnessParityCell,
buildHarnessParityResult,
type HarnessRuntimeParityCell,
type HarnessVariant,
} from "./harness-parity.js";
import type { RuntimeId } from "./runtime-parity.js";
import type { RuntimeParityComparisonMode } from "./runtime-tool-metadata.js";
// Two harness variants on the same runtime; the tests below differ only in the cell
// overrides passed for each side.
const LEFT: HarnessVariant = { id: "left", label: "Left", runtime: "pi" };
const RIGHT: HarnessVariant = { id: "right", label: "Right", runtime: "pi" };
// Baseline prompt report shared by both sides. Tests spread it and change a single
// section (system prompt, skills, or one tool entry) to provoke one specific drift class.
// The hash values are readable placeholders, not real digests.
const BASE_PROMPT_REPORT = {
systemPrompt: {
chars: 100,
projectContextChars: 40,
nonProjectContextChars: 60,
hash: "system-a",
},
skills: {
promptChars: 12,
hash: "skills-a",
},
tools: {
schemaChars: 20,
entries: [
{
name: "read",
summaryChars: 8,
summaryHash: "summary-a",
schemaChars: 20,
schemaHash: "schema-a",
propertiesCount: 1,
},
],
},
};
/**
 * Builds a runtime parity cell for tests.
 *
 * Starts from a fixed baseline (one assistant transcript row, no tool calls, final text
 * "same", small token usage, the shared BASE_PROMPT_REPORT) and lets `overrides` replace
 * any of those fields.
 */
function makeCell(
  runtime: RuntimeId,
  overrides: Partial<HarnessRuntimeParityCell> = {},
): HarnessRuntimeParityCell {
  // Rebuilt on every call so tests never share mutable arrays or objects.
  const baseline: HarnessRuntimeParityCell = {
    runtime,
    transcriptBytes: '{"message":{"role":"assistant","content":"same"}}\n',
    toolCalls: [],
    finalText: "same",
    usage: { inputTokens: 10, outputTokens: 5, totalTokens: 15 },
    wallClockMs: 1,
    bootStateLines: [],
    systemPromptReport: BASE_PROMPT_REPORT,
  };
  return { ...baseline, ...overrides };
}
/**
 * Runs the full harness parity pipeline for a left/right pair of cell overrides and
 * returns only the resulting drift classification.
 *
 * Both sides use the "pi" runtime and live token usage; `comparisonMode` is forwarded
 * only when the caller supplies one.
 */
function classify(
  left: Partial<HarnessRuntimeParityCell>,
  right: Partial<HarnessRuntimeParityCell>,
  comparisonMode?: RuntimeParityComparisonMode,
) {
  const toHarnessCell = (variant: HarnessVariant, overrides: Partial<HarnessRuntimeParityCell>) =>
    buildHarnessParityCell({
      variant,
      cell: makeCell("pi", overrides),
      tokenUsageSource: "live-usage",
    });
  const result = buildHarnessParityResult({
    scenarioId: "scenario",
    left: toHarnessCell(LEFT, left),
    right: toHarnessCell(RIGHT, right),
    ...(comparisonMode ? { comparisonMode } : {}),
  });
  return result.drift;
}
describe("harness parity", () => {
// Each expectation changes exactly one part of the right-hand prompt report and checks
// which surface-level drift class is reported for it.
it("classifies prompt and tool surface drift before behavioral drift", () => {
// Different system prompt size (and no hash on the right side).
expect(
classify(
{},
{
systemPromptReport: {
...BASE_PROMPT_REPORT,
systemPrompt: { chars: 101, projectContextChars: 40, nonProjectContextChars: 61 },
},
},
),
).toBe("system-prompt");
// Same sizes, different system prompt hash.
expect(
classify(
{},
{
systemPromptReport: {
...BASE_PROMPT_REPORT,
systemPrompt: {
chars: 100,
projectContextChars: 40,
nonProjectContextChars: 60,
hash: "system-b",
},
},
},
),
).toBe("system-prompt");
// A skills hash change is also reported as system-prompt drift.
expect(
classify(
{},
{
systemPromptReport: {
...BASE_PROMPT_REPORT,
skills: { promptChars: 12, hash: "skills-b" },
},
},
),
).toBe("system-prompt");
// Only the tool summary hash differs.
expect(
classify(
{},
{
systemPromptReport: {
...BASE_PROMPT_REPORT,
tools: {
schemaChars: 20,
entries: [
{
name: "read",
summaryChars: 8,
summaryHash: "summary-b",
schemaChars: 20,
schemaHash: "schema-a",
propertiesCount: 1,
},
],
},
},
},
),
).toBe("tool-description");
// Only the tool schema hash differs.
expect(
classify(
{},
{
systemPromptReport: {
...BASE_PROMPT_REPORT,
tools: {
schemaChars: 20,
entries: [
{
name: "read",
summaryChars: 8,
summaryHash: "summary-a",
schemaChars: 20,
schemaHash: "schema-b",
propertiesCount: 1,
},
],
},
},
},
),
).toBe("tool-schema");
});
// With identical prompt reports on both sides, drift is classified from tool calls,
// transcript structure, final text, and error classes.
it("classifies behavioral harness drift", () => {
// Same tool, different argument hash.
expect(
classify(
{ toolCalls: [{ tool: "read", argsHash: "a", resultHash: "r" }] },
{ toolCalls: [{ tool: "read", argsHash: "b", resultHash: "r" }] },
),
).toBe("tool-call-shape");
// Same call, different result hash.
expect(
classify(
{ toolCalls: [{ tool: "read", argsHash: "a", resultHash: "r1" }] },
{ toolCalls: [{ tool: "read", argsHash: "a", resultHash: "r2" }] },
),
).toBe("tool-result-shape");
expect(classify({ finalText: "same text" }, { finalText: "different text" })).toBe("text-only");
// Rows without a message role (model_change, thinking_level_change, custom) are not
// counted as transcript records, so the extra rows on the left produce no drift.
expect(
classify(
{
transcriptBytes:
'{"type":"model_change","modelId":"gpt-5.5"}\n' +
'{"type":"thinking_level_change","thinkingLevel":"off"}\n' +
'{"type":"custom","customType":"model-snapshot"}\n' +
'{"message":{"role":"assistant","content":"same"}}\n',
},
{ transcriptBytes: '{"message":{"role":"assistant","content":"same"}}\n' },
),
).toBe("none");
// An extra record with a nested message.role changes the record count.
expect(
classify(
{ transcriptBytes: '{"message":{"role":"assistant"}}\n' },
{ transcriptBytes: '{"message":{"role":"assistant"}}\n{"message":{"role":"tool"}}\n' },
),
).toBe("structural");
// Records that carry `role` at the top level are counted as well.
expect(
classify(
{ transcriptBytes: '{"role":"assistant","content":"same"}\n' },
{
transcriptBytes:
'{"role":"assistant","content":"same"}\n{"role":"tool","content":"same"}\n',
},
),
).toBe("structural");
// A runtime error on either side is reported as failure-mode drift.
expect(classify({ runtimeErrorClass: "timeout" }, {})).toBe("failure-mode");
});
// In "codex-native-workspace" and "outcome-only" modes, differences in tool calls, tool
// results, and transcript record counts are ignored; with equal final text the drift is "none".
it("honors native workspace comparison mode for outcome-only harness proofs", () => {
// Different argument hashes and an extra transcript record on the left side.
expect(
classify(
{
transcriptBytes:
'{"message":{"role":"assistant","content":"same"}}\n' +
'{"message":{"role":"tool","content":"same result"}}\n',
toolCalls: [{ tool: "bash", argsHash: "sed-160", resultHash: "same-result" }],
},
{
transcriptBytes: '{"message":{"role":"assistant","content":"same"}}\n',
toolCalls: [{ tool: "bash", argsHash: "sed-200", resultHash: "same-result" }],
},
"codex-native-workspace",
),
).toBe("none");
// Both argument and result hashes differ.
expect(
classify(
{ toolCalls: [{ tool: "bash", argsHash: "a", resultHash: "r1" }] },
{ toolCalls: [{ tool: "bash", argsHash: "b", resultHash: "r2" }] },
"outcome-only",
),
).toBe("none");
});
// The relaxed comparison modes only skip behavioral checks: a change in the prompt report
// is still reported even when the tool calls differ as well.
it("keeps prompt and tool surface checks strict under native workspace comparison mode", () => {
expect(
classify(
{},
{
systemPromptReport: {
...BASE_PROMPT_REPORT,
systemPrompt: { chars: 101, projectContextChars: 40, nonProjectContextChars: 61 },
},
toolCalls: [{ tool: "bash", argsHash: "changed", resultHash: "changed" }],
},
"codex-native-workspace",
),
).toBe("system-prompt");
// The right-hand tool entry has a different summaryChars and carries no hashes, so both
// the description and the schema fingerprints change; "tool-description" is reported.
expect(
classify(
{},
{
systemPromptReport: {
...BASE_PROMPT_REPORT,
tools: {
schemaChars: 20,
entries: [{ name: "read", summaryChars: 9, schemaChars: 20, propertiesCount: 1 }],
},
},
toolCalls: [{ tool: "bash", argsHash: "changed", resultHash: "changed" }],
},
"outcome-only",
),
).toBe("tool-description");
});
// With tokenUsageSource "mock-estimate" the cell's zeroed usage is replaced by an estimate
// derived from prompt sizes and transcript length.
it("labels mock token estimates separately from live usage", () => {
const sourceCell = makeCell("pi", {
usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 },
});
const cell = buildHarnessParityCell({
variant: LEFT,
cell: sourceCell,
tokenUsageSource: "mock-estimate",
});
// 100 system prompt + 12 skills + 8 tool summary + 20 tool schema chars (from
// BASE_PROMPT_REPORT) plus the transcript length; the estimate is one token per 4 chars.
const inputChars = 100 + 12 + 8 + 20 + sourceCell.transcriptBytes.length;
expect(cell.tokenUsageSource).toBe("mock-estimate");
expect(cell.tokenUsage.totalTokens).toBeGreaterThan(0);
expect(cell.tokenUsage.inputTokens).toBe(Math.ceil(inputChars / 4));
expect(cell.promptStats.toolCount).toBe(1);
});
});

View File

@@ -0,0 +1,491 @@
import { createHash } from "node:crypto";
import type {
RuntimeId,
RuntimeParityCell,
RuntimeParityDrift,
RuntimeParityToolCall,
RuntimeParityUsage,
} from "./runtime-parity.js";
import type { RuntimeParityComparisonMode } from "./runtime-tool-metadata.js";
/**
 * One side of a harness comparison.
 *
 * Within this module only `label` is read (for the markdown report title); the variant is
 * otherwise carried through unchanged on each parity cell and on the report.
 */
export type HarnessVariant = {
id: string;
label: string;
runtime?: RuntimeId;
model?: string;
// NOTE(review): the patch and overlay fields are not consumed in this module; presumably
// the runner that produces the cells applies them — confirm against the caller.
configPatch?: Record<string, unknown>;
systemPromptOverlay?: string;
toolDescriptionOverlay?: Record<string, string>;
};
/**
 * Drift classification for a harness comparison: every runtime parity drift class plus
 * three classes for differences in the prompt/tool surface the model is given.
 */
export type HarnessParityDrift =
| RuntimeParityDrift
| "system-prompt"
| "tool-description"
| "tool-schema";
/**
 * Size figures extracted from a system prompt report.
 *
 * As built by `buildPromptStats`, each character count is a non-negative integer (missing
 * or invalid inputs become 0), `toolSummaryChars` is the sum over all tool entries, and
 * `toolCount` is the number of tool entries.
 */
export type HarnessParityPromptStats = {
systemPromptChars: number;
projectContextChars: number;
nonProjectContextChars: number;
skillPromptChars: number;
toolSummaryChars: number;
toolSchemaChars: number;
toolCount: number;
};
/**
 * Loosely typed view of a system prompt report attached to a parity cell.
 *
 * Every field is optional. `buildHarnessParityCell` fingerprints these sections, so any
 * field that is present contributes to the corresponding hash.
 */
export type RuntimeParitySystemPromptReport = {
// Hashed as a whole (together with `skills`) into `systemPromptHash`.
systemPrompt?: {
chars?: number;
projectContextChars?: number;
nonProjectContextChars?: number;
text?: string;
hash?: string;
// NOTE(review): how `contentHash` differs from `hash` is not visible here — confirm with the producer.
contentHash?: string;
};
skills?: {
promptChars?: number;
prompt?: string;
hash?: string;
contentHash?: string;
};
tools?: {
listChars?: number;
schemaChars?: number;
// Per entry: name/summary/summaryHash/summaryChars feed `toolDescriptionHash`;
// name/schema/schemaHash/schemaChars/propertiesCount feed `toolSchemaHash`.
entries?: Array<{
name?: string;
summary?: string;
summaryHash?: string;
summaryChars?: number;
schema?: unknown;
schemaHash?: string;
schemaChars?: number;
propertiesCount?: number;
}>;
};
};
/** A runtime parity cell that may additionally carry the system prompt report of its run. */
export type HarnessRuntimeParityCell = RuntimeParityCell & {
systemPromptReport?: RuntimeParitySystemPromptReport;
};
/**
 * A runtime cell enriched by `buildHarnessParityCell` with its variant, prompt size
 * statistics, three surface fingerprints, and the token usage used for comparison.
 */
export type HarnessParityCell = HarnessRuntimeParityCell & {
variant: HarnessVariant;
promptStats: HarnessParityPromptStats;
// Fingerprint of the report's systemPrompt and skills sections.
systemPromptHash: string;
// Fingerprint of the per-tool name and summary fields.
toolDescriptionHash: string;
// Fingerprint of the tool list/schema sizes and per-tool schema fields.
toolSchemaHash: string;
// The cell's own usage for "live-usage", or a character-based estimate for "mock-estimate".
tokenUsage: RuntimeParityUsage;
tokenUsageSource: "live-usage" | "mock-estimate";
};
/** Outcome of comparing the left and right harness cells for one scenario. */
export type HarnessParityResult = {
scenarioId: string;
left: HarnessParityCell;
right: HarnessParityCell;
drift: HarnessParityDrift;
// Human-readable explanation; not set when drift is "none".
driftDetails?: string;
// Right-hand value minus left-hand value for each prompt statistic.
promptDelta: {
systemPromptChars: number;
projectContextChars: number;
skillPromptChars: number;
toolSummaryChars: number;
toolSchemaChars: number;
toolCount: number;
};
// Change in total tokens relative to the left side, in percent. When the left total is 0
// this is 0 if the right total is also 0 and 100 otherwise.
tokenDeltaPercent: number;
// 1-based index of the first transcript line that differs; not set when drift is "none".
firstDriftTurn?: number;
};
/**
 * Aggregate harness parity report over a set of scenarios, in the shape consumed by
 * `renderHarnessParityMarkdownReport`.
 */
export type HarnessParityReport = {
generatedAt: string;
providerMode: string;
left: HarnessVariant;
right: HarnessVariant;
results: HarnessParityResult[];
// NOTE(review): `pass` and `failures` are computed outside this module; the renderer shows
// `pass` as the verdict and lists `failures` under "Gate Failures".
pass: boolean;
failures: string[];
};
/** Returns the lowercase hex SHA-256 digest of `value`. */
function sha256(value: string) {
  const hasher = createHash("sha256");
  hasher.update(value);
  return hasher.digest("hex");
}
/**
 * Counts the transcript rows that represent a message.
 *
 * A row counts when it parses as JSON and carries a string `role`, either nested under
 * `message` or at the top level. Blank rows, rows that fail to parse, and rows without a
 * role (for example metadata events) are not counted.
 */
function countComparableTranscriptRecords(transcriptBytes: string) {
  const isMessageRecord = (row: string): boolean => {
    try {
      const record = JSON.parse(row) as {
        message?: { role?: unknown };
        role?: unknown;
      };
      const nestedRole = record.message ? record.message.role : undefined;
      return typeof nestedRole === "string" || typeof record.role === "string";
    } catch {
      // Ignore malformed QA transcript rows and keep parity classification deterministic.
      return false;
    }
  };
  return transcriptBytes
    .split(/\r?\n/u)
    .map((row) => row.trim())
    .filter((row) => row.length > 0 && isMessageRecord(row)).length;
}
/**
 * Produces a canonical copy of `value` for hashing: object keys are sorted recursively so
 * that two structurally equal values serialize to the same JSON regardless of key order.
 *
 * Values that `JSON.stringify` cannot handle are replaced instead of failing, because tool
 * schemas are typed `unknown` and may come from arbitrary sources:
 * - a bigint becomes its decimal string followed by "n";
 * - an object that contains itself (directly or indirectly) is replaced by "[Circular]"
 *   at the point where the cycle closes, instead of recursing until the stack overflows.
 *
 * @param value - Any value to normalize.
 * @param seen - Objects on the current traversal path; callers normally omit it.
 * @returns The normalized value; primitives other than bigint are returned unchanged.
 */
function normalizeForStableHash(value: unknown, seen = new WeakSet<object>()): unknown {
  if (typeof value === "bigint") {
    return `${value.toString()}n`;
  }
  if (!value || typeof value !== "object") {
    return value;
  }
  if (seen.has(value)) {
    return "[Circular]";
  }
  seen.add(value);
  let normalized: unknown;
  if (Array.isArray(value)) {
    normalized = value.map((entry) => normalizeForStableHash(entry, seen));
  } else {
    const record = value as Record<string, unknown>;
    normalized = Object.fromEntries(
      Object.keys(record)
        .toSorted((left, right) => left.localeCompare(right))
        .map((key) => [key, normalizeForStableHash(record[key], seen)]),
    );
  }
  // Only ancestors count as cycles: the same object may legitimately appear in two branches.
  seen.delete(value);
  return normalized;
}
/**
 * Hashes `value` independently of object key order: the value is normalized, serialized
 * to JSON, and digested with SHA-256. A value that serializes to nothing (such as
 * `undefined`) is hashed as the string "null".
 */
function stableHash(value: unknown) {
  const canonicalJson = JSON.stringify(normalizeForStableHash(value));
  return sha256(canonicalJson ?? "null");
}
/**
 * Coerces an untrusted value to a non-negative integer count.
 *
 * Finite numbers greater than zero are rounded down; anything else (non-numbers, NaN,
 * infinities, zero, negatives) yields 0.
 */
function readPositiveNumber(value: unknown) {
  if (typeof value !== "number") {
    return 0;
  }
  if (!Number.isFinite(value) || !(value > 0)) {
    return 0;
  }
  return Math.floor(value);
}
/**
 * Extracts prompt size statistics from a (possibly missing or partial) prompt report.
 *
 * Every count goes through `readPositiveNumber`, so absent or invalid values become 0.
 * A `tools.entries` value that is not an array is treated as an empty list.
 */
function buildPromptStats(report: RuntimeParitySystemPromptReport | undefined) {
  const promptSection = report?.systemPrompt;
  const rawEntries = report?.tools?.entries;
  const entries = Array.isArray(rawEntries) ? rawEntries : [];
  let summaryTotal = 0;
  for (const entry of entries) {
    summaryTotal += readPositiveNumber(entry.summaryChars);
  }
  return {
    systemPromptChars: readPositiveNumber(promptSection?.chars),
    projectContextChars: readPositiveNumber(promptSection?.projectContextChars),
    nonProjectContextChars: readPositiveNumber(promptSection?.nonProjectContextChars),
    skillPromptChars: readPositiveNumber(report?.skills?.promptChars),
    toolSummaryChars: summaryTotal,
    toolSchemaChars: readPositiveNumber(report?.tools?.schemaChars),
    toolCount: entries.length,
  };
}
/**
 * Estimates token usage from character counts, for runs without live usage figures.
 *
 * Input is the system prompt, skills prompt, tool summaries, tool schemas, and the whole
 * transcript; output is the final text plus a flat 80 characters per tool call. Both are
 * converted at one token per four characters, rounded up.
 */
function estimateUsage(
  cell: RuntimeParityCell,
  stats: HarnessParityPromptStats,
): RuntimeParityUsage {
  const CHARS_PER_TOKEN = 4;
  const CHARS_PER_TOOL_CALL = 80;
  const promptChars = [
    stats.systemPromptChars,
    stats.skillPromptChars,
    stats.toolSummaryChars,
    stats.toolSchemaChars,
  ].reduce((total, chars) => total + chars, 0);
  const inputTokens = Math.ceil((promptChars + cell.transcriptBytes.length) / CHARS_PER_TOKEN);
  const outputTokens = Math.ceil(
    (cell.finalText.length + cell.toolCalls.length * CHARS_PER_TOOL_CALL) / CHARS_PER_TOKEN,
  );
  return { inputTokens, outputTokens, totalTokens: inputTokens + outputTokens };
}
/**
 * Collapses every run of whitespace to a single space and removes leading and trailing
 * whitespace, so final texts can be compared without regard to formatting.
 */
function normalizeTextForParity(text: string) {
  return text
    .split(/\s+/u)
    .filter((word) => word.length > 0)
    .join(" ");
}
/**
 * Compares two tool call sequences by count, tool name, and argument hash.
 *
 * @returns A description of the first difference found, or `undefined` when the
 * sequences have the same shape. Result hashes are not considered here.
 */
function compareToolCallShape(left: RuntimeParityToolCall[], right: RuntimeParityToolCall[]) {
  if (left.length !== right.length) {
    return `tool call count differs (${left.length} vs ${right.length})`;
  }
  let position = 0;
  while (position < left.length) {
    const ours = left[position];
    const theirs = right[position];
    const ordinal = position + 1;
    if (!ours || !theirs) {
      return `tool call row ${ordinal} missing`;
    }
    const sameCall = ours.tool === theirs.tool && ours.argsHash === theirs.argsHash;
    if (!sameCall) {
      return `tool call ${ordinal} differs (${ours.tool}/${ours.argsHash} vs ${theirs.tool}/${theirs.argsHash})`;
    }
    position += 1;
  }
  return undefined;
}
/**
 * Compares tool call results pairwise by result hash and error class (a missing error
 * class is treated as an empty string).
 *
 * Only positions present in both sequences are compared.
 *
 * @returns A description of the first differing result, or `undefined` when none differ.
 */
function compareToolResultShape(left: RuntimeParityToolCall[], right: RuntimeParityToolCall[]) {
  const comparable = Math.min(left.length, right.length);
  let position = 0;
  while (position < comparable) {
    const ours = left[position];
    const theirs = right[position];
    if (ours && theirs) {
      const sameResult =
        ours.resultHash === theirs.resultHash &&
        (ours.errorClass ?? "") === (theirs.errorClass ?? "");
      if (!sameResult) {
        return `tool result ${position + 1} differs (${ours.tool})`;
      }
    }
    position += 1;
  }
  return undefined;
}
/**
 * Finds the first transcript line at which two transcripts differ.
 *
 * Each transcript is trimmed and split into lines; a line missing on one side is compared
 * as an empty string.
 *
 * @returns The 1-based line number of the first difference, or `undefined` when the
 * trimmed transcripts are line-for-line identical.
 */
function firstDriftTurn(leftTranscript: string, rightTranscript: string): number | undefined {
  const toLines = (transcript: string): string[] => {
    const trimmed = transcript.trim();
    return trimmed.length ? trimmed.split(/\r?\n/u) : [];
  };
  const ours = toLines(leftTranscript);
  const theirs = toLines(rightTranscript);
  const longest = Math.max(ours.length, theirs.length);
  let row = 0;
  while (row < longest) {
    if ((ours[row] ?? "") !== (theirs[row] ?? "")) {
      return row + 1;
    }
    row += 1;
  }
  return undefined;
}
/**
 * Enriches a runtime parity cell with everything needed for harness comparison.
 *
 * Adds the variant, prompt size statistics, three fingerprints of the prompt/tool surface,
 * and the token usage to compare on.
 *
 * @param params.variant - The harness variant that produced the cell.
 * @param params.cell - The runtime cell, optionally carrying a system prompt report.
 * @param params.tokenUsageSource - "live-usage" keeps the cell's own usage; any other
 * value replaces it with a character-based estimate.
 * @returns The cell with all comparison fields populated. A missing report yields zeroed
 * statistics and fingerprints of empty content.
 */
export function buildHarnessParityCell(params: {
  variant: HarnessVariant;
  cell: HarnessRuntimeParityCell;
  tokenUsageSource: HarnessParityCell["tokenUsageSource"];
}): HarnessParityCell {
  const report = params.cell.systemPromptReport;
  const promptStats = buildPromptStats(report);
  // Same guard as buildPromptStats: a malformed, non-array `entries` value is treated as
  // "no tools" rather than throwing on `.map`.
  const rawToolEntries = report?.tools?.entries;
  const toolEntries = Array.isArray(rawToolEntries) ? rawToolEntries : [];
  const tokenUsage =
    params.tokenUsageSource === "live-usage"
      ? params.cell.usage
      : estimateUsage(params.cell, promptStats);
  return {
    ...params.cell,
    variant: params.variant,
    ...(report ? { systemPromptReport: report } : {}),
    promptStats,
    // Skills are part of the prompt surface, so they share the system prompt fingerprint.
    systemPromptHash: stableHash({
      systemPrompt: report?.systemPrompt ?? null,
      skills: report?.skills ?? null,
    }),
    toolDescriptionHash: stableHash(
      toolEntries.map((entry) => {
        return {
          name: entry.name,
          summary: entry.summary,
          summaryHash: entry.summaryHash,
          summaryChars: entry.summaryChars,
        };
      }),
    ),
    toolSchemaHash: stableHash({
      listChars: report?.tools?.listChars,
      schemaChars: report?.tools?.schemaChars,
      entries: toolEntries.map((entry) => {
        return {
          name: entry.name,
          schema: entry.schema,
          schemaHash: entry.schemaHash,
          schemaChars: entry.schemaChars,
          propertiesCount: entry.propertiesCount,
        };
      }),
    }),
    tokenUsage,
    tokenUsageSource: params.tokenUsageSource,
  };
}
/**
 * Compares two harness cells for one scenario and classifies the first kind of drift found.
 *
 * Checks run in a fixed order and the first hit wins:
 * 1. "failure-mode" - either side has a transport or runtime error class;
 * 2. "system-prompt", "tool-description", "tool-schema" - surface fingerprints differ;
 * 3. "tool-call-shape", "tool-result-shape" - tool calls or their results differ;
 * 4. "structural" - message record counts differ, or only one side has final text;
 * 5. "text-only" - final texts differ after whitespace normalization;
 * 6. "none".
 *
 * Steps 3 and 4 are skipped when `comparisonMode` is "codex-native-workspace" or
 * "outcome-only". Every result carries the prompt deltas (right minus left) and the token
 * delta in percent; results other than "none" also carry details and the first differing
 * transcript line.
 */
export function buildHarnessParityResult(params: {
  scenarioId: string;
  left: HarnessParityCell;
  right: HarnessParityCell;
  comparisonMode?: RuntimeParityComparisonMode;
}): HarnessParityResult {
  const { scenarioId, left, right, comparisonMode } = params;
  const leftStats = left.promptStats;
  const rightStats = right.promptStats;
  const promptDelta = {
    systemPromptChars: rightStats.systemPromptChars - leftStats.systemPromptChars,
    projectContextChars: rightStats.projectContextChars - leftStats.projectContextChars,
    skillPromptChars: rightStats.skillPromptChars - leftStats.skillPromptChars,
    toolSummaryChars: rightStats.toolSummaryChars - leftStats.toolSummaryChars,
    toolSchemaChars: rightStats.toolSchemaChars - leftStats.toolSchemaChars,
    toolCount: rightStats.toolCount - leftStats.toolCount,
  };

  const leftTokens = left.tokenUsage.totalTokens;
  const rightTokens = right.tokenUsage.totalTokens;
  let tokenDeltaPercent: number;
  if (leftTokens === 0) {
    // No baseline to divide by: report "no change" or a flat 100%.
    tokenDeltaPercent = rightTokens === 0 ? 0 : 100;
  } else {
    tokenDeltaPercent = ((rightTokens - leftTokens) / leftTokens) * 100;
  }

  // Shared shape of every result that reports drift.
  const drifted = (drift: HarnessParityDrift, driftDetails: string): HarnessParityResult => ({
    scenarioId,
    left,
    right,
    drift,
    driftDetails,
    promptDelta,
    tokenDeltaPercent,
    firstDriftTurn: firstDriftTurn(left.transcriptBytes, right.transcriptBytes),
  });

  if (left.transportErrorClass || right.transportErrorClass) {
    return drifted("failure-mode", "at least one harness variant hit a transport failure");
  }
  if (left.runtimeErrorClass || right.runtimeErrorClass) {
    return drifted("failure-mode", "at least one harness variant hit a runtime failure");
  }
  if (left.systemPromptHash !== right.systemPromptHash) {
    return drifted("system-prompt", "system prompt report differs");
  }
  if (left.toolDescriptionHash !== right.toolDescriptionHash) {
    return drifted("tool-description", "tool description summary shape differs");
  }
  if (left.toolSchemaHash !== right.toolSchemaHash) {
    return drifted("tool-schema", "tool schema shape differs");
  }

  // These modes compare outcomes only, so tool call and transcript shape are not checked.
  const outcomeOnly =
    comparisonMode === "codex-native-workspace" || comparisonMode === "outcome-only";
  if (!outcomeOnly) {
    const toolCallDrift = compareToolCallShape(left.toolCalls, right.toolCalls);
    if (toolCallDrift) {
      return drifted("tool-call-shape", toolCallDrift);
    }
    const toolResultDrift = compareToolResultShape(left.toolCalls, right.toolCalls);
    if (toolResultDrift) {
      return drifted("tool-result-shape", toolResultDrift);
    }
  }

  const leftTranscriptRecords = countComparableTranscriptRecords(left.transcriptBytes);
  const rightTranscriptRecords = countComparableTranscriptRecords(right.transcriptBytes);
  const finalTextPresenceDiffers = Boolean(left.finalText) !== Boolean(right.finalText);
  if (
    !outcomeOnly &&
    (leftTranscriptRecords !== rightTranscriptRecords || finalTextPresenceDiffers)
  ) {
    return drifted(
      "structural",
      `transcript/final-text structure differs (${leftTranscriptRecords} message records vs ${rightTranscriptRecords} message records)`,
    );
  }

  if (normalizeTextForParity(left.finalText) !== normalizeTextForParity(right.finalText)) {
    return drifted("text-only", "final text differs after whitespace normalization");
  }

  return {
    scenarioId,
    left,
    right,
    drift: "none",
    promptDelta,
    tokenDeltaPercent,
  };
}
/**
 * Formats a percentage with one decimal place and an explicit "+" for positive values.
 * Magnitudes below 0.05 are shown as "0.0%" so tiny negative values never print as "-0.0%".
 */
function formatPercent(value: number) {
  let shown = value;
  if (Math.abs(value) < 0.05) {
    shown = 0;
  }
  const sign = shown > 0 ? "+" : "";
  return sign + shown.toFixed(1) + "%";
}
/**
 * Renders a harness parity report as Markdown.
 *
 * The output has a title naming both variants, a short metadata list, one table row per
 * scenario result, and a "Gate Failures" section when the report lists failures. The
 * "Prompt chars delta" column shows the system prompt character delta. The returned text
 * always ends with exactly one newline.
 */
export function renderHarnessParityMarkdownReport(report: HarnessParityReport): string {
  const verdict = report.pass ? "pass" : "fail";
  const header = [
    `# OpenClaw Harness Parity - ${report.left.label} vs ${report.right.label}`,
    "",
    `- Generated at: ${report.generatedAt}`,
    `- Provider mode: ${report.providerMode}`,
    `- Verdict: ${verdict}`,
    "",
    "| Scenario | Drift | First drift turn | Token delta | Prompt chars delta | Tool count delta | Details |",
    "| --- | --- | ---: | ---: | ---: | ---: | --- |",
  ];
  const rows = report.results.map((result) => {
    const cells = [
      result.scenarioId,
      result.drift,
      result.firstDriftTurn ?? "",
      formatPercent(result.tokenDeltaPercent),
      result.promptDelta.systemPromptChars,
      result.promptDelta.toolCount,
      result.driftDetails ?? "",
    ];
    return `| ${cells.join(" | ")} |`;
  });
  const failureSection =
    report.failures.length > 0
      ? ["", "## Gate Failures", "", ...report.failures.map((failure) => `- ${failure}`)]
      : [];
  const document = [...header, ...rows, ...failureSection].join("\n");
  return `${document.trimEnd()}\n`;
}

View File

@@ -144,4 +144,76 @@ describe("buildSystemPromptReport", () => {
expect(report.systemPrompt.projectContextChars).toBe(0);
expect(report.systemPrompt.nonProjectContextChars).toBe("custom override".length);
});
// The report must carry SHA-256 style hex digests (64 lowercase hex characters) for the
// system prompt, the skills prompt, and each tool's summary and schema.
it("emits content hashes for prompt and tool parity checks", () => {
const file = makeBootstrapFile({ path: "/tmp/workspace/AGENTS.md" });
const report = buildSystemPromptReport({
source: "run",
generatedAt: 0,
bootstrapMaxChars: 20_000,
systemPrompt: "system",
bootstrapFiles: [file],
injectedFiles: [],
skillsPrompt: "<skill><name>docs</name></skill>",
tools: [
{
name: "read",
description: "Read files",
parameters: {
type: "object",
properties: { path: { type: "string" } },
},
},
] as never,
});
// "systen" has the same length as "system": a character count cannot tell the two
// prompts apart, so the hash has to.
const sameLengthChangedPrompt = buildSystemPromptReport({
source: "run",
generatedAt: 0,
bootstrapMaxChars: 20_000,
systemPrompt: "systen",
bootstrapFiles: [file],
injectedFiles: [],
skillsPrompt: "<skill><name>docs</name></skill>",
tools: [],
});
expect(report.systemPrompt.hash).toMatch(/^[a-f0-9]{64}$/u);
expect(report.skills.hash).toMatch(/^[a-f0-9]{64}$/u);
expect(report.tools.entries[0]?.summaryHash).toMatch(/^[a-f0-9]{64}$/u);
expect(report.tools.entries[0]?.schemaHash).toMatch(/^[a-f0-9]{64}$/u);
expect(sameLengthChangedPrompt.systemPrompt.hash).not.toBe(report.systemPrompt.hash);
});
// A tool schema that references itself cannot be serialized with JSON.stringify. Building
// the report must still succeed: schemaChars is reported as 0 and a schema hash is present.
it("keeps reporting when a tool schema cannot be stringified", () => {
const file = makeBootstrapFile({ path: "/tmp/workspace/AGENTS.md" });
const circularSchema: Record<string, unknown> = {
type: "object",
properties: { count: { type: "integer" } },
};
// Introduce the cycle: the schema contains itself.
circularSchema.self = circularSchema;
const report = buildSystemPromptReport({
source: "run",
generatedAt: 0,
bootstrapMaxChars: 20_000,
systemPrompt: "system",
bootstrapFiles: [file],
injectedFiles: [],
skillsPrompt: "",
tools: [
{
name: "broken",
description: "Broken schema",
parameters: circularSchema,
},
] as never,
});
// propertiesCount is still derived from the single entry under `properties`.
expect(report.tools.entries[0]).toMatchObject({
name: "broken",
schemaChars: 0,
propertiesCount: 1,
});
expect(report.tools.entries[0]?.schemaHash).toMatch(/^[a-f0-9]{64}$/u);
});
});

View File

@@ -1,3 +1,4 @@
import { createHash } from "node:crypto";
import type { AgentTool } from "@earendil-works/pi-agent-core";
import type { SessionSystemPromptReport } from "../config/sessions/types.js";
import { buildBootstrapInjectionStats } from "./bootstrap-budget.js";
@@ -9,9 +10,47 @@ type ToolReportEntry = SessionSystemPromptReport["tools"]["entries"][number];
const toolReportEntryCache = new WeakMap<AgentTool, ToolReportEntry>();
const toolSchemaStatsCache = new WeakMap<
object,
Pick<ToolReportEntry, "propertiesCount" | "schemaChars">
Pick<ToolReportEntry, "propertiesCount" | "schemaChars" | "schemaHash">
>();
/** Returns the lowercase hex SHA-256 digest of `value`. */
function sha256(value: string): string {
  const hasher = createHash("sha256");
  hasher.update(value);
  return hasher.digest("hex");
}
/**
 * Produces a canonical, JSON-serializable copy of `value` for hashing.
 *
 * Object keys are sorted recursively, a bigint becomes its decimal string followed by
 * "n", and an object that contains itself is replaced by "[Circular]" where the cycle
 * closes. All other primitives are returned unchanged.
 *
 * @param value - Any value to normalize.
 * @param seen - Objects on the current traversal path; callers normally omit it.
 */
function normalizeForStableHash(value: unknown, seen = new WeakSet<object>()): unknown {
  if (typeof value === "bigint") {
    return `${value.toString()}n`;
  }
  if (!value || typeof value !== "object") {
    return value;
  }
  if (seen.has(value)) {
    return "[Circular]";
  }
  seen.add(value);
  let result: unknown;
  if (Array.isArray(value)) {
    result = value.map((item) => normalizeForStableHash(item, seen));
  } else {
    const source = value as Record<string, unknown>;
    const orderedKeys = Object.keys(source).toSorted((left, right) => left.localeCompare(right));
    const pairs: Array<[string, unknown]> = [];
    for (const key of orderedKeys) {
      pairs.push([key, normalizeForStableHash(source[key], seen)]);
    }
    result = Object.fromEntries(pairs);
  }
  // Leaving the object: only ancestors on the current path count as cycles.
  seen.delete(value);
  return result;
}
/**
 * Hashes `value` independently of object key order.
 *
 * The value is normalized and serialized to JSON (a value that serializes to nothing is
 * hashed as "null"). If normalization or serialization throws, the hash of the fixed
 * marker "[unserializable]" is returned so report generation never fails on a bad schema.
 */
function stableJsonHash(value: unknown): string {
  let serialized: string;
  try {
    serialized = JSON.stringify(normalizeForStableHash(value)) ?? "null";
  } catch {
    return sha256("[unserializable]");
  }
  return sha256(serialized);
}
function extractBetween(input: string, startMarker: string, endMarker: string): string {
const start = input.indexOf(startMarker);
if (start === -1) {
@@ -39,9 +78,9 @@ function parseSkillBlocks(skillsPrompt: string): Array<{ name: string; blockChar
function buildToolSchemaStats(
parameters: AgentTool["parameters"],
): Pick<ToolReportEntry, "propertiesCount" | "schemaChars"> {
): Pick<ToolReportEntry, "propertiesCount" | "schemaChars" | "schemaHash"> {
if (!parameters || typeof parameters !== "object") {
return { schemaChars: 0, propertiesCount: null };
return { schemaChars: 0, schemaHash: stableJsonHash(null), propertiesCount: null };
}
const cached = toolSchemaStatsCache.get(parameters);
if (cached) {
@@ -55,6 +94,7 @@ function buildToolSchemaStats(
return 0;
}
})(),
schemaHash: stableJsonHash(parameters),
propertiesCount: (() => {
const schema = parameters as Record<string, unknown>;
const props = typeof schema.properties === "object" ? schema.properties : null;
@@ -78,7 +118,7 @@ function buildToolsEntries(tools: AgentTool[]): SessionSystemPromptReport["tools
const summary = tool.description?.trim() || tool.label?.trim() || "";
const summaryChars = summary.length;
const schemaStats = buildToolSchemaStats(tool.parameters);
const entry = { name, summaryChars, ...schemaStats };
const entry = { name, summaryChars, summaryHash: sha256(summary), ...schemaStats };
toolReportEntryCache.set(tool, entry);
return entry;
});
@@ -129,6 +169,7 @@ export function buildSystemPromptReport(params: {
chars: systemPromptChars,
projectContextChars,
nonProjectContextChars: Math.max(0, systemPromptChars - projectContextChars),
hash: sha256(params.systemPrompt),
},
...(params.currentTurn ? { currentTurn: params.currentTurn } : {}),
injectedWorkspaceFiles: buildBootstrapInjectionStats({
@@ -137,6 +178,7 @@ export function buildSystemPromptReport(params: {
}),
skills: {
promptChars: params.skillsPrompt.length,
hash: sha256(params.skillsPrompt),
entries: skillsEntries,
},
tools: {

View File

@@ -644,6 +644,7 @@ export type SessionSystemPromptReport = {
chars: number;
projectContextChars: number;
nonProjectContextChars: number;
hash?: string;
};
currentTurn?: {
kind?: "user_request" | "room_event";
@@ -660,6 +661,7 @@ export type SessionSystemPromptReport = {
}>;
skills: {
promptChars: number;
hash?: string;
entries: Array<{ name: string; blockChars: number }>;
};
tools: {
@@ -668,7 +670,9 @@ export type SessionSystemPromptReport = {
entries: Array<{
name: string;
summaryChars: number;
summaryHash?: string;
schemaChars: number;
schemaHash?: string;
propertiesCount?: number | null;
}>;
};