test(qa-lab): add runtime confidence reports

2026-06-06 05:51:15 +08:00 · 2026-05-25 20:31:03 +01:00
parent cda7c30150
commit f6a49a4e8a
12 changed files with 3336 additions and 6 deletions
--- a/extensions/codex/src/app-server/run-attempt.test.ts
+++ b/extensions/codex/src/app-server/run-attempt.test.ts
@@ -2844,11 +2844,17 @@ describe("runCodexAppServerAttempt", () => {
    expect(report?.provider).toBe("codex");
    expect(report?.model).toBe("gpt-5.4-codex");
    expect(report?.systemPrompt.chars).toBeGreaterThan(0);
+    expect(report?.systemPrompt.hash).toMatch(/^[a-f0-9]{64}$/u);
+    expect(report?.skills.hash).toMatch(/^[a-f0-9]{64}$/u);

    const message = report?.tools.entries.find((tool) => tool.name === "message");
    const webSearch = report?.tools.entries.find((tool) => tool.name === "web_search");
    expect(message?.schemaChars).toBeGreaterThan(0);
+    expect(message?.summaryHash).toMatch(/^[a-f0-9]{64}$/u);
+    expect(message?.schemaHash).toMatch(/^[a-f0-9]{64}$/u);
    expect(webSearch?.schemaChars).toBe(0);
+    expect(webSearch?.summaryHash).toMatch(/^[a-f0-9]{64}$/u);
+    expect(webSearch?.schemaHash).toMatch(/^[a-f0-9]{64}$/u);
    expect(report?.tools.schemaChars).toBe(message?.schemaChars);
  });

@@ -6574,7 +6580,8 @@ describe("runCodexAppServerAttempt", () => {
      input?: Array<{ text?: string }>;
    };
    expect(turnStartParams.input?.[0]?.text).toBe(exactCommand);
-    expect(result.systemPromptReport?.skills).toEqual({ promptChars: 0, entries: [] });
+    expect(result.systemPromptReport?.skills).toMatchObject({ promptChars: 0, entries: [] });
+    expect(result.systemPromptReport?.skills.hash).toMatch(/^[a-f0-9]{64}$/u);
  });

  it("fires llm_input, llm_output, and agent_end hooks for codex turns", async () => {
--- a/extensions/codex/src/app-server/run-attempt.ts
+++ b/extensions/codex/src/app-server/run-attempt.ts
@@ -5095,6 +5095,7 @@ function buildCodexSystemPromptReport(params: {
      chars: params.developerInstructions.length,
      projectContextChars: 0,
      nonProjectContextChars: params.developerInstructions.length,
+      hash: sha256Text(params.developerInstructions),
    },
    injectedWorkspaceFiles: buildCodexBootstrapInjectionStats({
      bootstrapFiles: params.workspaceBootstrapContext.bootstrapFiles,
@@ -5106,6 +5107,7 @@ function buildCodexSystemPromptReport(params: {
    }),
    skills: {
      promptChars: skillsPrompt.length,
+      hash: sha256Text(skillsPrompt),
      entries: buildCodexSkillReportEntries(skillsPrompt),
    },
    tools: {
@@ -5137,20 +5139,23 @@ function buildCodexToolReportEntry(tool: CodexDynamicToolSpec): CodexToolReportE
    return {
      name: tool.name,
      summaryChars: summary.length,
+      summaryHash: sha256Text(summary),
      schemaChars: 0,
+      schemaHash: stableJsonHash(null),
      propertiesCount: null,
    };
  }
  return {
    name: tool.name,
    summaryChars: summary.length,
+    summaryHash: sha256Text(summary),
    ...buildCodexToolSchemaStats(tool.inputSchema),
  };
 }

 function buildCodexToolSchemaStats(
  schema: JsonValue,
-): Pick<CodexToolReportEntry, "schemaChars" | "propertiesCount"> {
+): Pick<CodexToolReportEntry, "schemaChars" | "schemaHash" | "propertiesCount"> {
  const schemaChars = (() => {
    try {
      return JSON.stringify(schema).length;
@@ -5162,10 +5167,34 @@ function buildCodexToolSchemaStats(
    isJsonObject(schema) && isJsonObject(schema.properties) ? schema.properties : null;
  return {
    schemaChars,
+    schemaHash: stableJsonHash(schema),
    propertiesCount: properties ? Object.keys(properties).length : null,
  };
 }

+function sha256Text(value: string): string {
+  return createHash("sha256").update(value).digest("hex"); // hex-encoded SHA-256 of value
+}
+
+// Recursively copies JSON-like data with every object's keys sorted, so structurally equal
+// values stringify (and hash) identically regardless of key insertion order. Keys use the
+// default code-unit sort: localeCompare depends on the host locale/ICU data and may tie.
+function normalizeForStableHash(value: unknown): unknown {
+  if (Array.isArray(value)) {
+    return value.map((entry) => normalizeForStableHash(entry));
+  }
+  if (value && typeof value === "object") {
+    const record = value as Record<string, unknown>;
+    const keys = Object.keys(record).toSorted();
+    return Object.fromEntries(keys.map((key) => [key, normalizeForStableHash(record[key])]));
+  }
+  return value;
+}
+
+function stableJsonHash(value: JsonValue): string {
+  return sha256Text(JSON.stringify(normalizeForStableHash(value)) ?? "null"); // undefined -> "null"
+}
+
 function buildCodexBootstrapInjectionStats(params: {
  bootstrapFiles: CodexBootstrapFile[];
  injectedFiles: EmbeddedContextFile[];
--- a/extensions/qa-lab/confidence-profiles/codex-100.json
+++ b/extensions/qa-lab/confidence-profiles/codex-100.json
@@ -0,0 +1,168 @@
+{
+  "version": 1,
+  "profile": "codex-100",
+  "lanes": [
+    {
+      "id": "tool-defaults-direct",
+      "title": "Tool-defaults direct runtime parity",
+      "kind": "qa-suite-summary",
+      "artifact": "tool-defaults-direct/qa-suite-summary.json",
+      "required": true,
+      "productImpact": "P2",
+      "qaImpact": "P0",
+      "issue": "https://github.com/openclaw/openclaw/issues/80319",
+      "ownerAction": "Fix product or harness before claiming the tool-defaults gate is trusted.",
+      "labels": ["qa-lab", "runtime-parity", "codex"]
+    },
+    {
+      "id": "openclaw-dynamic-tools-direct",
+      "title": "OpenClaw dynamic integration tools direct runtime parity",
+      "kind": "qa-suite-summary",
+      "artifact": "openclaw-dynamic-tools-direct/qa-suite-summary.json",
+      "required": true,
+      "productImpact": "P1",
+      "qaImpact": "P0",
+      "issue": "https://github.com/openclaw/openclaw/issues/80319",
+      "ownerAction": "Investigate any hard failure as an OpenClaw dynamic integration or QA loading regression.",
+      "labels": ["qa-lab", "runtime-parity", "openclaw-dynamic-tools"]
+    },
+    {
+      "id": "tool-defaults-searchable",
+      "title": "Tool-defaults searchable runtime parity",
+      "kind": "qa-suite-summary",
+      "artifact": "tool-defaults-searchable/qa-suite-summary.json",
+      "required": true,
+      "failureVerdict": "mock-limitation",
+      "skipBackfillLane": "openclaw-dynamic-tools-searchable-live",
+      "productImpact": "P4",
+      "qaImpact": "P2",
+      "issue": "https://github.com/openclaw/openclaw/issues/80319",
+      "ownerAction": "Keep as report-only until searchable/deferred tool modeling has no mock-only ambiguity.",
+      "labels": ["qa-lab", "runtime-parity", "searchable-tools"]
+    },
+    {
+      "id": "first-hour-20-direct",
+      "title": "First-hour 20-turn direct runtime parity",
+      "kind": "qa-suite-summary",
+      "artifact": "first-hour-20-direct/qa-suite-summary.json",
+      "required": true,
+      "skipBackfillLane": "codex-native-live",
+      "productImpact": "P1",
+      "qaImpact": "P0",
+      "ownerAction": "Triage row-by-row; do not file product bugs unless live/native proof reproduces.",
+      "labels": ["qa-lab", "runtime-parity", "first-hour"]
+    },
+    {
+      "id": "mock-token-efficiency",
+      "title": "Mock assistant-message token efficiency estimate",
+      "kind": "token-efficiency-summary",
+      "artifact": "first-hour-20-direct-report/qa-runtime-token-efficiency-summary.json",
+      "required": true,
+      "expectedTokenUsageSource": "mock-estimate",
+      "productImpact": "P4",
+      "qaImpact": "P1",
+      "ownerAction": "Fix labeling before trusting token-efficiency comparisons.",
+      "labels": ["qa-lab", "runtime-parity", "token-efficiency"]
+    },
+    {
+      "id": "fault-injection-mock",
+      "title": "Mock fault-injection runtime parity",
+      "kind": "qa-suite-summary",
+      "artifact": "fault-injection-mock/qa-suite-summary.json",
+      "required": true,
+      "skipBackfillLane": "codex-native-live",
+      "productImpact": "P2",
+      "qaImpact": "P0",
+      "ownerAction": "Treat failures as retry/recovery regressions unless evidence shows fixture drift.",
+      "labels": ["qa-lab", "runtime-parity", "fault-injection"]
+    },
+    {
+      "id": "jsonl-expanded",
+      "title": "Expanded curated JSONL replay",
+      "kind": "jsonl-replay-summary",
+      "artifact": "jsonl-expanded/qa-jsonl-replay-summary.json",
+      "required": true,
+      "productImpact": "P2",
+      "qaImpact": "P0",
+      "ownerAction": "Inspect first drift turn and transcript class before filing any product issue.",
+      "labels": ["qa-lab", "runtime-parity", "jsonl-replay"]
+    },
+    {
+      "id": "confidence-self-test",
+      "title": "Seeded confidence negative controls",
+      "kind": "self-test-summary",
+      "artifact": "confidence-self-test/qa-confidence-self-test-summary.json",
+      "required": true,
+      "productImpact": "P4",
+      "qaImpact": "P0",
+      "ownerAction": "Fix the harness before trusting any green parity result.",
+      "labels": ["qa-lab", "confidence-gate", "negative-controls"]
+    },
+    {
+      "id": "codex-native-live",
+      "title": "Codex-native live workspace capability proof",
+      "kind": "qa-suite-summary",
+      "artifact": "codex-native-live/qa-suite-summary.json",
+      "required": true,
+      "missingVerdict": "environment-blocked",
+      "missingReason": "Live/OAuth runner or OpenAI credentials were unavailable for this proof bundle.",
+      "productImpact": "P1",
+      "qaImpact": "P1",
+      "ownerAction": "Run with live-frontier OAuth before using this lane as product proof.",
+      "labels": ["qa-lab", "runtime-parity", "live-proof"]
+    },
+    {
+      "id": "first-hour-live",
+      "title": "Live first-hour capability proof",
+      "kind": "qa-suite-summary",
+      "artifact": "first-hour-live/qa-suite-summary.json",
+      "required": true,
+      "missingVerdict": "environment-blocked",
+      "missingReason": "Live/OAuth runner or OpenAI credentials were unavailable for this proof bundle.",
+      "productImpact": "P1",
+      "qaImpact": "P1",
+      "ownerAction": "Run with live-frontier OAuth before claiming live first-hour coverage.",
+      "labels": ["qa-lab", "runtime-parity", "live-proof"]
+    },
+    {
+      "id": "openclaw-dynamic-tools-searchable-live",
+      "title": "Live OpenClaw dynamic tools searchable proof",
+      "kind": "qa-suite-summary",
+      "artifact": "openclaw-dynamic-tools-searchable-live/qa-suite-summary.json",
+      "required": true,
+      "missingVerdict": "environment-blocked",
+      "missingReason": "Live/OAuth runner or OpenAI credentials were unavailable for this proof bundle.",
+      "productImpact": "P1",
+      "qaImpact": "P1",
+      "ownerAction": "Run with live-frontier OAuth before claiming production-shaped searchable OpenClaw dynamic tool coverage.",
+      "labels": ["qa-lab", "runtime-parity", "searchable-tools", "live-proof"]
+    },
+    {
+      "id": "live-token-efficiency",
+      "title": "Live assistant-message token efficiency",
+      "kind": "token-efficiency-summary",
+      "artifact": "live-token-efficiency/qa-runtime-token-efficiency-summary.json",
+      "required": true,
+      "expectedTokenUsageSource": "live-usage",
+      "missingVerdict": "environment-blocked",
+      "missingReason": "Live/OAuth runner or OpenAI credentials were unavailable for this proof bundle.",
+      "productImpact": "P3",
+      "qaImpact": "P1",
+      "ownerAction": "Run a live-frontier runtime parity summary and regenerate token efficiency.",
+      "labels": ["qa-lab", "runtime-parity", "token-efficiency"]
+    },
+    {
+      "id": "soak-100",
+      "title": "Optional 100-turn soak",
+      "kind": "qa-suite-summary",
+      "artifact": "soak-100/qa-suite-summary.json",
+      "required": true,
+      "missingVerdict": "environment-blocked",
+      "missingReason": "Scheduled/Testbox soak runner did not upload artifacts for this proof bundle.",
+      "productImpact": "P3",
+      "qaImpact": "P2",
+      "ownerAction": "Run remotely with a long timeout or record the runner budget blocker.",
+      "labels": ["qa-lab", "runtime-parity", "soak"]
+    }
+  ]
+}
--- a/extensions/qa-lab/src/cli.runtime.ts
+++ b/extensions/qa-lab/src/cli.runtime.ts
@@ -13,6 +13,12 @@ import {
 import { resolveQaParityPackScenarioIds } from "./agentic-parity.js";
 import { runQaCharacterEval, type QaCharacterModelOptions } from "./character-eval.js";
 import { resolveRepoRelativeOutputDir } from "./cli-paths.js";
+import {
+  buildQaConfidenceReport,
+  readQaConfidenceManifestFile,
+  renderQaConfidenceMarkdownReport,
+  writeQaConfidenceSelfTestArtifacts,
+} from "./confidence-report.js";
 import {
  buildQaCoverageInventory,
  findQaScenarioMatches,
@@ -786,6 +792,60 @@ export async function runQaParityReportCommand(opts: {
  }
 }

+// Classifies the artifacts listed in a confidence manifest, writes the markdown report and JSON
+// summary under the output directory, and sets process.exitCode = 1 on a failing verdict.
+export async function runQaConfidenceReportCommand(opts: {
+  repoRoot?: string;
+  manifest: string;
+  artifactRoot?: string;
+  outputDir?: string;
+  strictZeroUnknowns?: boolean;
+  strictGlobalPass?: boolean;
+}) {
+  const root = path.resolve(opts.repoRoot ?? process.cwd());
+  const manifestFile = path.resolve(root, opts.manifest);
+  const artifactsDir = path.resolve(root, opts.artifactRoot ?? ".");
+  const targetDir =
+    resolveRepoRelativeOutputDir(root, opts.outputDir) ??
+    path.join(root, ".artifacts", "qa-e2e", `confidence-${Date.now().toString(36)}`);
+  await fs.mkdir(targetDir, { recursive: true });
+  const summary = await buildQaConfidenceReport({
+    manifest: await readQaConfidenceManifestFile(manifestFile),
+    artifactRoot: artifactsDir,
+    strictZeroUnknowns: opts.strictZeroUnknowns === true,
+    strictGlobalPass: opts.strictGlobalPass === true,
+  });
+  const markdownPath = path.join(targetDir, "qa-confidence-report.md");
+  const jsonPath = path.join(targetDir, "qa-confidence-summary.json");
+  await fs.writeFile(markdownPath, renderQaConfidenceMarkdownReport(summary), "utf8");
+  await fs.writeFile(jsonPath, `${JSON.stringify(summary, null, 2)}\n`, "utf8");
+  process.stdout.write(`QA confidence report: ${markdownPath}\n`);
+  process.stdout.write(`QA confidence summary: ${jsonPath}\n`);
+  process.stdout.write(`QA confidence verdict: ${summary.pass ? "pass" : "fail"}\n`);
+  if (!summary.pass) {
+    process.exitCode = 1;
+  }
+}
+
+/** Writes confidence self-test artifacts, prints their paths, and flags failure via exitCode. */
+export async function runQaConfidenceSelfTestCommand(opts: {
+  repoRoot?: string;
+  outputDir?: string;
+}) {
+  const root = path.resolve(opts.repoRoot ?? process.cwd());
+  const targetDir =
+    resolveRepoRelativeOutputDir(root, opts.outputDir) ??
+    path.join(root, ".artifacts", "qa-e2e", `confidence-self-test-${Date.now().toString(36)}`);
+  const outcome = await writeQaConfidenceSelfTestArtifacts({ outputDir: targetDir });
+  process.stdout.write(`QA confidence self-test report: ${outcome.reportPath}\n`);
+  process.stdout.write(`QA confidence self-test summary: ${outcome.summaryPath}\n`);
+  const verdict = outcome.summary.pass ? "pass" : "fail";
+  process.stdout.write(`QA confidence self-test verdict: ${verdict}\n`);
+  if (verdict === "fail") {
+    process.exitCode = 1;
+  }
+}
+
 export async function runQaCoverageReportCommand(opts: {
  repoRoot?: string;
  output?: string;
--- a/extensions/qa-lab/src/cli.ts
+++ b/extensions/qa-lab/src/cli.ts
@@ -72,6 +72,23 @@ async function runQaParityReport(opts: {
  await runtime.runQaParityReportCommand(opts);
 }

+// Awaits loadQaLabCliRuntime() and forwards opts to its runQaConfidenceReportCommand.
+async function runQaConfidenceReport(opts: {
+  repoRoot?: string;
+  manifest: string;
+  artifactRoot?: string;
+  outputDir?: string;
+  strictZeroUnknowns?: boolean;
+  strictGlobalPass?: boolean;
+}) {
+  await (await loadQaLabCliRuntime()).runQaConfidenceReportCommand(opts);
+}
+
+// Awaits loadQaLabCliRuntime() and forwards opts to its runQaConfidenceSelfTestCommand.
+async function runQaConfidenceSelfTest(opts: { repoRoot?: string; outputDir?: string }) {
+  await (await loadQaLabCliRuntime()).runQaConfidenceSelfTestCommand(opts);
+}
+
 async function runQaCoverageReport(opts: {
  repoRoot?: string;
  output?: string;
@@ -424,6 +441,43 @@ export function registerQaLabCli(program: Command) {
      },
    );

+  qa.command("confidence-report")
+    .description("Classify QA proof artifacts into a zero-unknown confidence report")
+    .requiredOption("--manifest <path>", "Confidence profile manifest JSON")
+    .option("--repo-root <path>", "Repository root to target when running from a neutral cwd")
+    .option("--artifact-root <path>", "Root directory for relative artifact paths", ".")
+    .option("--output-dir <path>", "Artifact directory for the confidence report")
+    .option(
+      "--strict-zero-unknowns",
+      "Fail unless every lane passes or has an explicit non-unknown verdict",
+      false,
+    )
+    .option(
+      "--strict-global-pass",
+      "Fail unless every lane passes with no blocked, missing, unknown, classified-fail, or unbackfilled skipped rows",
+      false,
+    )
+    .action(
+      async (opts: {
+        repoRoot?: string;
+        manifest: string;
+        artifactRoot?: string;
+        outputDir?: string;
+        strictZeroUnknowns?: boolean;
+        strictGlobalPass?: boolean;
+      }) => {
+        await runQaConfidenceReport(opts);
+      },
+    );
+
+  qa.command("confidence-self-test")
+    .description("Write seeded negative-control canaries proving the confidence gate detects drift")
+    .option("--repo-root <path>", "Repository root to target when running from a neutral cwd")
+    .option("--output-dir <path>", "Artifact directory for the confidence self-test")
+    .action(async (opts: { repoRoot?: string; outputDir?: string }) => {
+      await runQaConfidenceSelfTest(opts);
+    });
+
  qa.command("jsonl-replay")
    .description("Replay curated JSONL transcripts through the runtime parity replay harness")
    .option("--repo-root <path>", "Repository root to target when running from a neutral cwd")
--- a/extensions/qa-lab/src/confidence-report.test.ts
+++ b/extensions/qa-lab/src/confidence-report.test.ts
@@ -0,0 +1,881 @@
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+import { afterEach, beforeEach, describe, expect, it } from "vitest";
+import {
+  buildQaConfidenceReport,
+  buildQaConfidenceSelfTestSummary,
+  renderQaConfidenceMarkdownReport,
+  writeQaConfidenceSelfTestArtifacts,
+  type QaConfidenceManifest,
+} from "./confidence-report.js";
+
+describe("qa confidence report", () => {
+  let tempRoot: string;
+
+  beforeEach(async () => {
+    tempRoot = await fs.mkdtemp(path.join(os.tmpdir(), "qa-confidence-"));
+  });
+
+  afterEach(async () => {
+    await fs.rm(tempRoot, { recursive: true, force: true });
+  });
+
+  async function writeJson(relativePath: string, payload: unknown) {
+    const filePath = path.join(tempRoot, relativePath);
+    await fs.mkdir(path.dirname(filePath), { recursive: true });
+    await fs.writeFile(filePath, `${JSON.stringify(payload, null, 2)}\n`, "utf8");
+    return filePath;
+  }
+
+  it("passes strict zero-unknowns when every lane passes or has an allowed blocked verdict", async () => {
+    await writeJson("tool-defaults/qa-suite-summary.json", {
+      counts: { total: 20, passed: 18, skipped: 2, failed: 0 },
+      scenarios: [],
+    });
+    await writeJson("token/qa-runtime-token-efficiency-summary.json", {
+      status: "estimated",
+      pass: true,
+      rows: [{ scenarioId: "one", usageSource: "mock-estimate" }],
+    });
+
+    const manifest: QaConfidenceManifest = {
+      version: 1,
+      profile: "codex-100",
+      lanes: [
+        {
+          id: "tool-defaults-direct",
+          title: "Tool defaults direct",
+          kind: "qa-suite-summary",
+          artifact: "tool-defaults/qa-suite-summary.json",
+          required: true,
+        },
+        {
+          id: "mock-token-efficiency",
+          title: "Mock token efficiency",
+          kind: "token-efficiency-summary",
+          artifact: "token/qa-runtime-token-efficiency-summary.json",
+          required: true,
+          expectedTokenUsageSource: "mock-estimate",
+        },
+        {
+          id: "live-token-efficiency",
+          title: "Live token efficiency",
+          kind: "token-efficiency-summary",
+          artifact: "live/qa-runtime-token-efficiency-summary.json",
+          required: true,
+          missingVerdict: "environment-blocked",
+          missingReason: "OPENAI OAuth credentials are not available in this runner.",
+        },
+      ],
+    };
+
+    const report = await buildQaConfidenceReport({
+      manifest,
+      artifactRoot: tempRoot,
+      strictZeroUnknowns: true,
+      generatedAt: "2026-05-12T00:00:00.000Z",
+    });
+
+    expect(report.pass).toBe(true);
+    expect(report.globalPass).toBe(false);
+    expect(report.counts).toMatchObject({ passed: 2, blocked: 1, unknown: 0, failed: 0 });
+    expect(report.lanes.map((lane) => lane.verdict)).toEqual([
+      "pass",
+      "pass",
+      "environment-blocked",
+    ]);
+    expect(report.lanes[0]?.artifactPath).toBe("tool-defaults/qa-suite-summary.json");
+    expect(report.lanes[0]?.artifactPath).not.toContain(tempRoot);
+    expect(report.lanes[0]?.details).toContain("counts.skipped=2");
+    expect(renderQaConfidenceMarkdownReport(report)).toContain("Zero unknowns: yes");
+    expect(renderQaConfidenceMarkdownReport(report)).toContain("Global pass: no");
+  });
+
+  it("does not let optional lanes block strict gates", async () => {
+    await writeJson("required/qa-suite-summary.json", {
+      counts: { total: 1, passed: 1, skipped: 0, failed: 0 },
+      scenarios: [],
+    });
+
+    const report = await buildQaConfidenceReport({
+      manifest: {
+        version: 1,
+        profile: "codex-100",
+        lanes: [
+          {
+            id: "required",
+            title: "Required",
+            kind: "qa-suite-summary",
+            artifact: "required/qa-suite-summary.json",
+            required: true,
+          },
+          {
+            id: "optional-missing",
+            title: "Optional missing",
+            kind: "qa-suite-summary",
+            artifact: "optional/qa-suite-summary.json",
+            required: false,
+          },
+        ],
+      },
+      artifactRoot: tempRoot,
+      strictZeroUnknowns: true,
+      strictGlobalPass: true,
+      generatedAt: "2026-05-13T00:00:00.000Z",
+    });
+
+    expect(report.pass).toBe(true);
+    expect(report.counts).toMatchObject({ total: 1, passed: 1, unknown: 0 });
+    expect(report.failures).toEqual([]);
+    expect(report.lanes[1]).toMatchObject({ id: "optional-missing", status: "missing" });
+  });
+
+  it("fails strict global pass when any lane is blocked, missing, unknown, or classified failed", async () => {
+    await writeJson("classified/qa-suite-summary.json", {
+      counts: { total: 1, passed: 0, skipped: 0, failed: 1 },
+      scenarios: [{ name: "classified", status: "fail" }],
+    });
+    await writeJson("unknown/qa-suite-summary.json", {
+      counts: { total: 1, passed: 0, skipped: 0, failed: 1 },
+      scenarios: [{ name: "unknown", status: "fail" }],
+    });
+
+    const report = await buildQaConfidenceReport({
+      manifest: {
+        version: 1,
+        profile: "codex-100",
+        lanes: [
+          {
+            id: "blocked-live",
+            title: "Blocked live",
+            kind: "qa-suite-summary",
+            artifact: "live/qa-suite-summary.json",
+            required: true,
+            missingVerdict: "environment-blocked",
+            missingReason: "OPENAI_API_KEY missing.",
+          },
+          {
+            id: "missing-soak",
+            title: "Missing soak",
+            kind: "qa-suite-summary",
+            artifact: "soak/qa-suite-summary.json",
+            required: true,
+          },
+          {
+            id: "classified-fixture",
+            title: "Classified fixture",
+            kind: "qa-suite-summary",
+            artifact: "classified/qa-suite-summary.json",
+            required: true,
+            failureVerdict: "fixture-bug",
+          },
+          {
+            id: "unknown-failure",
+            title: "Unknown failure",
+            kind: "qa-suite-summary",
+            artifact: "unknown/qa-suite-summary.json",
+            required: true,
+          },
+        ],
+      },
+      artifactRoot: tempRoot,
+      strictZeroUnknowns: true,
+      strictGlobalPass: true,
+      generatedAt: "2026-05-12T00:00:00.000Z",
+    });
+
+    expect(report.pass).toBe(false);
+    expect(report.zeroUnknowns).toBe(false);
+    expect(report.globalPass).toBe(false);
+    expect(report.counts).toMatchObject({
+      blocked: 1,
+      missing: 1,
+      failed: 1,
+      unknown: 2,
+    });
+    expect(report.failures).toEqual([
+      "blocked-live is blocked: OPENAI_API_KEY missing.",
+      "missing-soak is missing: artifact missing and no missingVerdict was configured",
+      "classified-fixture is classified fixture-bug: qa-suite-summary counts.failed=1 counts.total=1 counts.skipped=0",
+      "unknown-failure is unclassified: qa-suite-summary counts.failed=1 counts.total=1 counts.skipped=0",
+    ]);
+  });
+
+  it("fails strict global pass for skipped suite rows until a backfill lane passes", async () => {
+    await writeJson("report-only/qa-suite-summary.json", {
+      counts: { total: 3, passed: 2, skipped: 1, failed: 0 },
+      scenarios: [],
+    });
+
+    const report = await buildQaConfidenceReport({
+      manifest: {
+        version: 1,
+        profile: "codex-100",
+        lanes: [
+          {
+            id: "report-only",
+            title: "Report-only",
+            kind: "qa-suite-summary",
+            artifact: "report-only/qa-suite-summary.json",
+            required: true,
+          },
+        ],
+      },
+      artifactRoot: tempRoot,
+      strictZeroUnknowns: true,
+      strictGlobalPass: true,
+      generatedAt: "2026-05-12T00:00:00.000Z",
+    });
+
+    expect(report.zeroUnknowns).toBe(true);
+    expect(report.globalPass).toBe(false);
+    expect(report.failures).toEqual([
+      "report-only has 1 skipped row(s) with no passing backfill lane",
+    ]);
+  });
+
+  it("infers skipped suite rows from totals and scenario status", async () => {
+    for (const [artifact, expectedDetail] of [
+      [{ counts: { total: 3, passed: 2, failed: 0 }, scenarios: [] }, "counts.skipped=1"],
+      [
+        {
+          counts: { total: 2, passed: 2, failed: 0 },
+          scenarios: [
+            { name: "passing", status: "pass" },
+            { name: "skipped", status: "skip" },
+          ],
+        },
+        "counts.skipped=1",
+      ],
+    ] as const) {
+      await writeJson("report-only/qa-suite-summary.json", artifact);
+
+      const report = await buildQaConfidenceReport({
+        manifest: {
+          version: 1,
+          profile: "codex-100",
+          lanes: [
+            {
+              id: "report-only",
+              title: "Report-only",
+              kind: "qa-suite-summary",
+              artifact: "report-only/qa-suite-summary.json",
+              required: true,
+            },
+          ],
+        },
+        artifactRoot: tempRoot,
+        strictZeroUnknowns: true,
+        strictGlobalPass: true,
+        generatedAt: "2026-05-12T00:00:00.000Z",
+      });
+
+      expect(report.globalPass).toBe(false);
+      expect(report.failures).toEqual([
+        "report-only has 1 skipped row(s) with no passing backfill lane",
+      ]);
+      expect(report.lanes[0]).toMatchObject({ skippedCount: 1 });
+      expect(report.lanes[0]?.details).toContain(expectedDetail);
+    }
+  });
+
+  it("rejects skipped token reports when a live usage source is required", async () => {
+    await writeJson("live-token/qa-runtime-token-efficiency-summary.json", {
+      status: "skipped",
+      pass: true,
+      rows: [],
+    });
+
+    const report = await buildQaConfidenceReport({
+      manifest: {
+        version: 1,
+        profile: "codex-100",
+        lanes: [
+          {
+            id: "live-token-efficiency",
+            title: "Live token efficiency",
+            kind: "token-efficiency-summary",
+            artifact: "live-token/qa-runtime-token-efficiency-summary.json",
+            required: true,
+            expectedTokenUsageSource: "live-usage",
+          },
+        ],
+      },
+      artifactRoot: tempRoot,
+      strictZeroUnknowns: true,
+      generatedAt: "2026-05-12T00:00:00.000Z",
+    });
+
+    expect(report.pass).toBe(false);
+    expect(report.lanes[0]).toMatchObject({
+      status: "unknown",
+      details: "token summary has no live-usage rows",
+    });
+  });
+
+  it("preserves partial zero-unknown mode for classified failing lanes", async () => {
+    await writeJson("classified/qa-suite-summary.json", {
+      counts: { total: 1, passed: 0, skipped: 0, failed: 1 },
+      scenarios: [{ name: "classified", status: "fail" }],
+    });
+
+    const report = await buildQaConfidenceReport({
+      manifest: {
+        version: 1,
+        profile: "codex-100",
+        lanes: [
+          {
+            id: "classified-fixture",
+            title: "Classified fixture",
+            kind: "qa-suite-summary",
+            artifact: "classified/qa-suite-summary.json",
+            required: true,
+            failureVerdict: "fixture-bug",
+          },
+        ],
+      },
+      artifactRoot: tempRoot,
+      strictZeroUnknowns: true,
+      generatedAt: "2026-05-12T00:00:00.000Z",
+    });
+
+    expect(report.pass).toBe(true);
+    expect(report.zeroUnknowns).toBe(true);
+    expect(report.globalPass).toBe(false);
+    expect(report.counts.failed).toBe(1);
+  });
+
+  it("passes strict global pass when skipped suite rows are backfilled by a passing lane", async () => {
+    await writeJson("report-only/qa-suite-summary.json", {
+      counts: { total: 3, passed: 2, skipped: 1, failed: 0 },
+      scenarios: [],
+    });
+    await writeJson("live-backfill/qa-suite-summary.json", {
+      counts: { total: 1, passed: 1, skipped: 0, failed: 0 },
+      scenarios: [],
+    });
+
+    const report = await buildQaConfidenceReport({
+      manifest: {
+        version: 1,
+        profile: "codex-100",
+        lanes: [
+          {
+            id: "report-only",
+            title: "Report-only",
+            kind: "qa-suite-summary",
+            artifact: "report-only/qa-suite-summary.json",
+            required: true,
+            skipBackfillLane: "live-backfill",
+          },
+          {
+            id: "live-backfill",
+            title: "Live backfill",
+            kind: "qa-suite-summary",
+            artifact: "live-backfill/qa-suite-summary.json",
+            required: true,
+          },
+        ],
+      },
+      artifactRoot: tempRoot,
+      strictZeroUnknowns: true,
+      strictGlobalPass: true,
+      generatedAt: "2026-05-12T00:00:00.000Z",
+    });
+
+    expect(report.pass).toBe(true);
+    expect(report.zeroUnknowns).toBe(true);
+    expect(report.globalPass).toBe(true);
+    expect(report.lanes[0]).toMatchObject({
+      skippedCount: 1,
+      skipBackfillLane: "live-backfill",
+      skipBackfilled: true,
+    });
+  });
+
+  // A suite whose counts all pass but whose summary carries an environment-owned gateway
+  // sentinel is expected to land as a "blocked" lane with the sentinel's verdict: the strict
+  // zero-unknowns gate still passes, while globalPass is false.
+  it("classifies environment-blocking gateway sentinels without turning them into unknowns", async () => {
+    await writeJson("live/qa-suite-summary.json", {
+      counts: { total: 1, passed: 1, skipped: 0, failed: 0 },
+      gatewayLogSentinels: [
+        {
+          kind: "live-quota-or-subscription",
+          verdict: "environment-blocked",
+          owner: "environment",
+          productImpact: "P4",
+          qaImpact: "P0",
+          line: 12,
+          text: "OpenAI quota exceeded",
+        },
+      ],
+      scenarios: [],
+    });
+
+    const report = await buildQaConfidenceReport({
+      manifest: {
+        version: 1,
+        profile: "codex-100",
+        lanes: [
+          {
+            id: "first-hour-live",
+            title: "First hour live",
+            kind: "qa-suite-summary",
+            artifact: "live/qa-suite-summary.json",
+            required: true,
+          },
+        ],
+      },
+      artifactRoot: tempRoot,
+      strictZeroUnknowns: true,
+      generatedAt: "2026-05-13T00:00:00.000Z",
+    });
+
+    // Blocked is a classified outcome, so it counts under `blocked`, not `unknown`.
+    expect(report.pass).toBe(true);
+    expect(report.globalPass).toBe(false);
+    expect(report.counts).toMatchObject({ blocked: 1, unknown: 0 });
+    expect(report.lanes[0]).toMatchObject({
+      status: "blocked",
+      verdict: "environment-blocked",
+    });
+  });
+
+  // Same environment sentinel as the blocked case, but the suite counts also report one failed
+  // scenario. The lane must not be reported as blocked: the test expects an unknown lane whose
+  // details mention that the suite also reports failures.
+  it("does not let environment sentinels hide separate suite failures", async () => {
+    await writeJson("live/qa-suite-summary.json", {
+      counts: { total: 2, passed: 1, skipped: 0, failed: 1 },
+      gatewayLogSentinels: [
+        {
+          kind: "live-quota-or-subscription",
+          verdict: "environment-blocked",
+          owner: "environment",
+          line: 12,
+          text: "OpenAI quota exceeded",
+        },
+      ],
+      scenarios: [
+        { name: "quota", status: "pass" },
+        { name: "unrelated-drift", status: "fail" },
+      ],
+    });
+
+    const report = await buildQaConfidenceReport({
+      manifest: {
+        version: 1,
+        profile: "codex-100",
+        lanes: [
+          {
+            id: "first-hour-live",
+            title: "First hour live",
+            kind: "qa-suite-summary",
+            artifact: "live/qa-suite-summary.json",
+            required: true,
+            missingVerdict: "environment-blocked",
+          },
+        ],
+      },
+      artifactRoot: tempRoot,
+      strictZeroUnknowns: true,
+      generatedAt: "2026-05-13T00:00:00.000Z",
+    });
+
+    expect(report.pass).toBe(false);
+    expect(report.counts).toMatchObject({ blocked: 0, unknown: 1 });
+    expect(report.lanes[0]).toMatchObject({ status: "unknown" });
+    expect(report.lanes[0]?.details).toContain("suite also reports failures");
+  });
+
+  // Here the sentinel finding is nested under a scenario's runtimeParity "codex" cell rather
+  // than at the top level of the summary. It carries its own verdict ("qa-harness-bug"), so the
+  // lane is expected to be a classified failure: counted as failed, not unknown.
+  it("classifies product and plugin gateway sentinels as known failing lanes", async () => {
+    await writeJson("live/qa-suite-summary.json", {
+      counts: { total: 1, passed: 1, skipped: 0, failed: 0 },
+      scenarios: [
+        {
+          name: "plugin hook health sentinel",
+          status: "pass",
+          steps: [],
+          runtimeParity: {
+            scenarioId: "plugin-hook-health-sentinel",
+            drift: "none",
+            cells: {
+              pi: { sentinelFindings: [] },
+              codex: {
+                sentinelFindings: [
+                  {
+                    kind: "plugin-hook-failure",
+                    verdict: "qa-harness-bug",
+                    owner: "plugin",
+                    productImpact: "P1",
+                    qaImpact: "P0",
+                    line: 4,
+                    text: "before_prompt_build hook failed",
+                  },
+                ],
+              },
+            },
+          },
+        },
+      ],
+    });
+
+    const report = await buildQaConfidenceReport({
+      manifest: {
+        version: 1,
+        profile: "codex-100",
+        lanes: [
+          {
+            id: "first-hour-live",
+            title: "First hour live",
+            kind: "qa-suite-summary",
+            artifact: "live/qa-suite-summary.json",
+            required: true,
+          },
+        ],
+      },
+      artifactRoot: tempRoot,
+      strictZeroUnknowns: true,
+      generatedAt: "2026-05-13T00:00:00.000Z",
+    });
+
+    // A classified failure keeps the zero-unknowns gate green but fails the global gate.
+    expect(report.pass).toBe(true);
+    expect(report.globalPass).toBe(false);
+    expect(report.counts).toMatchObject({ failed: 1, unknown: 0 });
+    expect(report.lanes[0]).toMatchObject({
+      status: "fail",
+      verdict: "qa-harness-bug",
+    });
+  });
+
+  // The artifact file exists but contains invalid JSON (written directly with fs instead of
+  // writeJson). Even though the lane declares a missingVerdict, the expected outcome is an
+  // unknown lane with "artifact unreadable" in its details, not a blocked lane.
+  it("treats corrupt artifacts as unknown instead of allowed missing lanes", async () => {
+    const artifactPath = path.join(tempRoot, "live", "qa-suite-summary.json");
+    await fs.mkdir(path.dirname(artifactPath), { recursive: true });
+    await fs.writeFile(artifactPath, "{not-json", "utf8");
+
+    const report = await buildQaConfidenceReport({
+      manifest: {
+        version: 1,
+        profile: "codex-100",
+        lanes: [
+          {
+            id: "first-hour-live",
+            title: "First hour live",
+            kind: "qa-suite-summary",
+            artifact: "live/qa-suite-summary.json",
+            required: true,
+            missingVerdict: "environment-blocked",
+          },
+        ],
+      },
+      artifactRoot: tempRoot,
+      strictZeroUnknowns: true,
+      generatedAt: "2026-05-13T00:00:00.000Z",
+    });
+
+    expect(report.pass).toBe(false);
+    expect(report.counts).toMatchObject({ blocked: 0, unknown: 1 });
+    expect(report.lanes[0]).toMatchObject({
+      status: "unknown",
+    });
+    expect(report.lanes[0]?.details).toContain("artifact unreadable");
+  });
+
+  // A parseable but empty object has neither `counts` nor `scenarios`; the lane is expected to
+  // be unknown and its details should name the missing fields.
+  it("treats schema-invalid suite artifacts as unknown", async () => {
+    await writeJson("live/qa-suite-summary.json", {});
+
+    const report = await buildQaConfidenceReport({
+      manifest: {
+        version: 1,
+        profile: "codex-100",
+        lanes: [
+          {
+            id: "first-hour-live",
+            title: "First hour live",
+            kind: "qa-suite-summary",
+            artifact: "live/qa-suite-summary.json",
+            required: true,
+          },
+        ],
+      },
+      artifactRoot: tempRoot,
+      strictZeroUnknowns: true,
+      generatedAt: "2026-05-13T00:00:00.000Z",
+    });
+
+    expect(report.pass).toBe(false);
+    expect(report.counts.unknown).toBe(1);
+    expect(report.lanes[0]?.details).toContain("missing counts.failed and scenarios[]");
+  });
+
+  // A well-formed summary with zero totals and no scenarios is expected to be unknown ("no
+  // executed scenarios") even though the lane declares a failureVerdict; it is not counted as
+  // failed.
+  it("treats empty suite artifacts as unknown", async () => {
+    await writeJson("live/qa-suite-summary.json", {
+      counts: { total: 0, passed: 0, skipped: 0, failed: 0 },
+      scenarios: [],
+    });
+
+    const report = await buildQaConfidenceReport({
+      manifest: {
+        version: 1,
+        profile: "codex-100",
+        lanes: [
+          {
+            id: "first-hour-live",
+            title: "First hour live",
+            kind: "qa-suite-summary",
+            artifact: "live/qa-suite-summary.json",
+            required: true,
+            failureVerdict: "qa-harness-bug",
+          },
+        ],
+      },
+      artifactRoot: tempRoot,
+      strictZeroUnknowns: true,
+      generatedAt: "2026-05-13T00:00:00.000Z",
+    });
+
+    expect(report.pass).toBe(false);
+    expect(report.counts).toMatchObject({ failed: 0, unknown: 1 });
+    expect(report.lanes[0]).toMatchObject({ status: "unknown" });
+    expect(report.lanes[0]?.details).toContain("no executed scenarios");
+  });
+
+  // The counts claim zero failures while the scenario list contains a "fail" entry. The
+  // contradictory artifact is expected to be unknown ("count/scenario mismatch") rather than a
+  // classified failure, despite the lane's failureVerdict.
+  it("treats suite count and scenario mismatches as unknown", async () => {
+    await writeJson("live/qa-suite-summary.json", {
+      counts: { total: 2, passed: 2, skipped: 0, failed: 0 },
+      scenarios: [
+        { name: "passing", status: "pass" },
+        { name: "stale-failure", status: "fail" },
+      ],
+    });
+
+    const report = await buildQaConfidenceReport({
+      manifest: {
+        version: 1,
+        profile: "codex-100",
+        lanes: [
+          {
+            id: "first-hour-live",
+            title: "First hour live",
+            kind: "qa-suite-summary",
+            artifact: "live/qa-suite-summary.json",
+            required: true,
+            failureVerdict: "qa-harness-bug",
+          },
+        ],
+      },
+      artifactRoot: tempRoot,
+      strictZeroUnknowns: true,
+      generatedAt: "2026-05-13T00:00:00.000Z",
+    });
+
+    expect(report.pass).toBe(false);
+    expect(report.counts).toMatchObject({ failed: 0, unknown: 1 });
+    expect(report.lanes[0]).toMatchObject({ status: "unknown" });
+    expect(report.lanes[0]?.details).toContain("count/scenario mismatch");
+  });
+
+  // For a "runtime-parity-summary" lane, an empty JSON object must not be read as a pass: the
+  // lane is expected to be unknown with "explicit pass signal" in its details.
+  it("requires generic summary lanes to expose an explicit pass signal", async () => {
+    await writeJson("runtime/qa-runtime-parity-summary.json", {});
+
+    const report = await buildQaConfidenceReport({
+      manifest: {
+        version: 1,
+        profile: "codex-100",
+        lanes: [
+          {
+            id: "runtime-parity",
+            title: "Runtime parity",
+            kind: "runtime-parity-summary",
+            artifact: "runtime/qa-runtime-parity-summary.json",
+            required: true,
+          },
+        ],
+      },
+      artifactRoot: tempRoot,
+      strictZeroUnknowns: true,
+      generatedAt: "2026-05-13T00:00:00.000Z",
+    });
+
+    expect(report.pass).toBe(false);
+    expect(report.counts.unknown).toBe(1);
+    expect(report.lanes[0]?.details).toContain("explicit pass signal");
+  });
+
+  // Table-driven: each case overwrites the same artifact path with a differently incomplete
+  // JSONL replay summary (no transcripts, zero user turns, missing drift array) and expects an
+  // unknown lane whose details contain the paired message.
+  it("requires JSONL replay summaries to contain replayed user turns", async () => {
+    for (const [artifact, expectedDetail] of [
+      [{ transcripts: [] }, "no transcripts"],
+      [
+        { transcripts: [{ transcriptPath: "empty.jsonl", userTurnCount: 0, drift: [] }] },
+        "no replayed user turns",
+      ],
+      [
+        { transcripts: [{ transcriptPath: "missing-drift.jsonl", userTurnCount: 1 }] },
+        "missing drift array",
+      ],
+    ] as const) {
+      await writeJson("jsonl/qa-jsonl-replay-summary.json", artifact);
+
+      const report = await buildQaConfidenceReport({
+        manifest: {
+          version: 1,
+          profile: "codex-100",
+          lanes: [
+            {
+              id: "jsonl-expanded",
+              title: "Expanded JSONL replay",
+              kind: "jsonl-replay-summary",
+              artifact: "jsonl/qa-jsonl-replay-summary.json",
+              required: true,
+              failureVerdict: "fixture-bug",
+            },
+          ],
+        },
+        artifactRoot: tempRoot,
+        strictZeroUnknowns: true,
+        generatedAt: "2026-05-13T00:00:00.000Z",
+      });
+
+      // The lane's failureVerdict must not turn an incomplete artifact into a classified failure.
+      expect(report.pass).toBe(false);
+      expect(report.counts).toMatchObject({ failed: 0, unknown: 1 });
+      expect(report.lanes[0]).toMatchObject({ status: "unknown" });
+      expect(report.lanes[0]?.details).toContain(expectedDetail);
+    }
+  });
+
+  // Table-driven: a self-test summary that claims `pass: true` but lists no canaries, or only
+  // one of them, is expected to be reported as an unknown lane with the paired detail message.
+  it("requires confidence self-test summaries to contain every seeded canary", async () => {
+    for (const [artifact, expectedDetail] of [
+      [{ pass: true, canaries: [] }, "no canaries"],
+      [
+        { pass: true, canaries: [{ id: "prompt-drift", detected: true }] },
+        "missing expected canaries",
+      ],
+    ] as const) {
+      await writeJson("confidence-self-test/qa-confidence-self-test-summary.json", artifact);
+
+      const report = await buildQaConfidenceReport({
+        manifest: {
+          version: 1,
+          profile: "codex-100",
+          lanes: [
+            {
+              id: "confidence-self-test",
+              title: "Confidence self-test",
+              kind: "self-test-summary",
+              artifact: "confidence-self-test/qa-confidence-self-test-summary.json",
+              required: true,
+              failureVerdict: "qa-harness-bug",
+            },
+          ],
+        },
+        artifactRoot: tempRoot,
+        strictZeroUnknowns: true,
+        generatedAt: "2026-05-13T00:00:00.000Z",
+      });
+
+      expect(report.pass).toBe(false);
+      expect(report.counts).toMatchObject({ failed: 0, unknown: 1 });
+      expect(report.lanes[0]).toMatchObject({ status: "unknown" });
+      expect(report.lanes[0]?.details).toContain(expectedDetail);
+    }
+  });
+
+  // The suite reports one failure and the lane declares no failureVerdict, so the failure is
+  // unclassified: it is expected to count as unknown and to produce a failure message naming
+  // the lane.
+  it("fails strict zero-unknowns for an unclassified failing lane", async () => {
+    await writeJson("first-hour/qa-suite-summary.json", {
+      counts: { total: 18, passed: 17, failed: 1 },
+      scenarios: [{ name: "approval-turn-tool-followthrough", status: "fail", steps: [] }],
+    });
+
+    const report = await buildQaConfidenceReport({
+      manifest: {
+        version: 1,
+        profile: "codex-100",
+        lanes: [
+          {
+            id: "first-hour-20-direct",
+            title: "First-hour 20 direct",
+            kind: "qa-suite-summary",
+            artifact: "first-hour/qa-suite-summary.json",
+            required: true,
+          },
+        ],
+      },
+      artifactRoot: tempRoot,
+      strictZeroUnknowns: true,
+      generatedAt: "2026-05-12T00:00:00.000Z",
+    });
+
+    expect(report.pass).toBe(false);
+    expect(report.counts.unknown).toBe(1);
+    expect(report.failures[0]).toContain("first-hour-20-direct is unclassified");
+  });
+
+  // The replay transcript contains a non-"none" drift entry, and the lane supplies a
+  // failureVerdict plus impact ratings. The lane is expected to be reported as a classified
+  // failure carrying those manifest values, which keeps the zero-unknowns gate passing.
+  it("accepts a classified failing lane without treating it as unknown", async () => {
+    await writeJson("jsonl/qa-jsonl-replay-summary.json", {
+      transcripts: [
+        {
+          transcriptPath: "curated.jsonl",
+          userTurnCount: 2,
+          drift: ["none", "tool-result-shape"],
+          firstDriftAtTurn: 2,
+        },
+      ],
+    });
+
+    const report = await buildQaConfidenceReport({
+      manifest: {
+        version: 1,
+        profile: "codex-100",
+        lanes: [
+          {
+            id: "jsonl-expanded",
+            title: "Expanded JSONL replay",
+            kind: "jsonl-replay-summary",
+            artifact: "jsonl/qa-jsonl-replay-summary.json",
+            required: true,
+            failureVerdict: "fixture-bug",
+            productImpact: "P4",
+            qaImpact: "P1",
+          },
+        ],
+      },
+      artifactRoot: tempRoot,
+      strictZeroUnknowns: true,
+      generatedAt: "2026-05-12T00:00:00.000Z",
+    });
+
+    expect(report.pass).toBe(true);
+    expect(report.globalPass).toBe(false);
+    expect(report.counts.failed).toBe(1);
+    expect(report.counts.unknown).toBe(0);
+    expect(report.lanes[0]).toMatchObject({
+      status: "fail",
+      verdict: "fixture-bug",
+      productImpact: "P4",
+      qaImpact: "P1",
+    });
+  });
+
+  // Pins the exact canary ids and their order with toEqual, so adding, removing, or reordering
+  // a canary must be reflected here. Every canary is also expected to report detected: true.
+  it("emits confidence self-test canaries for every drift class we need to catch", async () => {
+    const summary = await buildQaConfidenceSelfTestSummary("2026-05-12T00:00:00.000Z");
+
+    expect(summary.pass).toBe(true);
+    expect(summary.canaries.map((canary) => canary.id)).toEqual([
+      "prompt-drift",
+      "tool-description-schema-drift",
+      "runtime-tool-call-drop",
+      "tool-result-mismatch",
+      "failure-mode-drift",
+      "token-efficiency-regression",
+      "jsonl-replay-ordering-drift",
+    ]);
+    expect(summary.canaries.every((canary) => canary.detected)).toBe(true);
+  });
+
+  // Checks that both returned paths exist on disk and that the written summary file parses as
+  // JSON with pass: true. The contents of the report file are not inspected.
+  it("writes confidence self-test artifacts", async () => {
+    const result = await writeQaConfidenceSelfTestArtifacts({
+      outputDir: tempRoot,
+      generatedAt: "2026-05-12T00:00:00.000Z",
+    });
+
+    await expect(fs.stat(result.summaryPath)).resolves.toBeTruthy();
+    await expect(fs.stat(result.reportPath)).resolves.toBeTruthy();
+    const summary = JSON.parse(await fs.readFile(result.summaryPath, "utf8")) as { pass: boolean };
+    expect(summary.pass).toBe(true);
+  });
+});
--- a/extensions/qa-lab/src/confidence-report.ts
+++ b/extensions/qa-lab/src/confidence-report.ts
--- a/extensions/qa-lab/src/harness-parity.test.ts
+++ b/extensions/qa-lab/src/harness-parity.test.ts
@@ -0,0 +1,284 @@
+import { describe, expect, it } from "vitest";
+import {
+  buildHarnessParityCell,
+  buildHarnessParityResult,
+  type HarnessRuntimeParityCell,
+  type HarnessVariant,
+} from "./harness-parity.js";
+import type { RuntimeId } from "./runtime-parity.js";
+import type { RuntimeParityComparisonMode } from "./runtime-tool-metadata.js";
+
+// The two harness variants compared throughout this file. Both use the "pi" runtime, so any
+// drift the tests observe comes from the cell overrides rather than from a runtime difference.
+const LEFT: HarnessVariant = { id: "left", label: "Left", runtime: "pi" };
+const RIGHT: HarnessVariant = { id: "right", label: "Right", runtime: "pi" };
+
+// Baseline system prompt report attached to every cell built by makeCell. Tests spread it and
+// change one field at a time to provoke a single drift class. The hash values are short
+// placeholder strings, not real digests.
+const BASE_PROMPT_REPORT = {
+  systemPrompt: {
+    chars: 100,
+    projectContextChars: 40,
+    nonProjectContextChars: 60,
+    hash: "system-a",
+  },
+  skills: {
+    promptChars: 12,
+    hash: "skills-a",
+  },
+  tools: {
+    schemaChars: 20,
+    entries: [
+      {
+        name: "read",
+        summaryChars: 8,
+        summaryHash: "summary-a",
+        schemaChars: 20,
+        schemaHash: "schema-a",
+        propertiesCount: 1,
+      },
+    ],
+  },
+};
+
+/**
+ * Builds a runtime parity cell for tests: a fixed baseline (one assistant transcript row, no
+ * tool calls, the shared prompt report) with `overrides` applied on top.
+ */
+function makeCell(
+  runtime: RuntimeId,
+  overrides: Partial<HarnessRuntimeParityCell> = {},
+): HarnessRuntimeParityCell {
+  const baseline: HarnessRuntimeParityCell = {
+    runtime,
+    transcriptBytes: '{"message":{"role":"assistant","content":"same"}}\n',
+    toolCalls: [],
+    finalText: "same",
+    usage: { inputTokens: 10, outputTokens: 5, totalTokens: 15 },
+    wallClockMs: 1,
+    bootStateLines: [],
+    systemPromptReport: BASE_PROMPT_REPORT,
+  };
+  // Overrides are spread last so any field a test supplies wins over the baseline.
+  return { ...baseline, ...overrides };
+}
+
+/**
+ * Runs a full left/right harness comparison for the given cell overrides and returns only the
+ * resulting drift classification. `comparisonMode` is forwarded only when provided.
+ */
+function classify(
+  left: Partial<HarnessRuntimeParityCell>,
+  right: Partial<HarnessRuntimeParityCell>,
+  comparisonMode?: RuntimeParityComparisonMode,
+) {
+  const toParityCell = (variant: HarnessVariant, overrides: Partial<HarnessRuntimeParityCell>) =>
+    buildHarnessParityCell({
+      variant,
+      cell: makeCell("pi", overrides),
+      tokenUsageSource: "live-usage",
+    });
+  const result = buildHarnessParityResult({
+    scenarioId: "scenario",
+    left: toParityCell(LEFT, left),
+    right: toParityCell(RIGHT, right),
+    ...(comparisonMode ? { comparisonMode } : {}),
+  });
+  return result.drift;
+}
+
+// Each case goes through classify(), which builds both cells from the shared baseline and
+// returns the drift label, so every expectation below differs from the baseline in exactly the
+// fields spelled out in the overrides.
+describe("harness parity", () => {
+  // Prompt/tool surface differences, one at a time: char counts, system prompt hash, skills
+  // hash, tool summary hash, tool schema hash.
+  it("classifies prompt and tool surface drift before behavioral drift", () => {
+    expect(
+      classify(
+        {},
+        {
+          systemPromptReport: {
+            ...BASE_PROMPT_REPORT,
+            systemPrompt: { chars: 101, projectContextChars: 40, nonProjectContextChars: 61 },
+          },
+        },
+      ),
+    ).toBe("system-prompt");
+    expect(
+      classify(
+        {},
+        {
+          systemPromptReport: {
+            ...BASE_PROMPT_REPORT,
+            systemPrompt: {
+              chars: 100,
+              projectContextChars: 40,
+              nonProjectContextChars: 60,
+              hash: "system-b",
+            },
+          },
+        },
+      ),
+    ).toBe("system-prompt");
+    // A skills-only change is also reported under the "system-prompt" label.
+    expect(
+      classify(
+        {},
+        {
+          systemPromptReport: {
+            ...BASE_PROMPT_REPORT,
+            skills: { promptChars: 12, hash: "skills-b" },
+          },
+        },
+      ),
+    ).toBe("system-prompt");
+    expect(
+      classify(
+        {},
+        {
+          systemPromptReport: {
+            ...BASE_PROMPT_REPORT,
+            tools: {
+              schemaChars: 20,
+              entries: [
+                {
+                  name: "read",
+                  summaryChars: 8,
+                  summaryHash: "summary-b",
+                  schemaChars: 20,
+                  schemaHash: "schema-a",
+                  propertiesCount: 1,
+                },
+              ],
+            },
+          },
+        },
+      ),
+    ).toBe("tool-description");
+    expect(
+      classify(
+        {},
+        {
+          systemPromptReport: {
+            ...BASE_PROMPT_REPORT,
+            tools: {
+              schemaChars: 20,
+              entries: [
+                {
+                  name: "read",
+                  summaryChars: 8,
+                  summaryHash: "summary-a",
+                  schemaChars: 20,
+                  schemaHash: "schema-b",
+                  propertiesCount: 1,
+                },
+              ],
+            },
+          },
+        },
+      ),
+    ).toBe("tool-schema");
+  });
+
+  // Prompt reports are identical here; only tool calls, final text, transcripts, or error
+  // classes differ.
+  it("classifies behavioral harness drift", () => {
+    expect(
+      classify(
+        { toolCalls: [{ tool: "read", argsHash: "a", resultHash: "r" }] },
+        { toolCalls: [{ tool: "read", argsHash: "b", resultHash: "r" }] },
+      ),
+    ).toBe("tool-call-shape");
+    expect(
+      classify(
+        { toolCalls: [{ tool: "read", argsHash: "a", resultHash: "r1" }] },
+        { toolCalls: [{ tool: "read", argsHash: "a", resultHash: "r2" }] },
+      ),
+    ).toBe("tool-result-shape");
+    expect(classify({ finalText: "same text" }, { finalText: "different text" })).toBe("text-only");
+    // Extra transcript rows that carry no message role are expected not to register as drift.
+    expect(
+      classify(
+        {
+          transcriptBytes:
+            '{"type":"model_change","modelId":"gpt-5.5"}\n' +
+            '{"type":"thinking_level_change","thinkingLevel":"off"}\n' +
+            '{"type":"custom","customType":"model-snapshot"}\n' +
+            '{"message":{"role":"assistant","content":"same"}}\n',
+        },
+        { transcriptBytes: '{"message":{"role":"assistant","content":"same"}}\n' },
+      ),
+    ).toBe("none");
+    // An additional role-bearing row is structural drift, for both the nested `message.role`
+    // row shape and the top-level `role` row shape.
+    expect(
+      classify(
+        { transcriptBytes: '{"message":{"role":"assistant"}}\n' },
+        { transcriptBytes: '{"message":{"role":"assistant"}}\n{"message":{"role":"tool"}}\n' },
+      ),
+    ).toBe("structural");
+    expect(
+      classify(
+        { transcriptBytes: '{"role":"assistant","content":"same"}\n' },
+        {
+          transcriptBytes:
+            '{"role":"assistant","content":"same"}\n{"role":"tool","content":"same"}\n',
+        },
+      ),
+    ).toBe("structural");
+    expect(classify({ runtimeErrorClass: "timeout" }, {})).toBe("failure-mode");
+  });
+
+  // With a relaxed comparison mode, differing tool call hashes and transcript row counts are
+  // expected to be tolerated.
+  it("honors native workspace comparison mode for outcome-only harness proofs", () => {
+    expect(
+      classify(
+        {
+          transcriptBytes:
+            '{"message":{"role":"assistant","content":"same"}}\n' +
+            '{"message":{"role":"tool","content":"same result"}}\n',
+          toolCalls: [{ tool: "bash", argsHash: "sed-160", resultHash: "same-result" }],
+        },
+        {
+          transcriptBytes: '{"message":{"role":"assistant","content":"same"}}\n',
+          toolCalls: [{ tool: "bash", argsHash: "sed-200", resultHash: "same-result" }],
+        },
+        "codex-native-workspace",
+      ),
+    ).toBe("none");
+
+    expect(
+      classify(
+        { toolCalls: [{ tool: "bash", argsHash: "a", resultHash: "r1" }] },
+        { toolCalls: [{ tool: "bash", argsHash: "b", resultHash: "r2" }] },
+        "outcome-only",
+      ),
+    ).toBe("none");
+  });
+
+  // The relaxed modes must not loosen prompt/tool surface checks: a surface difference is
+  // still reported even when the tool calls differ as well.
+  it("keeps prompt and tool surface checks strict under native workspace comparison mode", () => {
+    expect(
+      classify(
+        {},
+        {
+          systemPromptReport: {
+            ...BASE_PROMPT_REPORT,
+            systemPrompt: { chars: 101, projectContextChars: 40, nonProjectContextChars: 61 },
+          },
+          toolCalls: [{ tool: "bash", argsHash: "changed", resultHash: "changed" }],
+        },
+        "codex-native-workspace",
+      ),
+    ).toBe("system-prompt");
+    expect(
+      classify(
+        {},
+        {
+          systemPromptReport: {
+            ...BASE_PROMPT_REPORT,
+            tools: {
+              schemaChars: 20,
+              entries: [{ name: "read", summaryChars: 9, schemaChars: 20, propertiesCount: 1 }],
+            },
+          },
+          toolCalls: [{ tool: "bash", argsHash: "changed", resultHash: "changed" }],
+        },
+        "outcome-only",
+      ),
+    ).toBe("tool-description");
+  });
+
+  // In "mock-estimate" mode the reported usage on the source cell (all zeros here) is ignored
+  // and tokens are derived from character counts instead.
+  it("labels mock token estimates separately from live usage", () => {
+    const sourceCell = makeCell("pi", {
+      usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 },
+    });
+    const cell = buildHarnessParityCell({
+      variant: LEFT,
+      cell: sourceCell,
+      tokenUsageSource: "mock-estimate",
+    });
+    // Baseline report: system prompt 100 + skills 12 + tool summary 8 + tool schema 20 chars.
+    const inputChars = 100 + 12 + 8 + 20 + sourceCell.transcriptBytes.length;
+
+    expect(cell.tokenUsageSource).toBe("mock-estimate");
+    expect(cell.tokenUsage.totalTokens).toBeGreaterThan(0);
+    expect(cell.tokenUsage.inputTokens).toBe(Math.ceil(inputChars / 4));
+    expect(cell.promptStats.toolCount).toBe(1);
+  });
+});
--- a/extensions/qa-lab/src/harness-parity.ts
+++ b/extensions/qa-lab/src/harness-parity.ts
@@ -0,0 +1,491 @@
+import { createHash } from "node:crypto";
+import type {
+  RuntimeId,
+  RuntimeParityCell,
+  RuntimeParityDrift,
+  RuntimeParityToolCall,
+  RuntimeParityUsage,
+} from "./runtime-parity.js";
+import type { RuntimeParityComparisonMode } from "./runtime-tool-metadata.js";
+
+/**
+ * Identifies one side of a harness comparison. Only `id` and `label` are required.
+ *
+ * NOTE(review): how `configPatch` and the two overlay fields are applied is not visible in
+ * this part of the module — confirm against the code that consumes variants.
+ */
+export type HarnessVariant = {
+  id: string;
+  label: string;
+  runtime?: RuntimeId;
+  model?: string;
+  configPatch?: Record<string, unknown>;
+  systemPromptOverlay?: string;
+  toolDescriptionOverlay?: Record<string, string>;
+};
+
+/**
+ * Drift labels a harness comparison can report: every runtime parity drift plus three labels
+ * for differences in the prompt and tool surface.
+ */
+export type HarnessParityDrift =
+  | RuntimeParityDrift
+  | "system-prompt"
+  | "tool-description"
+  | "tool-schema";
+
+/**
+ * Character and tool counts extracted from a system prompt report by `buildPromptStats`.
+ * Values that are missing or not positive in the report are recorded as 0;
+ * `toolSummaryChars` is the sum over tool entries, and `toolCount` is the number of entries.
+ */
+export type HarnessParityPromptStats = {
+  systemPromptChars: number;
+  projectContextChars: number;
+  nonProjectContextChars: number;
+  skillPromptChars: number;
+  toolSummaryChars: number;
+  toolSchemaChars: number;
+  toolCount: number;
+};
+
+/**
+ * Loosely typed view of a runtime's system prompt report. Every section and every field is
+ * optional, so consumers have to tolerate partially populated reports.
+ *
+ * NOTE(review): the report allows both raw values (`text`, `prompt`, `summary`, `schema`) and
+ * hash fields; which of them a given runtime fills in is not visible here — confirm against
+ * the report producers.
+ */
+export type RuntimeParitySystemPromptReport = {
+  systemPrompt?: {
+    chars?: number;
+    projectContextChars?: number;
+    nonProjectContextChars?: number;
+    text?: string;
+    hash?: string;
+    contentHash?: string;
+  };
+  skills?: {
+    promptChars?: number;
+    prompt?: string;
+    hash?: string;
+    contentHash?: string;
+  };
+  tools?: {
+    listChars?: number;
+    schemaChars?: number;
+    entries?: Array<{
+      name?: string;
+      summary?: string;
+      summaryHash?: string;
+      summaryChars?: number;
+      schema?: unknown;
+      schemaHash?: string;
+      schemaChars?: number;
+      propertiesCount?: number;
+    }>;
+  };
+};
+
+/** A runtime parity cell that may additionally carry the run's system prompt report. */
+export type HarnessRuntimeParityCell = RuntimeParityCell & {
+  systemPromptReport?: RuntimeParitySystemPromptReport;
+};
+
+/**
+ * A runtime cell enriched by `buildHarnessParityCell` with its variant, prompt statistics,
+ * and three hashes over the prompt report: one for the system prompt and skills sections, one
+ * for tool names and summaries, and one for tool schemas.
+ *
+ * `tokenUsage` is the cell's own reported usage when `tokenUsageSource` is "live-usage", and
+ * a character-count estimate when it is "mock-estimate".
+ */
+export type HarnessParityCell = HarnessRuntimeParityCell & {
+  variant: HarnessVariant;
+  promptStats: HarnessParityPromptStats;
+  systemPromptHash: string;
+  toolDescriptionHash: string;
+  toolSchemaHash: string;
+  tokenUsage: RuntimeParityUsage;
+  tokenUsageSource: "live-usage" | "mock-estimate";
+};
+
+/**
+ * Outcome of comparing the left and right cells for one scenario.
+ *
+ * Each `promptDelta` value is the right cell's statistic minus the left cell's.
+ * `tokenDeltaPercent` is the change in total tokens relative to the left cell; when the left
+ * total is 0 it is 0 if the right total is also 0 and 100 otherwise.
+ *
+ * NOTE(review): how `drift`, `driftDetails` and `firstDriftTurn` are chosen is decided in
+ * `buildHarnessParityResult` — consult that function for the precedence rules.
+ */
+export type HarnessParityResult = {
+  scenarioId: string;
+  left: HarnessParityCell;
+  right: HarnessParityCell;
+  drift: HarnessParityDrift;
+  driftDetails?: string;
+  promptDelta: {
+    systemPromptChars: number;
+    projectContextChars: number;
+    skillPromptChars: number;
+    toolSummaryChars: number;
+    toolSchemaChars: number;
+    toolCount: number;
+  };
+  tokenDeltaPercent: number;
+  firstDriftTurn?: number;
+};
+
+/**
+ * Top-level report for a harness comparison run: the two variants, the per-scenario results,
+ * and an overall verdict with its failure messages.
+ *
+ * NOTE(review): the rule that sets `pass` and fills `failures` is not visible in this part of
+ * the module — confirm against the report builder.
+ */
+export type HarnessParityReport = {
+  generatedAt: string;
+  providerMode: string;
+  left: HarnessVariant;
+  right: HarnessVariant;
+  results: HarnessParityResult[];
+  pass: boolean;
+  failures: string[];
+};
+
+/** Returns the hex-encoded SHA-256 digest of `value`. */
+function sha256(value: string) {
+  const hasher = createHash("sha256");
+  hasher.update(value);
+  return hasher.digest("hex");
+}
+
+/**
+ * Counts the transcript rows that take part in parity comparison: non-blank lines that parse
+ * as JSON and expose a string role, either nested as `message.role` or at the top level as
+ * `role`. Blank lines, unparseable lines, and rows without a role are not counted.
+ */
+function countComparableTranscriptRecords(transcriptBytes: string) {
+  const carriesRole = (row: string): boolean => {
+    try {
+      const record = JSON.parse(row) as {
+        message?: { role?: unknown };
+        role?: unknown;
+      };
+      const nestedRole = record.message ? record.message.role : undefined;
+      return typeof nestedRole === "string" || typeof record.role === "string";
+    } catch {
+      // Ignore malformed QA transcript rows and keep parity classification deterministic.
+      return false;
+    }
+  };
+  return transcriptBytes
+    .split(/\r?\n/u)
+    .map((line) => line.trim())
+    .filter((row) => row.length > 0 && carriesRole(row)).length;
+}
+
+/**
+ * Recursively rebuilds `value` with object keys in a deterministic order, so that the
+ * `JSON.stringify` output does not depend on property insertion order. Arrays keep their
+ * element order and non-object values are returned unchanged.
+ *
+ * Keys are ordered by UTF-16 code unit rather than with `localeCompare`: locale-aware
+ * collation depends on the host's default locale, which could make the same input hash
+ * differently on differently configured machines.
+ */
+function normalizeForStableHash(value: unknown): unknown {
+  if (Array.isArray(value)) {
+    return value.map((entry) => normalizeForStableHash(entry));
+  }
+  if (value && typeof value === "object") {
+    const record = value as Record<string, unknown>;
+    return Object.fromEntries(
+      Object.keys(record)
+        .toSorted((left, right) => (left < right ? -1 : left > right ? 1 : 0))
+        .map((key) => [key, normalizeForStableHash(record[key])]),
+    );
+  }
+  return value;
+}
+
+/**
+ * Hashes `value` independently of object key order by serializing its normalized form.
+ * Values that `JSON.stringify` cannot represent (it yields `undefined`) hash as "null".
+ */
+function stableHash(value: unknown) {
+  const canonical = normalizeForStableHash(value);
+  const serialized = JSON.stringify(canonical);
+  return sha256(serialized ?? "null");
+}
+
+/** Floors a finite, strictly positive number; every other input yields 0. */
+function readPositiveNumber(value: unknown) {
+  if (typeof value !== "number" || !Number.isFinite(value)) {
+    return 0;
+  }
+  return value > 0 ? Math.floor(value) : 0;
+}
+
+/**
+ * Derives prompt size statistics from a system prompt report. A missing report, a missing
+ * section, or a non-array `tools.entries` contributes zeros rather than throwing.
+ */
+function buildPromptStats(report: RuntimeParitySystemPromptReport | undefined) {
+  const systemPrompt = report?.systemPrompt;
+  const tools = report?.tools;
+  const toolEntries = Array.isArray(tools?.entries) ? tools.entries : [];
+  let toolSummaryChars = 0;
+  for (const entry of toolEntries) {
+    toolSummaryChars += readPositiveNumber(entry.summaryChars);
+  }
+  return {
+    systemPromptChars: readPositiveNumber(systemPrompt?.chars),
+    projectContextChars: readPositiveNumber(systemPrompt?.projectContextChars),
+    nonProjectContextChars: readPositiveNumber(systemPrompt?.nonProjectContextChars),
+    skillPromptChars: readPositiveNumber(report?.skills?.promptChars),
+    toolSummaryChars,
+    // The schema size is the report-level total, not a sum over the individual entries.
+    toolSchemaChars: readPositiveNumber(tools?.schemaChars),
+    toolCount: toolEntries.length,
+  };
+}
+
+/**
+ * Estimates token usage from character counts at four characters per token, rounding up.
+ * Input covers the prompt statistics plus the transcript length; output covers the final text
+ * plus a flat 80 characters per tool call.
+ */
+function estimateUsage(
+  cell: RuntimeParityCell,
+  stats: HarnessParityPromptStats,
+): RuntimeParityUsage {
+  const charsPerToken = 4;
+  const promptChars = [
+    stats.systemPromptChars,
+    stats.skillPromptChars,
+    stats.toolSummaryChars,
+    stats.toolSchemaChars,
+  ].reduce((sum, chars) => sum + chars, 0);
+  const inputTokens = Math.ceil((promptChars + cell.transcriptBytes.length) / charsPerToken);
+  const outputTokens = Math.ceil(
+    (cell.finalText.length + cell.toolCalls.length * 80) / charsPerToken,
+  );
+  return { inputTokens, outputTokens, totalTokens: inputTokens + outputTokens };
+}
+
+/** Collapses every run of whitespace to a single space and trims both ends. */
+function normalizeTextForParity(text: string) {
+  const collapsed = text.replace(/\s+/gu, " ");
+  return collapsed.trim();
+}
+
+/**
+ * Compares two tool call sequences by length and then position by position on tool name and
+ * arguments hash. Returns a description of the first difference, or `undefined` when the
+ * sequences match. Result hashes are not considered here.
+ */
+function compareToolCallShape(left: RuntimeParityToolCall[], right: RuntimeParityToolCall[]) {
+  if (left.length !== right.length) {
+    return `tool call count differs (${left.length} vs ${right.length})`;
+  }
+  for (const [index, leftCall] of left.entries()) {
+    const rightCall = right[index];
+    const position = index + 1;
+    if (!leftCall || !rightCall) {
+      return `tool call row ${position} missing`;
+    }
+    const sameTool = leftCall.tool === rightCall.tool;
+    const sameArgs = leftCall.argsHash === rightCall.argsHash;
+    if (!(sameTool && sameArgs)) {
+      return `tool call ${position} differs (${leftCall.tool}/${leftCall.argsHash} vs ${rightCall.tool}/${rightCall.argsHash})`;
+    }
+  }
+  return undefined;
+}
+
+/**
+ * Compares tool call results position by position over the prefix both sequences share,
+ * looking at the result hash and the error class (a missing error class equals an empty one).
+ * Returns a description of the first difference, or `undefined` when none is found.
+ */
+function compareToolResultShape(left: RuntimeParityToolCall[], right: RuntimeParityToolCall[]) {
+  const shared = Math.min(left.length, right.length);
+  for (const [index, leftCall] of left.slice(0, shared).entries()) {
+    const rightCall = right[index];
+    if (!leftCall || !rightCall) {
+      continue;
+    }
+    const sameResult = leftCall.resultHash === rightCall.resultHash;
+    const sameError = (leftCall.errorClass ?? "") === (rightCall.errorClass ?? "");
+    if (!(sameResult && sameError)) {
+      return `tool result ${index + 1} differs (${leftCall.tool})`;
+    }
+  }
+  return undefined;
+}
+
+/**
+ * Returns the 1-based number of the first line at which the two trimmed transcripts differ,
+ * treating a line missing on one side as empty. Returns `undefined` when every line matches.
+ */
+function firstDriftTurn(leftTranscript: string, rightTranscript: string): number | undefined {
+  const toLines = (transcript: string): string[] => {
+    const trimmed = transcript.trim();
+    return trimmed.length ? trimmed.split(/\r?\n/u) : [];
+  };
+  const leftLines = toLines(leftTranscript);
+  const rightLines = toLines(rightTranscript);
+  const lastTurn = Math.max(leftLines.length, rightLines.length);
+  for (let turn = 1; turn <= lastTurn; turn += 1) {
+    if ((leftLines[turn - 1] ?? "") !== (rightLines[turn - 1] ?? "")) {
+      return turn;
+    }
+  }
+  return undefined;
+}
+
+/**
+ * Enriches a runtime cell with the data a harness comparison needs: the variant it belongs
+ * to, prompt statistics, three stable hashes over the system prompt report, and token usage.
+ *
+ * @param params.variant - The harness variant this cell was produced by.
+ * @param params.cell - The runtime cell, optionally carrying a system prompt report.
+ * @param params.tokenUsageSource - "live-usage" keeps the cell's reported usage; any other
+ *   value ("mock-estimate") replaces it with a character-count estimate.
+ * @returns The original cell fields plus the derived comparison fields.
+ */
+export function buildHarnessParityCell(params: {
+  variant: HarnessVariant;
+  cell: HarnessRuntimeParityCell;
+  tokenUsageSource: HarnessParityCell["tokenUsageSource"];
+}): HarnessParityCell {
+  const report = params.cell.systemPromptReport;
+  const promptStats = buildPromptStats(report);
+  // Same guard as buildPromptStats: a report whose `tools.entries` is not an array is treated
+  // as having no tools instead of throwing on `.map` below.
+  const toolEntries = Array.isArray(report?.tools?.entries) ? report.tools.entries : [];
+  const tokenUsage =
+    params.tokenUsageSource === "live-usage"
+      ? params.cell.usage
+      : estimateUsage(params.cell, promptStats);
+  return {
+    ...params.cell,
+    variant: params.variant,
+    ...(report ? { systemPromptReport: report } : {}),
+    promptStats,
+    // Skills are hashed together with the system prompt, so a skills-only change alters this
+    // hash as well.
+    systemPromptHash: stableHash({
+      systemPrompt: report?.systemPrompt ?? null,
+      skills: report?.skills ?? null,
+    }),
+    // Description hash: only the name and summary fields of each tool entry.
+    toolDescriptionHash: stableHash(
+      toolEntries.map((entry) => {
+        return {
+          name: entry.name,
+          summary: entry.summary,
+          summaryHash: entry.summaryHash,
+          summaryChars: entry.summaryChars,
+        };
+      }),
+    ),
+    // Schema hash: report-level size totals plus the schema fields of each tool entry.
+    toolSchemaHash: stableHash({
+      listChars: report?.tools?.listChars,
+      schemaChars: report?.tools?.schemaChars,
+      entries: toolEntries.map((entry) => {
+        return {
+          name: entry.name,
+          schema: entry.schema,
+          schemaHash: entry.schemaHash,
+          schemaChars: entry.schemaChars,
+          propertiesCount: entry.propertiesCount,
+        };
+      }),
+    }),
+    tokenUsage,
+    tokenUsageSource: params.tokenUsageSource,
+  };
+}
+/** Compares two harness cells and classifies the first drift found, checking categories in a fixed priority order. */
+export function buildHarnessParityResult(params: {
+  scenarioId: string;
+  left: HarnessParityCell;
+  right: HarnessParityCell;
+  comparisonMode?: RuntimeParityComparisonMode;
+}): HarnessParityResult {
+  const promptDelta = {
+    systemPromptChars:
+      params.right.promptStats.systemPromptChars - params.left.promptStats.systemPromptChars,
+    projectContextChars:
+      params.right.promptStats.projectContextChars - params.left.promptStats.projectContextChars,
+    skillPromptChars:
+      params.right.promptStats.skillPromptChars - params.left.promptStats.skillPromptChars,
+    toolSummaryChars:
+      params.right.promptStats.toolSummaryChars - params.left.promptStats.toolSummaryChars,
+    toolSchemaChars:
+      params.right.promptStats.toolSchemaChars - params.left.promptStats.toolSchemaChars,
+    toolCount: params.right.promptStats.toolCount - params.left.promptStats.toolCount,
+  }; // every delta is computed as right minus left
+  const tokenDeltaPercent =
+    params.left.tokenUsage.totalTokens === 0
+      ? params.right.tokenUsage.totalTokens === 0
+        ? 0
+        : 100 // left total is 0 but right is not: reported as a flat 100
+      : ((params.right.tokenUsage.totalTokens - params.left.tokenUsage.totalTokens) /
+          params.left.tokenUsage.totalTokens) *
+        100;
+  const failDetails =
+    params.left.transportErrorClass || params.right.transportErrorClass
+      ? "at least one harness variant hit a transport failure"
+      : params.left.runtimeErrorClass || params.right.runtimeErrorClass
+        ? "at least one harness variant hit a runtime failure"
+        : undefined; // transport failures are reported ahead of runtime failures
+  if (failDetails) {
+    return {
+      scenarioId: params.scenarioId,
+      left: params.left,
+      right: params.right,
+      drift: "failure-mode",
+      driftDetails: failDetails,
+      promptDelta,
+      tokenDeltaPercent,
+      firstDriftTurn: firstDriftTurn(params.left.transcriptBytes, params.right.transcriptBytes),
+    };
+  }
+  if (params.left.systemPromptHash !== params.right.systemPromptHash) {
+    return {
+      scenarioId: params.scenarioId,
+      left: params.left,
+      right: params.right,
+      drift: "system-prompt",
+      driftDetails: "system prompt report differs",
+      promptDelta,
+      tokenDeltaPercent,
+      firstDriftTurn: firstDriftTurn(params.left.transcriptBytes, params.right.transcriptBytes),
+    };
+  }
+  if (params.left.toolDescriptionHash !== params.right.toolDescriptionHash) {
+    return {
+      scenarioId: params.scenarioId,
+      left: params.left,
+      right: params.right,
+      drift: "tool-description",
+      driftDetails: "tool description summary shape differs",
+      promptDelta,
+      tokenDeltaPercent,
+      firstDriftTurn: firstDriftTurn(params.left.transcriptBytes, params.right.transcriptBytes),
+    };
+  }
+  if (params.left.toolSchemaHash !== params.right.toolSchemaHash) {
+    return {
+      scenarioId: params.scenarioId,
+      left: params.left,
+      right: params.right,
+      drift: "tool-schema",
+      driftDetails: "tool schema shape differs",
+      promptDelta,
+      tokenDeltaPercent,
+      firstDriftTurn: firstDriftTurn(params.left.transcriptBytes, params.right.transcriptBytes),
+    };
+  }
+  const compareToolShapes =
+    params.comparisonMode !== "codex-native-workspace" && params.comparisonMode !== "outcome-only";
+  const compareTranscriptStructure =
+    params.comparisonMode !== "codex-native-workspace" && params.comparisonMode !== "outcome-only";
+  // Both flags are false for the "codex-native-workspace" and "outcome-only" comparison modes.
+  if (compareToolShapes) {
+    const toolCallDrift = compareToolCallShape(params.left.toolCalls, params.right.toolCalls);
+    if (toolCallDrift) {
+      return {
+        scenarioId: params.scenarioId,
+        left: params.left,
+        right: params.right,
+        drift: "tool-call-shape",
+        driftDetails: toolCallDrift,
+        promptDelta,
+        tokenDeltaPercent,
+        firstDriftTurn: firstDriftTurn(params.left.transcriptBytes, params.right.transcriptBytes),
+      };
+    }
+    const toolResultDrift = compareToolResultShape(params.left.toolCalls, params.right.toolCalls);
+    if (toolResultDrift) {
+      return {
+        scenarioId: params.scenarioId,
+        left: params.left,
+        right: params.right,
+        drift: "tool-result-shape",
+        driftDetails: toolResultDrift,
+        promptDelta,
+        tokenDeltaPercent,
+        firstDriftTurn: firstDriftTurn(params.left.transcriptBytes, params.right.transcriptBytes),
+      };
+    }
+  }
+  const leftTranscriptRecords = countComparableTranscriptRecords(params.left.transcriptBytes);
+  const rightTranscriptRecords = countComparableTranscriptRecords(params.right.transcriptBytes);
+  if (
+    compareTranscriptStructure &&
+    (leftTranscriptRecords !== rightTranscriptRecords ||
+      (!params.left.finalText && !!params.right.finalText) ||
+      (!!params.left.finalText && !params.right.finalText))
+  ) {
+    return {
+      scenarioId: params.scenarioId,
+      left: params.left,
+      right: params.right,
+      drift: "structural",
+      driftDetails: `transcript/final-text structure differs (${leftTranscriptRecords} message records vs ${rightTranscriptRecords} message records)`,
+      promptDelta,
+      tokenDeltaPercent,
+      firstDriftTurn: firstDriftTurn(params.left.transcriptBytes, params.right.transcriptBytes),
+    };
+  }
+  if (
+    normalizeTextForParity(params.left.finalText) !== normalizeTextForParity(params.right.finalText)
+  ) {
+    return {
+      scenarioId: params.scenarioId,
+      left: params.left,
+      right: params.right,
+      drift: "text-only",
+      driftDetails: "final text differs after whitespace normalization",
+      promptDelta,
+      tokenDeltaPercent,
+      firstDriftTurn: firstDriftTurn(params.left.transcriptBytes, params.right.transcriptBytes),
+    };
+  }
+  return {
+    scenarioId: params.scenarioId,
+    left: params.left,
+    right: params.right,
+    drift: "none", // clean result: no driftDetails and no firstDriftTurn
+    promptDelta,
+    tokenDeltaPercent,
+  };
+}
+/** Formats a percentage with one decimal and an explicit "+" for gains; magnitudes under 0.05 print as "0.0%". */
+function formatPercent(value: number) {
+  const snapped = Math.abs(value) < 0.05 ? 0 : value;
+  const sign = snapped > 0 ? "+" : "";
+  return sign + snapped.toFixed(1) + "%";
+}
+/** Renders the parity report as Markdown: title, run metadata, one table row per scenario, then gate failures. */
+export function renderHarnessParityMarkdownReport(report: HarnessParityReport): string {
+  // A raw "|" or line break in free text would split or end the table row, so neutralize both.
+  const cell = (text: string) => text.replace(/\|/gu, "\\|").replace(/[\r\n]+/gu, " ");
+  const lines = [
+    `# OpenClaw Harness Parity - ${report.left.label} vs ${report.right.label}`,
+    "",
+    `- Generated at: ${report.generatedAt}`,
+    `- Provider mode: ${report.providerMode}`,
+    `- Verdict: ${report.pass ? "pass" : "fail"}`,
+    "",
+    "| Scenario | Drift | First drift turn | Token delta | Prompt chars delta | Tool count delta | Details |",
+    "| --- | --- | ---: | ---: | ---: | ---: | --- |",
+  ];
+  for (const result of report.results) {
+    lines.push(
+      `| ${cell(result.scenarioId)} | ${result.drift} | ${result.firstDriftTurn ?? ""} | ${formatPercent(
+        result.tokenDeltaPercent,
+      )} | ${result.promptDelta.systemPromptChars} | ${result.promptDelta.toolCount} | ${cell(
+        result.driftDetails ?? "",
+      )} |`,
+    );
+  }
+
+  if (report.failures.length > 0) {
+    lines.push("", "## Gate Failures", "");
+    for (const failure of report.failures) {
+      lines.push(`- ${failure}`);
+    }
+  }
+  return `${lines.join("\n").trimEnd()}\n`;
+}
--- a/src/agents/system-prompt-report.test.ts
+++ b/src/agents/system-prompt-report.test.ts
@@ -144,4 +144,76 @@ describe("buildSystemPromptReport", () => {
    expect(report.systemPrompt.projectContextChars).toBe(0);
    expect(report.systemPrompt.nonProjectContextChars).toBe("custom override".length);
  });
+
+  it("emits content hashes for prompt and tool parity checks", () => {
+    const file = makeBootstrapFile({ path: "/tmp/workspace/AGENTS.md" });
+    const report = buildSystemPromptReport({
+      source: "run",
+      generatedAt: 0,
+      bootstrapMaxChars: 20_000,
+      systemPrompt: "system",
+      bootstrapFiles: [file],
+      injectedFiles: [],
+      skillsPrompt: "<skill><name>docs</name></skill>",
+      tools: [
+        {
+          name: "read",
+          description: "Read files",
+          parameters: {
+            type: "object",
+            properties: { path: { type: "string" } },
+          },
+        },
+      ] as never,
+    });
+    const sameLengthChangedPrompt = buildSystemPromptReport({
+      source: "run",
+      generatedAt: 0,
+      bootstrapMaxChars: 20_000,
+      systemPrompt: "systen", // same length as "system", one character changed
+      bootstrapFiles: [file],
+      injectedFiles: [],
+      skillsPrompt: "<skill><name>docs</name></skill>",
+      tools: [],
+    });
+
+    expect(report.systemPrompt.hash).toMatch(/^[a-f0-9]{64}$/u); // 64 lowercase hex chars, i.e. a SHA-256 digest
+    expect(report.skills.hash).toMatch(/^[a-f0-9]{64}$/u);
+    expect(report.tools.entries[0]?.summaryHash).toMatch(/^[a-f0-9]{64}$/u);
+    expect(report.tools.entries[0]?.schemaHash).toMatch(/^[a-f0-9]{64}$/u);
+    expect(sameLengthChangedPrompt.systemPrompt.hash).not.toBe(report.systemPrompt.hash); // hash tracks content, not length
+  });
+
+  it("keeps reporting when a tool schema cannot be stringified", () => {
+    const file = makeBootstrapFile({ path: "/tmp/workspace/AGENTS.md" });
+    const circularSchema: Record<string, unknown> = {
+      type: "object",
+      properties: { count: { type: "integer" } },
+    };
+    circularSchema.self = circularSchema; // self-reference: JSON.stringify throws on this object
+
+    const report = buildSystemPromptReport({
+      source: "run",
+      generatedAt: 0,
+      bootstrapMaxChars: 20_000,
+      systemPrompt: "system",
+      bootstrapFiles: [file],
+      injectedFiles: [],
+      skillsPrompt: "",
+      tools: [
+        {
+          name: "broken",
+          description: "Broken schema",
+          parameters: circularSchema,
+        },
+      ] as never,
+    });
+
+    expect(report.tools.entries[0]).toMatchObject({
+      name: "broken",
+      schemaChars: 0, // expected size when the schema cannot be stringified
+      propertiesCount: 1, // only "count" is declared under properties
+    });
+    expect(report.tools.entries[0]?.schemaHash).toMatch(/^[a-f0-9]{64}$/u); // a hash is still emitted for the circular schema
+  });
 });
--- a/src/agents/system-prompt-report.ts
+++ b/src/agents/system-prompt-report.ts
@@ -1,3 +1,4 @@
+import { createHash } from "node:crypto";
 import type { AgentTool } from "@earendil-works/pi-agent-core";
 import type { SessionSystemPromptReport } from "../config/sessions/types.js";
 import { buildBootstrapInjectionStats } from "./bootstrap-budget.js";
@@ -9,9 +10,47 @@ type ToolReportEntry = SessionSystemPromptReport["tools"]["entries"][number];
 const toolReportEntryCache = new WeakMap<AgentTool, ToolReportEntry>();
 const toolSchemaStatsCache = new WeakMap<
  object,
-  Pick<ToolReportEntry, "propertiesCount" | "schemaChars">
+  Pick<ToolReportEntry, "propertiesCount" | "schemaChars" | "schemaHash">
 >();

+function sha256(value: string): string {
+  return createHash("sha256").update(value).digest("hex"); // hex-encoded SHA-256 digest of the string
+}
+/** Builds a JSON-safe copy of a value with object keys in a fixed order, bigint as text, and cycles cut. */
+function normalizeForStableHash(value: unknown, seen = new WeakSet<object>()): unknown {
+  if (typeof value === "bigint") {
+    return `${value.toString()}n`; // JSON.stringify throws on bigint, so encode it as text
+  }
+  if (value && typeof value === "object") {
+    if (seen.has(value)) {
+      return "[Circular]"; // value is its own ancestor: emit a marker instead of recursing forever
+    }
+    seen.add(value);
+    if (Array.isArray(value)) {
+      const normalized = value.map((entry) => normalizeForStableHash(entry, seen));
+      seen.delete(value); // only ancestors count as cycles; repeated siblings are expanded again
+      return normalized;
+    }
+    const record = value as Record<string, unknown>;
+    const normalized = Object.fromEntries(
+      Object.keys(record) // code-unit order below: locale-independent, distinct keys never tie
+        .toSorted((left, right) => (left < right ? -1 : left > right ? 1 : 0))
+        .map((key) => [key, normalizeForStableHash(record[key], seen)]),
+    );
+    seen.delete(value);
+    return normalized;
+  }
+  return value;
+}
+/** Hashes a value's normalized JSON text, falling back to a fixed sentinel hash if serialization throws. */
+function stableJsonHash(value: unknown): string {
+  try {
+    return sha256(JSON.stringify(normalizeForStableHash(value)) ?? "null"); // stringify returns undefined for e.g. undefined
+  } catch {
+    return sha256("[unserializable]"); // every value that fails to serialize shares this one hash
+  }
+}
+
 function extractBetween(input: string, startMarker: string, endMarker: string): string {
  const start = input.indexOf(startMarker);
  if (start === -1) {
@@ -39,9 +78,9 @@ function parseSkillBlocks(skillsPrompt: string): Array<{ name: string; blockChar

 function buildToolSchemaStats(
  parameters: AgentTool["parameters"],
-): Pick<ToolReportEntry, "propertiesCount" | "schemaChars"> {
+): Pick<ToolReportEntry, "propertiesCount" | "schemaChars" | "schemaHash"> {
  if (!parameters || typeof parameters !== "object") {
-    return { schemaChars: 0, propertiesCount: null };
+    return { schemaChars: 0, schemaHash: stableJsonHash(null), propertiesCount: null };
  }
  const cached = toolSchemaStatsCache.get(parameters);
  if (cached) {
@@ -55,6 +94,7 @@ function buildToolSchemaStats(
        return 0;
      }
    })(),
+    schemaHash: stableJsonHash(parameters),
    propertiesCount: (() => {
      const schema = parameters as Record<string, unknown>;
      const props = typeof schema.properties === "object" ? schema.properties : null;
@@ -78,7 +118,7 @@ function buildToolsEntries(tools: AgentTool[]): SessionSystemPromptReport["tools
    const summary = tool.description?.trim() || tool.label?.trim() || "";
    const summaryChars = summary.length;
    const schemaStats = buildToolSchemaStats(tool.parameters);
-    const entry = { name, summaryChars, ...schemaStats };
+    const entry = { name, summaryChars, summaryHash: sha256(summary), ...schemaStats };
    toolReportEntryCache.set(tool, entry);
    return entry;
  });
@@ -129,6 +169,7 @@ export function buildSystemPromptReport(params: {
      chars: systemPromptChars,
      projectContextChars,
      nonProjectContextChars: Math.max(0, systemPromptChars - projectContextChars),
+      hash: sha256(params.systemPrompt),
    },
    ...(params.currentTurn ? { currentTurn: params.currentTurn } : {}),
    injectedWorkspaceFiles: buildBootstrapInjectionStats({
@@ -137,6 +178,7 @@ export function buildSystemPromptReport(params: {
    }),
    skills: {
      promptChars: params.skillsPrompt.length,
+      hash: sha256(params.skillsPrompt),
      entries: skillsEntries,
    },
    tools: {
--- a/src/config/sessions/types.ts
+++ b/src/config/sessions/types.ts
@@ -644,6 +644,7 @@ export type SessionSystemPromptReport = {
    chars: number;
    projectContextChars: number;
    nonProjectContextChars: number;
+    hash?: string;
  };
  currentTurn?: {
    kind?: "user_request" | "room_event";
@@ -660,6 +661,7 @@ export type SessionSystemPromptReport = {
  }>;
  skills: {
    promptChars: number;
+    hash?: string;
    entries: Array<{ name: string; blockChars: number }>;
  };
  tools: {
@@ -668,7 +670,9 @@ export type SessionSystemPromptReport = {
    entries: Array<{
      name: string;
      summaryChars: number;
+      summaryHash?: string;
      schemaChars: number;
+      schemaHash?: string;
      propertiesCount?: number | null;
    }>;
  };