mirror of
https://github.com/openclaw/openclaw.git
synced 2026-06-06 05:51:15 +08:00
test(qa-lab): add runtime confidence reports
This commit is contained in:
@@ -2844,11 +2844,17 @@ describe("runCodexAppServerAttempt", () => {
|
||||
expect(report?.provider).toBe("codex");
|
||||
expect(report?.model).toBe("gpt-5.4-codex");
|
||||
expect(report?.systemPrompt.chars).toBeGreaterThan(0);
|
||||
expect(report?.systemPrompt.hash).toMatch(/^[a-f0-9]{64}$/u);
|
||||
expect(report?.skills.hash).toMatch(/^[a-f0-9]{64}$/u);
|
||||
|
||||
const message = report?.tools.entries.find((tool) => tool.name === "message");
|
||||
const webSearch = report?.tools.entries.find((tool) => tool.name === "web_search");
|
||||
expect(message?.schemaChars).toBeGreaterThan(0);
|
||||
expect(message?.summaryHash).toMatch(/^[a-f0-9]{64}$/u);
|
||||
expect(message?.schemaHash).toMatch(/^[a-f0-9]{64}$/u);
|
||||
expect(webSearch?.schemaChars).toBe(0);
|
||||
expect(webSearch?.summaryHash).toMatch(/^[a-f0-9]{64}$/u);
|
||||
expect(webSearch?.schemaHash).toMatch(/^[a-f0-9]{64}$/u);
|
||||
expect(report?.tools.schemaChars).toBe(message?.schemaChars);
|
||||
});
|
||||
|
||||
@@ -6574,7 +6580,8 @@ describe("runCodexAppServerAttempt", () => {
|
||||
input?: Array<{ text?: string }>;
|
||||
};
|
||||
expect(turnStartParams.input?.[0]?.text).toBe(exactCommand);
|
||||
expect(result.systemPromptReport?.skills).toEqual({ promptChars: 0, entries: [] });
|
||||
expect(result.systemPromptReport?.skills).toMatchObject({ promptChars: 0, entries: [] });
|
||||
expect(result.systemPromptReport?.skills.hash).toMatch(/^[a-f0-9]{64}$/u);
|
||||
});
|
||||
|
||||
it("fires llm_input, llm_output, and agent_end hooks for codex turns", async () => {
|
||||
|
||||
@@ -5095,6 +5095,7 @@ function buildCodexSystemPromptReport(params: {
|
||||
chars: params.developerInstructions.length,
|
||||
projectContextChars: 0,
|
||||
nonProjectContextChars: params.developerInstructions.length,
|
||||
hash: sha256Text(params.developerInstructions),
|
||||
},
|
||||
injectedWorkspaceFiles: buildCodexBootstrapInjectionStats({
|
||||
bootstrapFiles: params.workspaceBootstrapContext.bootstrapFiles,
|
||||
@@ -5106,6 +5107,7 @@ function buildCodexSystemPromptReport(params: {
|
||||
}),
|
||||
skills: {
|
||||
promptChars: skillsPrompt.length,
|
||||
hash: sha256Text(skillsPrompt),
|
||||
entries: buildCodexSkillReportEntries(skillsPrompt),
|
||||
},
|
||||
tools: {
|
||||
@@ -5137,20 +5139,23 @@ function buildCodexToolReportEntry(tool: CodexDynamicToolSpec): CodexToolReportE
|
||||
return {
|
||||
name: tool.name,
|
||||
summaryChars: summary.length,
|
||||
summaryHash: sha256Text(summary),
|
||||
schemaChars: 0,
|
||||
schemaHash: stableJsonHash(null),
|
||||
propertiesCount: null,
|
||||
};
|
||||
}
|
||||
return {
|
||||
name: tool.name,
|
||||
summaryChars: summary.length,
|
||||
summaryHash: sha256Text(summary),
|
||||
...buildCodexToolSchemaStats(tool.inputSchema),
|
||||
};
|
||||
}
|
||||
|
||||
function buildCodexToolSchemaStats(
|
||||
schema: JsonValue,
|
||||
): Pick<CodexToolReportEntry, "schemaChars" | "propertiesCount"> {
|
||||
): Pick<CodexToolReportEntry, "schemaChars" | "schemaHash" | "propertiesCount"> {
|
||||
const schemaChars = (() => {
|
||||
try {
|
||||
return JSON.stringify(schema).length;
|
||||
@@ -5162,10 +5167,34 @@ function buildCodexToolSchemaStats(
|
||||
isJsonObject(schema) && isJsonObject(schema.properties) ? schema.properties : null;
|
||||
return {
|
||||
schemaChars,
|
||||
schemaHash: stableJsonHash(schema),
|
||||
propertiesCount: properties ? Object.keys(properties).length : null,
|
||||
};
|
||||
}
|
||||
|
||||
/** Returns the SHA-256 digest of `value`, hex-encoded. */
function sha256Text(value: string): string {
|
||||
return createHash("sha256").update(value).digest("hex");
|
||||
}
|
||||
|
||||
/**
 * Recursively rebuilds `value` so that every object's keys are inserted in a
 * deterministic order, which makes the `JSON.stringify` output reproducible.
 *
 * Arrays keep their element order (each element is normalized in turn);
 * primitives and `null` are returned unchanged.
 *
 * Keys are ordered by UTF-16 code unit (the default `Array.prototype.sort`
 * order) instead of `localeCompare`. `localeCompare` without an explicit
 * locale follows the runtime's default locale, so the same schema could be
 * serialized — and therefore hashed — differently on differently configured
 * machines.
 *
 * @param value - Any JSON-like value.
 * @returns A structurally equal value whose object keys are in sorted order.
 */
function normalizeForStableHash(value: unknown): unknown {
  if (Array.isArray(value)) {
    return value.map((entry) => normalizeForStableHash(entry));
  }
  if (value && typeof value === "object") {
    const record = value as Record<string, unknown>;
    // Object.keys returns a fresh array, so sorting in place never touches the input.
    const sortedKeys = Object.keys(record).sort();
    return Object.fromEntries(
      sortedKeys.map((key) => [key, normalizeForStableHash(record[key])]),
    );
  }
  return value;
}
|
||||
|
||||
/**
 * Hashes a JSON value with `sha256Text` after running it through
 * `normalizeForStableHash`, so objects that differ only in key order
 * produce the same digest.
 *
 * If `JSON.stringify` yields `undefined`, the literal text "null" is hashed instead.
 */
function stableJsonHash(value: JsonValue): string {
|
||||
return sha256Text(JSON.stringify(normalizeForStableHash(value)) ?? "null");
|
||||
}
|
||||
|
||||
function buildCodexBootstrapInjectionStats(params: {
|
||||
bootstrapFiles: CodexBootstrapFile[];
|
||||
injectedFiles: EmbeddedContextFile[];
|
||||
|
||||
168
extensions/qa-lab/confidence-profiles/codex-100.json
Normal file
168
extensions/qa-lab/confidence-profiles/codex-100.json
Normal file
@@ -0,0 +1,168 @@
|
||||
{
|
||||
"version": 1,
|
||||
"profile": "codex-100",
|
||||
"lanes": [
|
||||
{
|
||||
"id": "tool-defaults-direct",
|
||||
"title": "Tool-defaults direct runtime parity",
|
||||
"kind": "qa-suite-summary",
|
||||
"artifact": "tool-defaults-direct/qa-suite-summary.json",
|
||||
"required": true,
|
||||
"productImpact": "P2",
|
||||
"qaImpact": "P0",
|
||||
"issue": "https://github.com/openclaw/openclaw/issues/80319",
|
||||
"ownerAction": "Fix product or harness before claiming the tool-defaults gate is trusted.",
|
||||
"labels": ["qa-lab", "runtime-parity", "codex"]
|
||||
},
|
||||
{
|
||||
"id": "openclaw-dynamic-tools-direct",
|
||||
"title": "OpenClaw dynamic integration tools direct runtime parity",
|
||||
"kind": "qa-suite-summary",
|
||||
"artifact": "openclaw-dynamic-tools-direct/qa-suite-summary.json",
|
||||
"required": true,
|
||||
"productImpact": "P1",
|
||||
"qaImpact": "P0",
|
||||
"issue": "https://github.com/openclaw/openclaw/issues/80319",
|
||||
"ownerAction": "Investigate any hard failure as an OpenClaw dynamic integration or QA loading regression.",
|
||||
"labels": ["qa-lab", "runtime-parity", "openclaw-dynamic-tools"]
|
||||
},
|
||||
{
|
||||
"id": "tool-defaults-searchable",
|
||||
"title": "Tool-defaults searchable runtime parity",
|
||||
"kind": "qa-suite-summary",
|
||||
"artifact": "tool-defaults-searchable/qa-suite-summary.json",
|
||||
"required": true,
|
||||
"failureVerdict": "mock-limitation",
|
||||
"skipBackfillLane": "openclaw-dynamic-tools-searchable-live",
|
||||
"productImpact": "P4",
|
||||
"qaImpact": "P2",
|
||||
"issue": "https://github.com/openclaw/openclaw/issues/80319",
|
||||
"ownerAction": "Keep as report-only until searchable/deferred tool modeling has no mock-only ambiguity.",
|
||||
"labels": ["qa-lab", "runtime-parity", "searchable-tools"]
|
||||
},
|
||||
{
|
||||
"id": "first-hour-20-direct",
|
||||
"title": "First-hour 20-turn direct runtime parity",
|
||||
"kind": "qa-suite-summary",
|
||||
"artifact": "first-hour-20-direct/qa-suite-summary.json",
|
||||
"required": true,
|
||||
"skipBackfillLane": "codex-native-live",
|
||||
"productImpact": "P1",
|
||||
"qaImpact": "P0",
|
||||
"ownerAction": "Triage row-by-row; do not file product bugs unless live/native proof reproduces.",
|
||||
"labels": ["qa-lab", "runtime-parity", "first-hour"]
|
||||
},
|
||||
{
|
||||
"id": "mock-token-efficiency",
|
||||
"title": "Mock assistant-message token efficiency estimate",
|
||||
"kind": "token-efficiency-summary",
|
||||
"artifact": "first-hour-20-direct-report/qa-runtime-token-efficiency-summary.json",
|
||||
"required": true,
|
||||
"expectedTokenUsageSource": "mock-estimate",
|
||||
"productImpact": "P4",
|
||||
"qaImpact": "P1",
|
||||
"ownerAction": "Fix labeling before trusting token-efficiency comparisons.",
|
||||
"labels": ["qa-lab", "runtime-parity", "token-efficiency"]
|
||||
},
|
||||
{
|
||||
"id": "fault-injection-mock",
|
||||
"title": "Mock fault-injection runtime parity",
|
||||
"kind": "qa-suite-summary",
|
||||
"artifact": "fault-injection-mock/qa-suite-summary.json",
|
||||
"required": true,
|
||||
"skipBackfillLane": "codex-native-live",
|
||||
"productImpact": "P2",
|
||||
"qaImpact": "P0",
|
||||
"ownerAction": "Treat failures as retry/recovery regressions unless evidence shows fixture drift.",
|
||||
"labels": ["qa-lab", "runtime-parity", "fault-injection"]
|
||||
},
|
||||
{
|
||||
"id": "jsonl-expanded",
|
||||
"title": "Expanded curated JSONL replay",
|
||||
"kind": "jsonl-replay-summary",
|
||||
"artifact": "jsonl-expanded/qa-jsonl-replay-summary.json",
|
||||
"required": true,
|
||||
"productImpact": "P2",
|
||||
"qaImpact": "P0",
|
||||
"ownerAction": "Inspect first drift turn and transcript class before filing any product issue.",
|
||||
"labels": ["qa-lab", "runtime-parity", "jsonl-replay"]
|
||||
},
|
||||
{
|
||||
"id": "confidence-self-test",
|
||||
"title": "Seeded confidence negative controls",
|
||||
"kind": "self-test-summary",
|
||||
"artifact": "confidence-self-test/qa-confidence-self-test-summary.json",
|
||||
"required": true,
|
||||
"productImpact": "P4",
|
||||
"qaImpact": "P0",
|
||||
"ownerAction": "Fix the harness before trusting any green parity result.",
|
||||
"labels": ["qa-lab", "confidence-gate", "negative-controls"]
|
||||
},
|
||||
{
|
||||
"id": "codex-native-live",
|
||||
"title": "Codex-native live workspace capability proof",
|
||||
"kind": "qa-suite-summary",
|
||||
"artifact": "codex-native-live/qa-suite-summary.json",
|
||||
"required": true,
|
||||
"missingVerdict": "environment-blocked",
|
||||
"missingReason": "Live/OAuth runner or OpenAI credentials were unavailable for this proof bundle.",
|
||||
"productImpact": "P1",
|
||||
"qaImpact": "P1",
|
||||
"ownerAction": "Run with live-frontier OAuth before using this lane as product proof.",
|
||||
"labels": ["qa-lab", "runtime-parity", "live-proof"]
|
||||
},
|
||||
{
|
||||
"id": "first-hour-live",
|
||||
"title": "Live first-hour capability proof",
|
||||
"kind": "qa-suite-summary",
|
||||
"artifact": "first-hour-live/qa-suite-summary.json",
|
||||
"required": true,
|
||||
"missingVerdict": "environment-blocked",
|
||||
"missingReason": "Live/OAuth runner or OpenAI credentials were unavailable for this proof bundle.",
|
||||
"productImpact": "P1",
|
||||
"qaImpact": "P1",
|
||||
"ownerAction": "Run with live-frontier OAuth before claiming live first-hour coverage.",
|
||||
"labels": ["qa-lab", "runtime-parity", "live-proof"]
|
||||
},
|
||||
{
|
||||
"id": "openclaw-dynamic-tools-searchable-live",
|
||||
"title": "Live OpenClaw dynamic tools searchable proof",
|
||||
"kind": "qa-suite-summary",
|
||||
"artifact": "openclaw-dynamic-tools-searchable-live/qa-suite-summary.json",
|
||||
"required": true,
|
||||
"missingVerdict": "environment-blocked",
|
||||
"missingReason": "Live/OAuth runner or OpenAI credentials were unavailable for this proof bundle.",
|
||||
"productImpact": "P1",
|
||||
"qaImpact": "P1",
|
||||
"ownerAction": "Run with live-frontier OAuth before claiming production-shaped searchable OpenClaw dynamic tool coverage.",
|
||||
"labels": ["qa-lab", "runtime-parity", "searchable-tools", "live-proof"]
|
||||
},
|
||||
{
|
||||
"id": "live-token-efficiency",
|
||||
"title": "Live assistant-message token efficiency",
|
||||
"kind": "token-efficiency-summary",
|
||||
"artifact": "live-token-efficiency/qa-runtime-token-efficiency-summary.json",
|
||||
"required": true,
|
||||
"expectedTokenUsageSource": "live-usage",
|
||||
"missingVerdict": "environment-blocked",
|
||||
"missingReason": "Live/OAuth runner or OpenAI credentials were unavailable for this proof bundle.",
|
||||
"productImpact": "P3",
|
||||
"qaImpact": "P1",
|
||||
"ownerAction": "Run a live-frontier runtime parity summary and regenerate token efficiency.",
|
||||
"labels": ["qa-lab", "runtime-parity", "token-efficiency"]
|
||||
},
|
||||
{
|
||||
"id": "soak-100",
|
||||
"title": "Optional 100-turn soak",
|
||||
"kind": "qa-suite-summary",
|
||||
"artifact": "soak-100/qa-suite-summary.json",
|
||||
"required": true,
|
||||
"missingVerdict": "environment-blocked",
|
||||
"missingReason": "Scheduled/Testbox soak runner did not upload artifacts for this proof bundle.",
|
||||
"productImpact": "P3",
|
||||
"qaImpact": "P2",
|
||||
"ownerAction": "Run remotely with a long timeout or record the runner budget blocker.",
|
||||
"labels": ["qa-lab", "runtime-parity", "soak"]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -13,6 +13,12 @@ import {
|
||||
import { resolveQaParityPackScenarioIds } from "./agentic-parity.js";
|
||||
import { runQaCharacterEval, type QaCharacterModelOptions } from "./character-eval.js";
|
||||
import { resolveRepoRelativeOutputDir } from "./cli-paths.js";
|
||||
import {
|
||||
buildQaConfidenceReport,
|
||||
readQaConfidenceManifestFile,
|
||||
renderQaConfidenceMarkdownReport,
|
||||
writeQaConfidenceSelfTestArtifacts,
|
||||
} from "./confidence-report.js";
|
||||
import {
|
||||
buildQaCoverageInventory,
|
||||
findQaScenarioMatches,
|
||||
@@ -786,6 +792,60 @@ export async function runQaParityReportCommand(opts: {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Builds a QA confidence report from a manifest and writes it to disk.
 *
 * Steps, in order:
 *  1. Resolve `repoRoot` (defaults to `process.cwd()`), then resolve the
 *     manifest path and artifact root relative to it.
 *  2. Pick the output directory: the result of `resolveRepoRelativeOutputDir`,
 *     or `.artifacts/qa-e2e/confidence-<base36 timestamp>` under `repoRoot`
 *     when that returns null/undefined. The directory is created recursively.
 *  3. Read the manifest, build the report payload, and render it as Markdown.
 *  4. Write `qa-confidence-report.md` and `qa-confidence-summary.json`
 *     (pretty-printed, trailing newline) into the output directory.
 *  5. Print both paths and the verdict to stdout.
 *
 * Sets `process.exitCode = 1` when `reportPayload.pass` is falsy; it does not
 * throw or exit on a failing verdict.
 */
export async function runQaConfidenceReportCommand(opts: {
|
||||
repoRoot?: string;
|
||||
manifest: string;
|
||||
artifactRoot?: string;
|
||||
outputDir?: string;
|
||||
strictZeroUnknowns?: boolean;
|
||||
strictGlobalPass?: boolean;
|
||||
}) {
|
||||
const repoRoot = path.resolve(opts.repoRoot ?? process.cwd());
|
||||
const manifestPath = path.resolve(repoRoot, opts.manifest);
|
||||
const artifactRoot = path.resolve(repoRoot, opts.artifactRoot ?? ".");
|
||||
const outputDir =
|
||||
resolveRepoRelativeOutputDir(repoRoot, opts.outputDir) ??
|
||||
path.join(repoRoot, ".artifacts", "qa-e2e", `confidence-${Date.now().toString(36)}`);
|
||||
await fs.mkdir(outputDir, { recursive: true });
|
||||
const manifest = await readQaConfidenceManifestFile(manifestPath);
|
||||
const reportPayload = await buildQaConfidenceReport({
|
||||
manifest,
|
||||
artifactRoot,
|
||||
// Only an explicit `true` enables a strict mode; undefined is treated as off.
strictZeroUnknowns: opts.strictZeroUnknowns === true,
|
||||
strictGlobalPass: opts.strictGlobalPass === true,
|
||||
});
|
||||
const report = renderQaConfidenceMarkdownReport(reportPayload);
|
||||
const reportPath = path.join(outputDir, "qa-confidence-report.md");
|
||||
const summaryPath = path.join(outputDir, "qa-confidence-summary.json");
|
||||
await fs.writeFile(reportPath, report, "utf8");
|
||||
await fs.writeFile(summaryPath, `${JSON.stringify(reportPayload, null, 2)}\n`, "utf8");
|
||||
process.stdout.write(`QA confidence report: ${reportPath}\n`);
|
||||
process.stdout.write(`QA confidence summary: ${summaryPath}\n`);
|
||||
process.stdout.write(`QA confidence verdict: ${reportPayload.pass ? "pass" : "fail"}\n`);
|
||||
if (!reportPayload.pass) {
|
||||
process.exitCode = 1;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Runs the confidence self-test and reports where its artifacts were written.
 *
 * The output directory is the result of `resolveRepoRelativeOutputDir`, or
 * `.artifacts/qa-e2e/confidence-self-test-<base36 timestamp>` under `repoRoot`
 * when that returns null/undefined. The report path, summary path and verdict
 * are printed to stdout, and `process.exitCode` is set to 1 when
 * `result.summary.pass` is falsy.
 *
 * NOTE(review): this function does not create `outputDir` itself; presumably
 * `writeQaConfidenceSelfTestArtifacts` does — confirm.
 */
export async function runQaConfidenceSelfTestCommand(opts: {
|
||||
repoRoot?: string;
|
||||
outputDir?: string;
|
||||
}) {
|
||||
const repoRoot = path.resolve(opts.repoRoot ?? process.cwd());
|
||||
const outputDir =
|
||||
resolveRepoRelativeOutputDir(repoRoot, opts.outputDir) ??
|
||||
path.join(repoRoot, ".artifacts", "qa-e2e", `confidence-self-test-${Date.now().toString(36)}`);
|
||||
const result = await writeQaConfidenceSelfTestArtifacts({ outputDir });
|
||||
process.stdout.write(`QA confidence self-test report: ${result.reportPath}\n`);
|
||||
process.stdout.write(`QA confidence self-test summary: ${result.summaryPath}\n`);
|
||||
process.stdout.write(
|
||||
`QA confidence self-test verdict: ${result.summary.pass ? "pass" : "fail"}\n`,
|
||||
);
|
||||
if (!result.summary.pass) {
|
||||
process.exitCode = 1;
|
||||
}
|
||||
}
|
||||
|
||||
export async function runQaCoverageReportCommand(opts: {
|
||||
repoRoot?: string;
|
||||
output?: string;
|
||||
|
||||
@@ -72,6 +72,23 @@ async function runQaParityReport(opts: {
|
||||
await runtime.runQaParityReportCommand(opts);
|
||||
}
|
||||
|
||||
/**
 * CLI-side wrapper: obtains the runtime module via `loadQaLabCliRuntime()`
 * and forwards `opts` unchanged to `runQaConfidenceReportCommand`.
 */
async function runQaConfidenceReport(opts: {
|
||||
repoRoot?: string;
|
||||
manifest: string;
|
||||
artifactRoot?: string;
|
||||
outputDir?: string;
|
||||
strictZeroUnknowns?: boolean;
|
||||
strictGlobalPass?: boolean;
|
||||
}) {
|
||||
const runtime = await loadQaLabCliRuntime();
|
||||
await runtime.runQaConfidenceReportCommand(opts);
|
||||
}
|
||||
|
||||
/**
 * CLI-side wrapper: obtains the runtime module via `loadQaLabCliRuntime()`
 * and forwards `opts` unchanged to `runQaConfidenceSelfTestCommand`.
 */
async function runQaConfidenceSelfTest(opts: { repoRoot?: string; outputDir?: string }) {
|
||||
const runtime = await loadQaLabCliRuntime();
|
||||
await runtime.runQaConfidenceSelfTestCommand(opts);
|
||||
}
|
||||
|
||||
async function runQaCoverageReport(opts: {
|
||||
repoRoot?: string;
|
||||
output?: string;
|
||||
@@ -424,6 +441,43 @@ export function registerQaLabCli(program: Command) {
|
||||
},
|
||||
);
|
||||
|
||||
qa.command("confidence-report")
|
||||
.description("Classify QA proof artifacts into a zero-unknown confidence report")
|
||||
.requiredOption("--manifest <path>", "Confidence profile manifest JSON")
|
||||
.option("--repo-root <path>", "Repository root to target when running from a neutral cwd")
|
||||
.option("--artifact-root <path>", "Root directory for relative artifact paths", ".")
|
||||
.option("--output-dir <path>", "Artifact directory for the confidence report")
|
||||
.option(
|
||||
"--strict-zero-unknowns",
|
||||
"Fail unless every lane passes or has an explicit non-unknown verdict",
|
||||
false,
|
||||
)
|
||||
.option(
|
||||
"--strict-global-pass",
|
||||
"Fail unless every lane passes with no blocked, missing, unknown, classified-fail, or unbackfilled skipped rows",
|
||||
false,
|
||||
)
|
||||
.action(
|
||||
async (opts: {
|
||||
repoRoot?: string;
|
||||
manifest: string;
|
||||
artifactRoot?: string;
|
||||
outputDir?: string;
|
||||
strictZeroUnknowns?: boolean;
|
||||
strictGlobalPass?: boolean;
|
||||
}) => {
|
||||
await runQaConfidenceReport(opts);
|
||||
},
|
||||
);
|
||||
|
||||
qa.command("confidence-self-test")
|
||||
.description("Write seeded negative-control canaries proving the confidence gate detects drift")
|
||||
.option("--repo-root <path>", "Repository root to target when running from a neutral cwd")
|
||||
.option("--output-dir <path>", "Artifact directory for the confidence self-test")
|
||||
.action(async (opts: { repoRoot?: string; outputDir?: string }) => {
|
||||
await runQaConfidenceSelfTest(opts);
|
||||
});
|
||||
|
||||
qa.command("jsonl-replay")
|
||||
.description("Replay curated JSONL transcripts through the runtime parity replay harness")
|
||||
.option("--repo-root <path>", "Repository root to target when running from a neutral cwd")
|
||||
|
||||
881
extensions/qa-lab/src/confidence-report.test.ts
Normal file
881
extensions/qa-lab/src/confidence-report.test.ts
Normal file
@@ -0,0 +1,881 @@
|
||||
import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { afterEach, beforeEach, describe, expect, it } from "vitest";
|
||||
import {
|
||||
buildQaConfidenceReport,
|
||||
buildQaConfidenceSelfTestSummary,
|
||||
renderQaConfidenceMarkdownReport,
|
||||
writeQaConfidenceSelfTestArtifacts,
|
||||
type QaConfidenceManifest,
|
||||
} from "./confidence-report.js";
|
||||
|
||||
describe("qa confidence report", () => {
|
||||
let tempRoot: string;
|
||||
|
||||
beforeEach(async () => {
|
||||
tempRoot = await fs.mkdtemp(path.join(os.tmpdir(), "qa-confidence-"));
|
||||
});
|
||||
|
||||
afterEach(async () => {
|
||||
await fs.rm(tempRoot, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
async function writeJson(relativePath: string, payload: unknown) {
|
||||
const filePath = path.join(tempRoot, relativePath);
|
||||
await fs.mkdir(path.dirname(filePath), { recursive: true });
|
||||
await fs.writeFile(filePath, `${JSON.stringify(payload, null, 2)}\n`, "utf8");
|
||||
return filePath;
|
||||
}
|
||||
|
||||
it("passes strict zero-unknowns when every lane passes or has an allowed blocked verdict", async () => {
|
||||
await writeJson("tool-defaults/qa-suite-summary.json", {
|
||||
counts: { total: 20, passed: 18, skipped: 2, failed: 0 },
|
||||
scenarios: [],
|
||||
});
|
||||
await writeJson("token/qa-runtime-token-efficiency-summary.json", {
|
||||
status: "estimated",
|
||||
pass: true,
|
||||
rows: [{ scenarioId: "one", usageSource: "mock-estimate" }],
|
||||
});
|
||||
|
||||
const manifest: QaConfidenceManifest = {
|
||||
version: 1,
|
||||
profile: "codex-100",
|
||||
lanes: [
|
||||
{
|
||||
id: "tool-defaults-direct",
|
||||
title: "Tool defaults direct",
|
||||
kind: "qa-suite-summary",
|
||||
artifact: "tool-defaults/qa-suite-summary.json",
|
||||
required: true,
|
||||
},
|
||||
{
|
||||
id: "mock-token-efficiency",
|
||||
title: "Mock token efficiency",
|
||||
kind: "token-efficiency-summary",
|
||||
artifact: "token/qa-runtime-token-efficiency-summary.json",
|
||||
required: true,
|
||||
expectedTokenUsageSource: "mock-estimate",
|
||||
},
|
||||
{
|
||||
id: "live-token-efficiency",
|
||||
title: "Live token efficiency",
|
||||
kind: "token-efficiency-summary",
|
||||
artifact: "live/qa-runtime-token-efficiency-summary.json",
|
||||
required: true,
|
||||
missingVerdict: "environment-blocked",
|
||||
missingReason: "OPENAI OAuth credentials are not available in this runner.",
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const report = await buildQaConfidenceReport({
|
||||
manifest,
|
||||
artifactRoot: tempRoot,
|
||||
strictZeroUnknowns: true,
|
||||
generatedAt: "2026-05-12T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(report.pass).toBe(true);
|
||||
expect(report.globalPass).toBe(false);
|
||||
expect(report.counts).toMatchObject({ passed: 2, blocked: 1, unknown: 0, failed: 0 });
|
||||
expect(report.lanes.map((lane) => lane.verdict)).toEqual([
|
||||
"pass",
|
||||
"pass",
|
||||
"environment-blocked",
|
||||
]);
|
||||
expect(report.lanes[0]?.artifactPath).toBe("tool-defaults/qa-suite-summary.json");
|
||||
expect(report.lanes[0]?.artifactPath).not.toContain(tempRoot);
|
||||
expect(report.lanes[0]?.details).toContain("counts.skipped=2");
|
||||
expect(renderQaConfidenceMarkdownReport(report)).toContain("Zero unknowns: yes");
|
||||
expect(renderQaConfidenceMarkdownReport(report)).toContain("Global pass: no");
|
||||
});
|
||||
|
||||
it("does not let optional lanes block strict gates", async () => {
|
||||
await writeJson("required/qa-suite-summary.json", {
|
||||
counts: { total: 1, passed: 1, skipped: 0, failed: 0 },
|
||||
scenarios: [],
|
||||
});
|
||||
|
||||
const report = await buildQaConfidenceReport({
|
||||
manifest: {
|
||||
version: 1,
|
||||
profile: "codex-100",
|
||||
lanes: [
|
||||
{
|
||||
id: "required",
|
||||
title: "Required",
|
||||
kind: "qa-suite-summary",
|
||||
artifact: "required/qa-suite-summary.json",
|
||||
required: true,
|
||||
},
|
||||
{
|
||||
id: "optional-missing",
|
||||
title: "Optional missing",
|
||||
kind: "qa-suite-summary",
|
||||
artifact: "optional/qa-suite-summary.json",
|
||||
required: false,
|
||||
},
|
||||
],
|
||||
},
|
||||
artifactRoot: tempRoot,
|
||||
strictZeroUnknowns: true,
|
||||
strictGlobalPass: true,
|
||||
generatedAt: "2026-05-13T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(report.pass).toBe(true);
|
||||
expect(report.counts).toMatchObject({ total: 1, passed: 1, unknown: 0 });
|
||||
expect(report.failures).toEqual([]);
|
||||
expect(report.lanes[1]).toMatchObject({ id: "optional-missing", status: "missing" });
|
||||
});
|
||||
|
||||
it("fails strict global pass when any lane is blocked, missing, unknown, or classified failed", async () => {
|
||||
await writeJson("classified/qa-suite-summary.json", {
|
||||
counts: { total: 1, passed: 0, skipped: 0, failed: 1 },
|
||||
scenarios: [{ name: "classified", status: "fail" }],
|
||||
});
|
||||
await writeJson("unknown/qa-suite-summary.json", {
|
||||
counts: { total: 1, passed: 0, skipped: 0, failed: 1 },
|
||||
scenarios: [{ name: "unknown", status: "fail" }],
|
||||
});
|
||||
|
||||
const report = await buildQaConfidenceReport({
|
||||
manifest: {
|
||||
version: 1,
|
||||
profile: "codex-100",
|
||||
lanes: [
|
||||
{
|
||||
id: "blocked-live",
|
||||
title: "Blocked live",
|
||||
kind: "qa-suite-summary",
|
||||
artifact: "live/qa-suite-summary.json",
|
||||
required: true,
|
||||
missingVerdict: "environment-blocked",
|
||||
missingReason: "OPENAI_API_KEY missing.",
|
||||
},
|
||||
{
|
||||
id: "missing-soak",
|
||||
title: "Missing soak",
|
||||
kind: "qa-suite-summary",
|
||||
artifact: "soak/qa-suite-summary.json",
|
||||
required: true,
|
||||
},
|
||||
{
|
||||
id: "classified-fixture",
|
||||
title: "Classified fixture",
|
||||
kind: "qa-suite-summary",
|
||||
artifact: "classified/qa-suite-summary.json",
|
||||
required: true,
|
||||
failureVerdict: "fixture-bug",
|
||||
},
|
||||
{
|
||||
id: "unknown-failure",
|
||||
title: "Unknown failure",
|
||||
kind: "qa-suite-summary",
|
||||
artifact: "unknown/qa-suite-summary.json",
|
||||
required: true,
|
||||
},
|
||||
],
|
||||
},
|
||||
artifactRoot: tempRoot,
|
||||
strictZeroUnknowns: true,
|
||||
strictGlobalPass: true,
|
||||
generatedAt: "2026-05-12T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(report.pass).toBe(false);
|
||||
expect(report.zeroUnknowns).toBe(false);
|
||||
expect(report.globalPass).toBe(false);
|
||||
expect(report.counts).toMatchObject({
|
||||
blocked: 1,
|
||||
missing: 1,
|
||||
failed: 1,
|
||||
unknown: 2,
|
||||
});
|
||||
expect(report.failures).toEqual([
|
||||
"blocked-live is blocked: OPENAI_API_KEY missing.",
|
||||
"missing-soak is missing: artifact missing and no missingVerdict was configured",
|
||||
"classified-fixture is classified fixture-bug: qa-suite-summary counts.failed=1 counts.total=1 counts.skipped=0",
|
||||
"unknown-failure is unclassified: qa-suite-summary counts.failed=1 counts.total=1 counts.skipped=0",
|
||||
]);
|
||||
});
|
||||
|
||||
it("fails strict global pass for skipped suite rows until a backfill lane passes", async () => {
|
||||
await writeJson("report-only/qa-suite-summary.json", {
|
||||
counts: { total: 3, passed: 2, skipped: 1, failed: 0 },
|
||||
scenarios: [],
|
||||
});
|
||||
|
||||
const report = await buildQaConfidenceReport({
|
||||
manifest: {
|
||||
version: 1,
|
||||
profile: "codex-100",
|
||||
lanes: [
|
||||
{
|
||||
id: "report-only",
|
||||
title: "Report-only",
|
||||
kind: "qa-suite-summary",
|
||||
artifact: "report-only/qa-suite-summary.json",
|
||||
required: true,
|
||||
},
|
||||
],
|
||||
},
|
||||
artifactRoot: tempRoot,
|
||||
strictZeroUnknowns: true,
|
||||
strictGlobalPass: true,
|
||||
generatedAt: "2026-05-12T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(report.zeroUnknowns).toBe(true);
|
||||
expect(report.globalPass).toBe(false);
|
||||
expect(report.failures).toEqual([
|
||||
"report-only has 1 skipped row(s) with no passing backfill lane",
|
||||
]);
|
||||
});
|
||||
|
||||
it("infers skipped suite rows from totals and scenario status", async () => {
|
||||
for (const [artifact, expectedDetail] of [
|
||||
[{ counts: { total: 3, passed: 2, failed: 0 }, scenarios: [] }, "counts.skipped=1"],
|
||||
[
|
||||
{
|
||||
counts: { total: 2, passed: 2, failed: 0 },
|
||||
scenarios: [
|
||||
{ name: "passing", status: "pass" },
|
||||
{ name: "skipped", status: "skip" },
|
||||
],
|
||||
},
|
||||
"counts.skipped=1",
|
||||
],
|
||||
] as const) {
|
||||
await writeJson("report-only/qa-suite-summary.json", artifact);
|
||||
|
||||
const report = await buildQaConfidenceReport({
|
||||
manifest: {
|
||||
version: 1,
|
||||
profile: "codex-100",
|
||||
lanes: [
|
||||
{
|
||||
id: "report-only",
|
||||
title: "Report-only",
|
||||
kind: "qa-suite-summary",
|
||||
artifact: "report-only/qa-suite-summary.json",
|
||||
required: true,
|
||||
},
|
||||
],
|
||||
},
|
||||
artifactRoot: tempRoot,
|
||||
strictZeroUnknowns: true,
|
||||
strictGlobalPass: true,
|
||||
generatedAt: "2026-05-12T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(report.globalPass).toBe(false);
|
||||
expect(report.failures).toEqual([
|
||||
"report-only has 1 skipped row(s) with no passing backfill lane",
|
||||
]);
|
||||
expect(report.lanes[0]).toMatchObject({ skippedCount: 1 });
|
||||
expect(report.lanes[0]?.details).toContain(expectedDetail);
|
||||
}
|
||||
});
|
||||
|
||||
it("rejects skipped token reports when a live usage source is required", async () => {
|
||||
await writeJson("live-token/qa-runtime-token-efficiency-summary.json", {
|
||||
status: "skipped",
|
||||
pass: true,
|
||||
rows: [],
|
||||
});
|
||||
|
||||
const report = await buildQaConfidenceReport({
|
||||
manifest: {
|
||||
version: 1,
|
||||
profile: "codex-100",
|
||||
lanes: [
|
||||
{
|
||||
id: "live-token-efficiency",
|
||||
title: "Live token efficiency",
|
||||
kind: "token-efficiency-summary",
|
||||
artifact: "live-token/qa-runtime-token-efficiency-summary.json",
|
||||
required: true,
|
||||
expectedTokenUsageSource: "live-usage",
|
||||
},
|
||||
],
|
||||
},
|
||||
artifactRoot: tempRoot,
|
||||
strictZeroUnknowns: true,
|
||||
generatedAt: "2026-05-12T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(report.pass).toBe(false);
|
||||
expect(report.lanes[0]).toMatchObject({
|
||||
status: "unknown",
|
||||
details: "token summary has no live-usage rows",
|
||||
});
|
||||
});
|
||||
|
||||
it("preserves partial zero-unknown mode for classified failing lanes", async () => {
|
||||
await writeJson("classified/qa-suite-summary.json", {
|
||||
counts: { total: 1, passed: 0, skipped: 0, failed: 1 },
|
||||
scenarios: [{ name: "classified", status: "fail" }],
|
||||
});
|
||||
|
||||
const report = await buildQaConfidenceReport({
|
||||
manifest: {
|
||||
version: 1,
|
||||
profile: "codex-100",
|
||||
lanes: [
|
||||
{
|
||||
id: "classified-fixture",
|
||||
title: "Classified fixture",
|
||||
kind: "qa-suite-summary",
|
||||
artifact: "classified/qa-suite-summary.json",
|
||||
required: true,
|
||||
failureVerdict: "fixture-bug",
|
||||
},
|
||||
],
|
||||
},
|
||||
artifactRoot: tempRoot,
|
||||
strictZeroUnknowns: true,
|
||||
generatedAt: "2026-05-12T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(report.pass).toBe(true);
|
||||
expect(report.zeroUnknowns).toBe(true);
|
||||
expect(report.globalPass).toBe(false);
|
||||
expect(report.counts.failed).toBe(1);
|
||||
});
|
||||
|
||||
it("passes strict global pass when skipped suite rows are backfilled by a passing lane", async () => {
|
||||
await writeJson("report-only/qa-suite-summary.json", {
|
||||
counts: { total: 3, passed: 2, skipped: 1, failed: 0 },
|
||||
scenarios: [],
|
||||
});
|
||||
await writeJson("live-backfill/qa-suite-summary.json", {
|
||||
counts: { total: 1, passed: 1, skipped: 0, failed: 0 },
|
||||
scenarios: [],
|
||||
});
|
||||
|
||||
const report = await buildQaConfidenceReport({
|
||||
manifest: {
|
||||
version: 1,
|
||||
profile: "codex-100",
|
||||
lanes: [
|
||||
{
|
||||
id: "report-only",
|
||||
title: "Report-only",
|
||||
kind: "qa-suite-summary",
|
||||
artifact: "report-only/qa-suite-summary.json",
|
||||
required: true,
|
||||
skipBackfillLane: "live-backfill",
|
||||
},
|
||||
{
|
||||
id: "live-backfill",
|
||||
title: "Live backfill",
|
||||
kind: "qa-suite-summary",
|
||||
artifact: "live-backfill/qa-suite-summary.json",
|
||||
required: true,
|
||||
},
|
||||
],
|
||||
},
|
||||
artifactRoot: tempRoot,
|
||||
strictZeroUnknowns: true,
|
||||
strictGlobalPass: true,
|
||||
generatedAt: "2026-05-12T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(report.pass).toBe(true);
|
||||
expect(report.zeroUnknowns).toBe(true);
|
||||
expect(report.globalPass).toBe(true);
|
||||
expect(report.lanes[0]).toMatchObject({
|
||||
skippedCount: 1,
|
||||
skipBackfillLane: "live-backfill",
|
||||
skipBackfilled: true,
|
||||
});
|
||||
});
|
||||
|
||||
it("classifies environment-blocking gateway sentinels without turning them into unknowns", async () => {
|
||||
await writeJson("live/qa-suite-summary.json", {
|
||||
counts: { total: 1, passed: 1, skipped: 0, failed: 0 },
|
||||
gatewayLogSentinels: [
|
||||
{
|
||||
kind: "live-quota-or-subscription",
|
||||
verdict: "environment-blocked",
|
||||
owner: "environment",
|
||||
productImpact: "P4",
|
||||
qaImpact: "P0",
|
||||
line: 12,
|
||||
text: "OpenAI quota exceeded",
|
||||
},
|
||||
],
|
||||
scenarios: [],
|
||||
});
|
||||
|
||||
const report = await buildQaConfidenceReport({
|
||||
manifest: {
|
||||
version: 1,
|
||||
profile: "codex-100",
|
||||
lanes: [
|
||||
{
|
||||
id: "first-hour-live",
|
||||
title: "First hour live",
|
||||
kind: "qa-suite-summary",
|
||||
artifact: "live/qa-suite-summary.json",
|
||||
required: true,
|
||||
},
|
||||
],
|
||||
},
|
||||
artifactRoot: tempRoot,
|
||||
strictZeroUnknowns: true,
|
||||
generatedAt: "2026-05-13T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(report.pass).toBe(true);
|
||||
expect(report.globalPass).toBe(false);
|
||||
expect(report.counts).toMatchObject({ blocked: 1, unknown: 0 });
|
||||
expect(report.lanes[0]).toMatchObject({
|
||||
status: "blocked",
|
||||
verdict: "environment-blocked",
|
||||
});
|
||||
});
|
||||
|
||||
it("does not let environment sentinels hide separate suite failures", async () => {
|
||||
await writeJson("live/qa-suite-summary.json", {
|
||||
counts: { total: 2, passed: 1, skipped: 0, failed: 1 },
|
||||
gatewayLogSentinels: [
|
||||
{
|
||||
kind: "live-quota-or-subscription",
|
||||
verdict: "environment-blocked",
|
||||
owner: "environment",
|
||||
line: 12,
|
||||
text: "OpenAI quota exceeded",
|
||||
},
|
||||
],
|
||||
scenarios: [
|
||||
{ name: "quota", status: "pass" },
|
||||
{ name: "unrelated-drift", status: "fail" },
|
||||
],
|
||||
});
|
||||
|
||||
const report = await buildQaConfidenceReport({
|
||||
manifest: {
|
||||
version: 1,
|
||||
profile: "codex-100",
|
||||
lanes: [
|
||||
{
|
||||
id: "first-hour-live",
|
||||
title: "First hour live",
|
||||
kind: "qa-suite-summary",
|
||||
artifact: "live/qa-suite-summary.json",
|
||||
required: true,
|
||||
missingVerdict: "environment-blocked",
|
||||
},
|
||||
],
|
||||
},
|
||||
artifactRoot: tempRoot,
|
||||
strictZeroUnknowns: true,
|
||||
generatedAt: "2026-05-13T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(report.pass).toBe(false);
|
||||
expect(report.counts).toMatchObject({ blocked: 0, unknown: 1 });
|
||||
expect(report.lanes[0]).toMatchObject({ status: "unknown" });
|
||||
expect(report.lanes[0]?.details).toContain("suite also reports failures");
|
||||
});
|
||||
|
||||
it("classifies product and plugin gateway sentinels as known failing lanes", async () => {
|
||||
await writeJson("live/qa-suite-summary.json", {
|
||||
counts: { total: 1, passed: 1, skipped: 0, failed: 0 },
|
||||
scenarios: [
|
||||
{
|
||||
name: "plugin hook health sentinel",
|
||||
status: "pass",
|
||||
steps: [],
|
||||
runtimeParity: {
|
||||
scenarioId: "plugin-hook-health-sentinel",
|
||||
drift: "none",
|
||||
cells: {
|
||||
pi: { sentinelFindings: [] },
|
||||
codex: {
|
||||
sentinelFindings: [
|
||||
{
|
||||
kind: "plugin-hook-failure",
|
||||
verdict: "qa-harness-bug",
|
||||
owner: "plugin",
|
||||
productImpact: "P1",
|
||||
qaImpact: "P0",
|
||||
line: 4,
|
||||
text: "before_prompt_build hook failed",
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const report = await buildQaConfidenceReport({
|
||||
manifest: {
|
||||
version: 1,
|
||||
profile: "codex-100",
|
||||
lanes: [
|
||||
{
|
||||
id: "first-hour-live",
|
||||
title: "First hour live",
|
||||
kind: "qa-suite-summary",
|
||||
artifact: "live/qa-suite-summary.json",
|
||||
required: true,
|
||||
},
|
||||
],
|
||||
},
|
||||
artifactRoot: tempRoot,
|
||||
strictZeroUnknowns: true,
|
||||
generatedAt: "2026-05-13T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(report.pass).toBe(true);
|
||||
expect(report.globalPass).toBe(false);
|
||||
expect(report.counts).toMatchObject({ failed: 1, unknown: 0 });
|
||||
expect(report.lanes[0]).toMatchObject({
|
||||
status: "fail",
|
||||
verdict: "qa-harness-bug",
|
||||
});
|
||||
});
|
||||
|
||||
it("treats corrupt artifacts as unknown instead of allowed missing lanes", async () => {
|
||||
const artifactPath = path.join(tempRoot, "live", "qa-suite-summary.json");
|
||||
await fs.mkdir(path.dirname(artifactPath), { recursive: true });
|
||||
await fs.writeFile(artifactPath, "{not-json", "utf8");
|
||||
|
||||
const report = await buildQaConfidenceReport({
|
||||
manifest: {
|
||||
version: 1,
|
||||
profile: "codex-100",
|
||||
lanes: [
|
||||
{
|
||||
id: "first-hour-live",
|
||||
title: "First hour live",
|
||||
kind: "qa-suite-summary",
|
||||
artifact: "live/qa-suite-summary.json",
|
||||
required: true,
|
||||
missingVerdict: "environment-blocked",
|
||||
},
|
||||
],
|
||||
},
|
||||
artifactRoot: tempRoot,
|
||||
strictZeroUnknowns: true,
|
||||
generatedAt: "2026-05-13T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(report.pass).toBe(false);
|
||||
expect(report.counts).toMatchObject({ blocked: 0, unknown: 1 });
|
||||
expect(report.lanes[0]).toMatchObject({
|
||||
status: "unknown",
|
||||
});
|
||||
expect(report.lanes[0]?.details).toContain("artifact unreadable");
|
||||
});
|
||||
|
||||
it("treats schema-invalid suite artifacts as unknown", async () => {
|
||||
await writeJson("live/qa-suite-summary.json", {});
|
||||
|
||||
const report = await buildQaConfidenceReport({
|
||||
manifest: {
|
||||
version: 1,
|
||||
profile: "codex-100",
|
||||
lanes: [
|
||||
{
|
||||
id: "first-hour-live",
|
||||
title: "First hour live",
|
||||
kind: "qa-suite-summary",
|
||||
artifact: "live/qa-suite-summary.json",
|
||||
required: true,
|
||||
},
|
||||
],
|
||||
},
|
||||
artifactRoot: tempRoot,
|
||||
strictZeroUnknowns: true,
|
||||
generatedAt: "2026-05-13T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(report.pass).toBe(false);
|
||||
expect(report.counts.unknown).toBe(1);
|
||||
expect(report.lanes[0]?.details).toContain("missing counts.failed and scenarios[]");
|
||||
});
|
||||
|
||||
it("treats empty suite artifacts as unknown", async () => {
|
||||
await writeJson("live/qa-suite-summary.json", {
|
||||
counts: { total: 0, passed: 0, skipped: 0, failed: 0 },
|
||||
scenarios: [],
|
||||
});
|
||||
|
||||
const report = await buildQaConfidenceReport({
|
||||
manifest: {
|
||||
version: 1,
|
||||
profile: "codex-100",
|
||||
lanes: [
|
||||
{
|
||||
id: "first-hour-live",
|
||||
title: "First hour live",
|
||||
kind: "qa-suite-summary",
|
||||
artifact: "live/qa-suite-summary.json",
|
||||
required: true,
|
||||
failureVerdict: "qa-harness-bug",
|
||||
},
|
||||
],
|
||||
},
|
||||
artifactRoot: tempRoot,
|
||||
strictZeroUnknowns: true,
|
||||
generatedAt: "2026-05-13T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(report.pass).toBe(false);
|
||||
expect(report.counts).toMatchObject({ failed: 0, unknown: 1 });
|
||||
expect(report.lanes[0]).toMatchObject({ status: "unknown" });
|
||||
expect(report.lanes[0]?.details).toContain("no executed scenarios");
|
||||
});
|
||||
|
||||
it("treats suite count and scenario mismatches as unknown", async () => {
|
||||
await writeJson("live/qa-suite-summary.json", {
|
||||
counts: { total: 2, passed: 2, skipped: 0, failed: 0 },
|
||||
scenarios: [
|
||||
{ name: "passing", status: "pass" },
|
||||
{ name: "stale-failure", status: "fail" },
|
||||
],
|
||||
});
|
||||
|
||||
const report = await buildQaConfidenceReport({
|
||||
manifest: {
|
||||
version: 1,
|
||||
profile: "codex-100",
|
||||
lanes: [
|
||||
{
|
||||
id: "first-hour-live",
|
||||
title: "First hour live",
|
||||
kind: "qa-suite-summary",
|
||||
artifact: "live/qa-suite-summary.json",
|
||||
required: true,
|
||||
failureVerdict: "qa-harness-bug",
|
||||
},
|
||||
],
|
||||
},
|
||||
artifactRoot: tempRoot,
|
||||
strictZeroUnknowns: true,
|
||||
generatedAt: "2026-05-13T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(report.pass).toBe(false);
|
||||
expect(report.counts).toMatchObject({ failed: 0, unknown: 1 });
|
||||
expect(report.lanes[0]).toMatchObject({ status: "unknown" });
|
||||
expect(report.lanes[0]?.details).toContain("count/scenario mismatch");
|
||||
});
|
||||
|
||||
it("requires generic summary lanes to expose an explicit pass signal", async () => {
|
||||
await writeJson("runtime/qa-runtime-parity-summary.json", {});
|
||||
|
||||
const report = await buildQaConfidenceReport({
|
||||
manifest: {
|
||||
version: 1,
|
||||
profile: "codex-100",
|
||||
lanes: [
|
||||
{
|
||||
id: "runtime-parity",
|
||||
title: "Runtime parity",
|
||||
kind: "runtime-parity-summary",
|
||||
artifact: "runtime/qa-runtime-parity-summary.json",
|
||||
required: true,
|
||||
},
|
||||
],
|
||||
},
|
||||
artifactRoot: tempRoot,
|
||||
strictZeroUnknowns: true,
|
||||
generatedAt: "2026-05-13T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(report.pass).toBe(false);
|
||||
expect(report.counts.unknown).toBe(1);
|
||||
expect(report.lanes[0]?.details).toContain("explicit pass signal");
|
||||
});
|
||||
|
||||
it("requires JSONL replay summaries to contain replayed user turns", async () => {
|
||||
for (const [artifact, expectedDetail] of [
|
||||
[{ transcripts: [] }, "no transcripts"],
|
||||
[
|
||||
{ transcripts: [{ transcriptPath: "empty.jsonl", userTurnCount: 0, drift: [] }] },
|
||||
"no replayed user turns",
|
||||
],
|
||||
[
|
||||
{ transcripts: [{ transcriptPath: "missing-drift.jsonl", userTurnCount: 1 }] },
|
||||
"missing drift array",
|
||||
],
|
||||
] as const) {
|
||||
await writeJson("jsonl/qa-jsonl-replay-summary.json", artifact);
|
||||
|
||||
const report = await buildQaConfidenceReport({
|
||||
manifest: {
|
||||
version: 1,
|
||||
profile: "codex-100",
|
||||
lanes: [
|
||||
{
|
||||
id: "jsonl-expanded",
|
||||
title: "Expanded JSONL replay",
|
||||
kind: "jsonl-replay-summary",
|
||||
artifact: "jsonl/qa-jsonl-replay-summary.json",
|
||||
required: true,
|
||||
failureVerdict: "fixture-bug",
|
||||
},
|
||||
],
|
||||
},
|
||||
artifactRoot: tempRoot,
|
||||
strictZeroUnknowns: true,
|
||||
generatedAt: "2026-05-13T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(report.pass).toBe(false);
|
||||
expect(report.counts).toMatchObject({ failed: 0, unknown: 1 });
|
||||
expect(report.lanes[0]).toMatchObject({ status: "unknown" });
|
||||
expect(report.lanes[0]?.details).toContain(expectedDetail);
|
||||
}
|
||||
});
|
||||
|
||||
it("requires confidence self-test summaries to contain every seeded canary", async () => {
|
||||
for (const [artifact, expectedDetail] of [
|
||||
[{ pass: true, canaries: [] }, "no canaries"],
|
||||
[
|
||||
{ pass: true, canaries: [{ id: "prompt-drift", detected: true }] },
|
||||
"missing expected canaries",
|
||||
],
|
||||
] as const) {
|
||||
await writeJson("confidence-self-test/qa-confidence-self-test-summary.json", artifact);
|
||||
|
||||
const report = await buildQaConfidenceReport({
|
||||
manifest: {
|
||||
version: 1,
|
||||
profile: "codex-100",
|
||||
lanes: [
|
||||
{
|
||||
id: "confidence-self-test",
|
||||
title: "Confidence self-test",
|
||||
kind: "self-test-summary",
|
||||
artifact: "confidence-self-test/qa-confidence-self-test-summary.json",
|
||||
required: true,
|
||||
failureVerdict: "qa-harness-bug",
|
||||
},
|
||||
],
|
||||
},
|
||||
artifactRoot: tempRoot,
|
||||
strictZeroUnknowns: true,
|
||||
generatedAt: "2026-05-13T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(report.pass).toBe(false);
|
||||
expect(report.counts).toMatchObject({ failed: 0, unknown: 1 });
|
||||
expect(report.lanes[0]).toMatchObject({ status: "unknown" });
|
||||
expect(report.lanes[0]?.details).toContain(expectedDetail);
|
||||
}
|
||||
});
|
||||
|
||||
it("fails strict zero-unknowns for an unclassified failing lane", async () => {
|
||||
await writeJson("first-hour/qa-suite-summary.json", {
|
||||
counts: { total: 18, passed: 17, failed: 1 },
|
||||
scenarios: [{ name: "approval-turn-tool-followthrough", status: "fail", steps: [] }],
|
||||
});
|
||||
|
||||
const report = await buildQaConfidenceReport({
|
||||
manifest: {
|
||||
version: 1,
|
||||
profile: "codex-100",
|
||||
lanes: [
|
||||
{
|
||||
id: "first-hour-20-direct",
|
||||
title: "First-hour 20 direct",
|
||||
kind: "qa-suite-summary",
|
||||
artifact: "first-hour/qa-suite-summary.json",
|
||||
required: true,
|
||||
},
|
||||
],
|
||||
},
|
||||
artifactRoot: tempRoot,
|
||||
strictZeroUnknowns: true,
|
||||
generatedAt: "2026-05-12T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(report.pass).toBe(false);
|
||||
expect(report.counts.unknown).toBe(1);
|
||||
expect(report.failures[0]).toContain("first-hour-20-direct is unclassified");
|
||||
});
|
||||
|
||||
it("accepts a classified failing lane without treating it as unknown", async () => {
|
||||
await writeJson("jsonl/qa-jsonl-replay-summary.json", {
|
||||
transcripts: [
|
||||
{
|
||||
transcriptPath: "curated.jsonl",
|
||||
userTurnCount: 2,
|
||||
drift: ["none", "tool-result-shape"],
|
||||
firstDriftAtTurn: 2,
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const report = await buildQaConfidenceReport({
|
||||
manifest: {
|
||||
version: 1,
|
||||
profile: "codex-100",
|
||||
lanes: [
|
||||
{
|
||||
id: "jsonl-expanded",
|
||||
title: "Expanded JSONL replay",
|
||||
kind: "jsonl-replay-summary",
|
||||
artifact: "jsonl/qa-jsonl-replay-summary.json",
|
||||
required: true,
|
||||
failureVerdict: "fixture-bug",
|
||||
productImpact: "P4",
|
||||
qaImpact: "P1",
|
||||
},
|
||||
],
|
||||
},
|
||||
artifactRoot: tempRoot,
|
||||
strictZeroUnknowns: true,
|
||||
generatedAt: "2026-05-12T00:00:00.000Z",
|
||||
});
|
||||
|
||||
expect(report.pass).toBe(true);
|
||||
expect(report.globalPass).toBe(false);
|
||||
expect(report.counts.failed).toBe(1);
|
||||
expect(report.counts.unknown).toBe(0);
|
||||
expect(report.lanes[0]).toMatchObject({
|
||||
status: "fail",
|
||||
verdict: "fixture-bug",
|
||||
productImpact: "P4",
|
||||
qaImpact: "P1",
|
||||
});
|
||||
});
|
||||
|
||||
it("emits confidence self-test canaries for every drift class we need to catch", async () => {
|
||||
const summary = await buildQaConfidenceSelfTestSummary("2026-05-12T00:00:00.000Z");
|
||||
|
||||
expect(summary.pass).toBe(true);
|
||||
expect(summary.canaries.map((canary) => canary.id)).toEqual([
|
||||
"prompt-drift",
|
||||
"tool-description-schema-drift",
|
||||
"runtime-tool-call-drop",
|
||||
"tool-result-mismatch",
|
||||
"failure-mode-drift",
|
||||
"token-efficiency-regression",
|
||||
"jsonl-replay-ordering-drift",
|
||||
]);
|
||||
expect(summary.canaries.every((canary) => canary.detected)).toBe(true);
|
||||
});
|
||||
|
||||
it("writes confidence self-test artifacts", async () => {
|
||||
const result = await writeQaConfidenceSelfTestArtifacts({
|
||||
outputDir: tempRoot,
|
||||
generatedAt: "2026-05-12T00:00:00.000Z",
|
||||
});
|
||||
|
||||
await expect(fs.stat(result.summaryPath)).resolves.toBeTruthy();
|
||||
await expect(fs.stat(result.reportPath)).resolves.toBeTruthy();
|
||||
const summary = JSON.parse(await fs.readFile(result.summaryPath, "utf8")) as { pass: boolean };
|
||||
expect(summary.pass).toBe(true);
|
||||
});
|
||||
});
|
||||
1238
extensions/qa-lab/src/confidence-report.ts
Normal file
1238
extensions/qa-lab/src/confidence-report.ts
Normal file
File diff suppressed because it is too large
Load Diff
284
extensions/qa-lab/src/harness-parity.test.ts
Normal file
284
extensions/qa-lab/src/harness-parity.test.ts
Normal file
@@ -0,0 +1,284 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
buildHarnessParityCell,
|
||||
buildHarnessParityResult,
|
||||
type HarnessRuntimeParityCell,
|
||||
type HarnessVariant,
|
||||
} from "./harness-parity.js";
|
||||
import type { RuntimeId } from "./runtime-parity.js";
|
||||
import type { RuntimeParityComparisonMode } from "./runtime-tool-metadata.js";
|
||||
|
||||
// Two harness variants on the same runtime, so any drift reported by the tests
// comes from the cell contents rather than from a runtime difference.
const LEFT: HarnessVariant = { id: "left", label: "Left", runtime: "pi" };
const RIGHT: HarnessVariant = { id: "right", label: "Right", runtime: "pi" };

// Baseline prompt report shared by both sides; individual tests override one
// field at a time to provoke a single drift class.
const BASE_PROMPT_REPORT = {
  systemPrompt: {
    chars: 100,
    projectContextChars: 40,
    nonProjectContextChars: 60,
    hash: "system-a",
  },
  skills: {
    promptChars: 12,
    hash: "skills-a",
  },
  tools: {
    schemaChars: 20,
    entries: [
      {
        name: "read",
        summaryChars: 8,
        summaryHash: "summary-a",
        schemaChars: 20,
        schemaHash: "schema-a",
        propertiesCount: 1,
      },
    ],
  },
};
|
||||
|
||||
/**
 * Builds a runtime parity cell fixture for the given runtime.
 *
 * @param runtime - Runtime id stored on the cell.
 * @param overrides - Fields that replace the defaults; spread last so they win.
 * @returns A cell with a one-line assistant transcript, no tool calls, fixed
 *   usage numbers and the shared baseline prompt report unless overridden.
 */
function makeCell(
  runtime: RuntimeId,
  overrides: Partial<HarnessRuntimeParityCell> = {},
): HarnessRuntimeParityCell {
  return {
    runtime,
    transcriptBytes: '{"message":{"role":"assistant","content":"same"}}\n',
    toolCalls: [],
    finalText: "same",
    usage: { inputTokens: 10, outputTokens: 5, totalTokens: 15 },
    wallClockMs: 1,
    bootStateLines: [],
    systemPromptReport: BASE_PROMPT_REPORT,
    ...overrides,
  };
}
|
||||
|
||||
/**
 * Runs a full left/right parity comparison and returns only the drift label.
 *
 * @param left - Overrides applied to the left fixture cell.
 * @param right - Overrides applied to the right fixture cell.
 * @param comparisonMode - Optional comparison mode; the key is omitted from the
 *   call entirely when not provided.
 * @returns The `drift` field of the parity result.
 */
function classify(
  left: Partial<HarnessRuntimeParityCell>,
  right: Partial<HarnessRuntimeParityCell>,
  comparisonMode?: RuntimeParityComparisonMode,
) {
  return buildHarnessParityResult({
    scenarioId: "scenario",
    left: buildHarnessParityCell({
      variant: LEFT,
      cell: makeCell("pi", left),
      tokenUsageSource: "live-usage",
    }),
    right: buildHarnessParityCell({
      variant: RIGHT,
      cell: makeCell("pi", right),
      tokenUsageSource: "live-usage",
    }),
    ...(comparisonMode ? { comparisonMode } : {}),
  }).drift;
}
|
||||
|
||||
describe("harness parity", () => {
  it("classifies prompt and tool surface drift before behavioral drift", () => {
    // System prompt size changed (and the hash is absent on the right side).
    expect(
      classify(
        {},
        {
          systemPromptReport: {
            ...BASE_PROMPT_REPORT,
            systemPrompt: { chars: 101, projectContextChars: 40, nonProjectContextChars: 61 },
          },
        },
      ),
    ).toBe("system-prompt");
    // Same sizes, different system prompt hash.
    expect(
      classify(
        {},
        {
          systemPromptReport: {
            ...BASE_PROMPT_REPORT,
            systemPrompt: {
              chars: 100,
              projectContextChars: 40,
              nonProjectContextChars: 60,
              hash: "system-b",
            },
          },
        },
      ),
    ).toBe("system-prompt");
    // Skills hash drift is reported under the system-prompt class.
    expect(
      classify(
        {},
        {
          systemPromptReport: {
            ...BASE_PROMPT_REPORT,
            skills: { promptChars: 12, hash: "skills-b" },
          },
        },
      ),
    ).toBe("system-prompt");
    // Only the tool summary hash differs.
    expect(
      classify(
        {},
        {
          systemPromptReport: {
            ...BASE_PROMPT_REPORT,
            tools: {
              schemaChars: 20,
              entries: [
                {
                  name: "read",
                  summaryChars: 8,
                  summaryHash: "summary-b",
                  schemaChars: 20,
                  schemaHash: "schema-a",
                  propertiesCount: 1,
                },
              ],
            },
          },
        },
      ),
    ).toBe("tool-description");
    // Only the tool schema hash differs.
    expect(
      classify(
        {},
        {
          systemPromptReport: {
            ...BASE_PROMPT_REPORT,
            tools: {
              schemaChars: 20,
              entries: [
                {
                  name: "read",
                  summaryChars: 8,
                  summaryHash: "summary-a",
                  schemaChars: 20,
                  schemaHash: "schema-b",
                  propertiesCount: 1,
                },
              ],
            },
          },
        },
      ),
    ).toBe("tool-schema");
  });

  it("classifies behavioral harness drift", () => {
    // Same tool, different argument hash.
    expect(
      classify(
        { toolCalls: [{ tool: "read", argsHash: "a", resultHash: "r" }] },
        { toolCalls: [{ tool: "read", argsHash: "b", resultHash: "r" }] },
      ),
    ).toBe("tool-call-shape");
    // Same call, different result hash.
    expect(
      classify(
        { toolCalls: [{ tool: "read", argsHash: "a", resultHash: "r1" }] },
        { toolCalls: [{ tool: "read", argsHash: "a", resultHash: "r2" }] },
      ),
    ).toBe("tool-result-shape");
    expect(classify({ finalText: "same text" }, { finalText: "different text" })).toBe("text-only");
    // Bookkeeping rows without a message role must not count as structural drift.
    expect(
      classify(
        {
          transcriptBytes:
            '{"type":"model_change","modelId":"gpt-5.5"}\n' +
            '{"type":"thinking_level_change","thinkingLevel":"off"}\n' +
            '{"type":"custom","customType":"model-snapshot"}\n' +
            '{"message":{"role":"assistant","content":"same"}}\n',
        },
        { transcriptBytes: '{"message":{"role":"assistant","content":"same"}}\n' },
      ),
    ).toBe("none");
    // An extra role-bearing row (nested under `message`) is structural drift.
    expect(
      classify(
        { transcriptBytes: '{"message":{"role":"assistant"}}\n' },
        { transcriptBytes: '{"message":{"role":"assistant"}}\n{"message":{"role":"tool"}}\n' },
      ),
    ).toBe("structural");
    // Same check for rows that carry `role` at the top level.
    expect(
      classify(
        { transcriptBytes: '{"role":"assistant","content":"same"}\n' },
        {
          transcriptBytes:
            '{"role":"assistant","content":"same"}\n{"role":"tool","content":"same"}\n',
        },
      ),
    ).toBe("structural");
    expect(classify({ runtimeErrorClass: "timeout" }, {})).toBe("failure-mode");
  });

  it("honors native workspace comparison mode for outcome-only harness proofs", () => {
    expect(
      classify(
        {
          transcriptBytes:
            '{"message":{"role":"assistant","content":"same"}}\n' +
            '{"message":{"role":"tool","content":"same result"}}\n',
          toolCalls: [{ tool: "bash", argsHash: "sed-160", resultHash: "same-result" }],
        },
        {
          transcriptBytes: '{"message":{"role":"assistant","content":"same"}}\n',
          toolCalls: [{ tool: "bash", argsHash: "sed-200", resultHash: "same-result" }],
        },
        "codex-native-workspace",
      ),
    ).toBe("none");

    expect(
      classify(
        { toolCalls: [{ tool: "bash", argsHash: "a", resultHash: "r1" }] },
        { toolCalls: [{ tool: "bash", argsHash: "b", resultHash: "r2" }] },
        "outcome-only",
      ),
    ).toBe("none");
  });

  it("keeps prompt and tool surface checks strict under native workspace comparison mode", () => {
    expect(
      classify(
        {},
        {
          systemPromptReport: {
            ...BASE_PROMPT_REPORT,
            systemPrompt: { chars: 101, projectContextChars: 40, nonProjectContextChars: 61 },
          },
          toolCalls: [{ tool: "bash", argsHash: "changed", resultHash: "changed" }],
        },
        "codex-native-workspace",
      ),
    ).toBe("system-prompt");
    expect(
      classify(
        {},
        {
          systemPromptReport: {
            ...BASE_PROMPT_REPORT,
            tools: {
              schemaChars: 20,
              entries: [{ name: "read", summaryChars: 9, schemaChars: 20, propertiesCount: 1 }],
            },
          },
          toolCalls: [{ tool: "bash", argsHash: "changed", resultHash: "changed" }],
        },
        "outcome-only",
      ),
    ).toBe("tool-description");
  });

  it("labels mock token estimates separately from live usage", () => {
    const sourceCell = makeCell("pi", {
      usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 },
    });
    const cell = buildHarnessParityCell({
      variant: LEFT,
      cell: sourceCell,
      tokenUsageSource: "mock-estimate",
    });
    // system prompt chars + skill prompt chars + tool summary chars + tool schema chars
    // from BASE_PROMPT_REPORT, plus the transcript length.
    const inputChars = 100 + 12 + 8 + 20 + sourceCell.transcriptBytes.length;

    expect(cell.tokenUsageSource).toBe("mock-estimate");
    expect(cell.tokenUsage.totalTokens).toBeGreaterThan(0);
    expect(cell.tokenUsage.inputTokens).toBe(Math.ceil(inputChars / 4));
    expect(cell.promptStats.toolCount).toBe(1);
  });
});
|
||||
491
extensions/qa-lab/src/harness-parity.ts
Normal file
491
extensions/qa-lab/src/harness-parity.ts
Normal file
@@ -0,0 +1,491 @@
|
||||
import { createHash } from "node:crypto";
|
||||
import type {
|
||||
RuntimeId,
|
||||
RuntimeParityCell,
|
||||
RuntimeParityDrift,
|
||||
RuntimeParityToolCall,
|
||||
RuntimeParityUsage,
|
||||
} from "./runtime-parity.js";
|
||||
import type { RuntimeParityComparisonMode } from "./runtime-tool-metadata.js";
|
||||
|
||||
/**
 * Identifies one harness configuration taking part in a parity comparison.
 * Only `id` and `label` are required; every other field is optional.
 */
export type HarnessVariant = {
  id: string;
  label: string;
  runtime?: RuntimeId;
  model?: string;
  configPatch?: Record<string, unknown>;
  systemPromptOverlay?: string;
  /** String-to-string map of tool description overlays. */
  toolDescriptionOverlay?: Record<string, string>;
};
|
||||
|
||||
/**
 * Drift classes a harness comparison can report: every runtime parity drift
 * plus three harness-specific prompt/tool surface classes.
 */
export type HarnessParityDrift =
  | RuntimeParityDrift
  | "system-prompt"
  | "tool-description"
  | "tool-schema";
|
||||
|
||||
/** Numeric size summary of a prompt report: character counts plus the tool count. */
export type HarnessParityPromptStats = {
  systemPromptChars: number;
  projectContextChars: number;
  nonProjectContextChars: number;
  skillPromptChars: number;
  toolSummaryChars: number;
  toolSchemaChars: number;
  toolCount: number;
};
|
||||
|
||||
/**
 * Loosely typed view of a system prompt report as consumed by harness parity.
 * Every section and every field is optional, so partial reports are accepted.
 */
export type RuntimeParitySystemPromptReport = {
  systemPrompt?: {
    chars?: number;
    projectContextChars?: number;
    nonProjectContextChars?: number;
    text?: string;
    hash?: string;
    contentHash?: string;
  };
  skills?: {
    promptChars?: number;
    prompt?: string;
    hash?: string;
    contentHash?: string;
  };
  tools?: {
    listChars?: number;
    schemaChars?: number;
    entries?: Array<{
      name?: string;
      summary?: string;
      summaryHash?: string;
      summaryChars?: number;
      schema?: unknown;
      schemaHash?: string;
      schemaChars?: number;
      propertiesCount?: number;
    }>;
  };
};
|
||||
|
||||
/** A runtime parity cell that may additionally carry a system prompt report. */
export type HarnessRuntimeParityCell = RuntimeParityCell & {
  systemPromptReport?: RuntimeParitySystemPromptReport;
};
|
||||
|
||||
/**
 * A runtime parity cell enriched for harness comparison: the variant it belongs
 * to, prompt size stats, three surface hashes, and the token usage together
 * with a label saying whether that usage is live or a mock estimate.
 */
export type HarnessParityCell = HarnessRuntimeParityCell & {
  variant: HarnessVariant;
  promptStats: HarnessParityPromptStats;
  systemPromptHash: string;
  toolDescriptionHash: string;
  toolSchemaHash: string;
  tokenUsage: RuntimeParityUsage;
  tokenUsageSource: "live-usage" | "mock-estimate";
};
|
||||
|
||||
/**
 * Outcome of comparing the left and right harness cells for one scenario:
 * the drift class, optional details, per-field prompt deltas and a token delta.
 */
export type HarnessParityResult = {
  scenarioId: string;
  left: HarnessParityCell;
  right: HarnessParityCell;
  drift: HarnessParityDrift;
  driftDetails?: string;
  promptDelta: {
    systemPromptChars: number;
    projectContextChars: number;
    skillPromptChars: number;
    toolSummaryChars: number;
    toolSchemaChars: number;
    toolCount: number;
  };
  tokenDeltaPercent: number;
  firstDriftTurn?: number;
};
|
||||
|
||||
/**
 * Top-level harness parity report: the two variants compared, one result per
 * scenario, and an overall pass flag with the list of failure messages.
 */
export type HarnessParityReport = {
  generatedAt: string;
  providerMode: string;
  left: HarnessVariant;
  right: HarnessVariant;
  results: HarnessParityResult[];
  pass: boolean;
  failures: string[];
};
|
||||
|
||||
/**
 * Hashes a string with SHA-256.
 *
 * @param value - Text to hash.
 * @returns The digest as a lowercase hex string.
 */
function sha256(value: string): string {
  return createHash("sha256").update(value).digest("hex");
}
|
||||
|
||||
/**
 * Counts transcript rows that carry a message role.
 *
 * The transcript is treated as JSONL. A row counts when it parses as JSON and
 * has a string `message.role` or a string top-level `role`. Blank lines, rows
 * without a role, and rows that fail to parse are skipped.
 *
 * @param transcriptBytes - Transcript text, one JSON document per line.
 * @returns Number of role-bearing rows.
 */
function countComparableTranscriptRecords(transcriptBytes: string): number {
  let count = 0;
  for (const line of transcriptBytes.split(/\r?\n/u)) {
    const trimmed = line.trim();
    if (!trimmed) {
      continue;
    }
    let parsed: unknown;
    try {
      parsed = JSON.parse(trimmed);
    } catch {
      // Ignore malformed QA transcript rows and keep parity classification deterministic.
      continue;
    }
    // Non-object rows (including a literal `null`) cannot carry a role.
    if (parsed === null || typeof parsed !== "object") {
      continue;
    }
    const row = parsed as { message?: { role?: unknown } | null; role?: unknown };
    const hasMessageRole = Boolean(row.message) && typeof row.message?.role === "string";
    if (hasMessageRole || typeof row.role === "string") {
      count += 1;
    }
  }
  return count;
}
|
||||
|
||||
/**
 * Recursively rewrites a value so that object keys appear in sorted order.
 *
 * Arrays keep their element order, with each element normalized. Plain objects
 * are rebuilt with keys ordered by `localeCompare`. Everything else, including
 * `null`, is returned as-is.
 *
 * @param value - Any JSON-like value.
 * @returns A structurally equal value whose serialization does not depend on
 *   the original key insertion order.
 */
function normalizeForStableHash(value: unknown): unknown {
  if (Array.isArray(value)) {
    return value.map((entry) => normalizeForStableHash(entry));
  }
  if (value !== null && typeof value === "object") {
    const record = value as Record<string, unknown>;
    return Object.fromEntries(
      Object.keys(record)
        .toSorted((left, right) => left.localeCompare(right))
        .map((key) => [key, normalizeForStableHash(record[key])]),
    );
  }
  return value;
}
|
||||
|
||||
/**
 * Hashes a value independently of object key order.
 *
 * The value is normalized, serialized with `JSON.stringify`, and hashed with
 * SHA-256. When serialization yields `undefined` (for example for an
 * `undefined` input), the literal text `"null"` is hashed instead.
 *
 * @param value - Any JSON-like value.
 * @returns Hex digest of the normalized serialization.
 */
function stableHash(value: unknown): string {
  const serialized: string | undefined = JSON.stringify(normalizeForStableHash(value));
  return sha256(serialized ?? "null");
}
|
||||
|
||||
/**
 * Reads a value as a non-negative integer count.
 *
 * @param value - Candidate value of any type.
 * @returns `Math.floor(value)` when `value` is a finite number greater than
 *   zero, otherwise `0`. Positive fractions below one therefore also yield `0`.
 */
function readPositiveNumber(value: unknown): number {
  if (typeof value !== "number" || !Number.isFinite(value) || value <= 0) {
    return 0;
  }
  return Math.floor(value);
}
|
||||
|
||||
/**
 * Derives numeric prompt stats from an optional system prompt report.
 *
 * Missing or invalid numbers are read as `0`. A missing or non-array
 * `tools.entries` is treated as an empty list.
 *
 * @param report - Prompt report, or `undefined` when none was captured.
 * @returns Character counts per prompt section plus the number of tool entries.
 */
function buildPromptStats(
  report: RuntimeParitySystemPromptReport | undefined,
): HarnessParityPromptStats {
  const toolEntries = Array.isArray(report?.tools?.entries) ? report.tools.entries : [];
  // Tool summary size is the sum over entries; schema size comes from the report total.
  let toolSummaryChars = 0;
  for (const entry of toolEntries) {
    toolSummaryChars += readPositiveNumber(entry.summaryChars);
  }
  return {
    systemPromptChars: readPositiveNumber(report?.systemPrompt?.chars),
    projectContextChars: readPositiveNumber(report?.systemPrompt?.projectContextChars),
    nonProjectContextChars: readPositiveNumber(report?.systemPrompt?.nonProjectContextChars),
    skillPromptChars: readPositiveNumber(report?.skills?.promptChars),
    toolSummaryChars,
    toolSchemaChars: readPositiveNumber(report?.tools?.schemaChars),
    toolCount: toolEntries.length,
  };
}
|
||||
|
||||
/**
 * Estimates token usage from character counts when no live usage is available.
 *
 * Input characters are the system prompt, skill prompt, tool summaries, tool
 * schemas and the transcript length. Output characters are the final text plus
 * a flat 80 characters per tool call. Each total is divided by four and
 * rounded up to get a token count.
 *
 * @param cell - Cell supplying the transcript, final text and tool calls.
 * @param stats - Prompt stats supplying the prompt and tool surface sizes.
 * @returns Estimated input, output and total tokens.
 */
function estimateUsage(
  cell: RuntimeParityCell,
  stats: HarnessParityPromptStats,
): RuntimeParityUsage {
  const promptSurfaceChars =
    stats.systemPromptChars +
    stats.skillPromptChars +
    stats.toolSummaryChars +
    stats.toolSchemaChars;
  const inputChars = promptSurfaceChars + cell.transcriptBytes.length;
  const outputChars = cell.finalText.length + cell.toolCalls.length * 80;
  const inputTokens = Math.ceil(inputChars / 4);
  const outputTokens = Math.ceil(outputChars / 4);
  return {
    inputTokens,
    outputTokens,
    totalTokens: inputTokens + outputTokens,
  };
}
|
||||
|
||||
/**
 * Collapses every run of whitespace to a single space and trims both ends.
 *
 * @param text - Raw text.
 * @returns The whitespace-normalized text.
 */
function normalizeTextForParity(text: string): string {
  return text.replace(/\s+/gu, " ").trim();
}
|
||||
|
||||
/**
 * Compares two tool call sequences by tool name and argument hash.
 *
 * @param left - Tool calls from the left cell.
 * @param right - Tool calls from the right cell.
 * @returns A description of the first difference (count mismatch, missing row,
 *   or differing tool/args at a 1-based position), or `undefined` when the
 *   sequences match.
 */
function compareToolCallShape(
  left: RuntimeParityToolCall[],
  right: RuntimeParityToolCall[],
): string | undefined {
  if (left.length !== right.length) {
    return `tool call count differs (${left.length} vs ${right.length})`;
  }
  for (const [index, leftCall] of left.entries()) {
    const rightCall = right[index];
    if (!leftCall || !rightCall) {
      return `tool call row ${index + 1} missing`;
    }
    const sameTool = leftCall.tool === rightCall.tool;
    const sameArgs = leftCall.argsHash === rightCall.argsHash;
    if (!sameTool || !sameArgs) {
      return `tool call ${index + 1} differs (${leftCall.tool}/${leftCall.argsHash} vs ${rightCall.tool}/${rightCall.argsHash})`;
    }
  }
  return undefined;
}
|
||||
|
||||
/**
 * Compares tool call results position by position.
 *
 * Only the positions present on both sides are checked; a length difference
 * alone is not reported here. Two results differ when their result hashes
 * differ or their error classes differ, where a missing error class is treated
 * as the empty string.
 *
 * @param left - Tool calls from the left cell.
 * @param right - Tool calls from the right cell.
 * @returns A description of the first differing result (1-based), or
 *   `undefined` when none differ.
 */
function compareToolResultShape(
  left: RuntimeParityToolCall[],
  right: RuntimeParityToolCall[],
): string | undefined {
  const total = Math.min(left.length, right.length);
  for (let index = 0; index < total; index += 1) {
    const leftCall = left[index];
    const rightCall = right[index];
    if (!leftCall || !rightCall) {
      continue;
    }
    const sameResult = leftCall.resultHash === rightCall.resultHash;
    const sameError = (leftCall.errorClass ?? "") === (rightCall.errorClass ?? "");
    if (!sameResult || !sameError) {
      return `tool result ${index + 1} differs (${leftCall.tool})`;
    }
  }
  return undefined;
}
|
||||
|
||||
/**
 * Finds the first transcript line at which two transcripts differ.
 *
 * Each transcript is trimmed and split into lines; an all-whitespace
 * transcript has no lines. Lines are compared as raw text, and a line missing
 * on one side is compared as the empty string.
 *
 * @param leftTranscript - Left transcript text.
 * @param rightTranscript - Right transcript text.
 * @returns The 1-based index of the first differing line, or `undefined` when
 *   the transcripts are line-for-line identical.
 */
function firstDriftTurn(leftTranscript: string, rightTranscript: string): number | undefined {
  const toLines = (transcript: string): string[] => {
    const trimmed = transcript.trim();
    return trimmed.length ? trimmed.split(/\r?\n/u) : [];
  };
  const leftLines = toLines(leftTranscript);
  const rightLines = toLines(rightTranscript);
  const total = Math.max(leftLines.length, rightLines.length);
  for (let index = 0; index < total; index += 1) {
    if ((leftLines[index] ?? "") !== (rightLines[index] ?? "")) {
      return index + 1;
    }
  }
  return undefined;
}
|
||||
|
||||
/**
 * Enriches a runtime parity cell with the data needed for harness comparison.
 *
 * @param params.variant - Harness variant the cell belongs to.
 * @param params.cell - Source cell, optionally carrying a system prompt report.
 * @param params.tokenUsageSource - `"live-usage"` keeps the cell's own usage;
 *   any other value replaces it with a character-based estimate.
 * @returns The source cell plus variant, prompt stats, token usage, and three
 *   key-order-independent hashes covering the system prompt and skills, the
 *   tool descriptions, and the tool schemas.
 */
export function buildHarnessParityCell(params: {
  variant: HarnessVariant;
  cell: HarnessRuntimeParityCell;
  tokenUsageSource: HarnessParityCell["tokenUsageSource"];
}): HarnessParityCell {
  const report = params.cell.systemPromptReport;
  const promptStats = buildPromptStats(report);
  // Guard against a malformed non-array `entries` the same way buildPromptStats
  // does, so hashing cannot throw on `.map` for a report whose stats were accepted.
  const toolEntries = Array.isArray(report?.tools?.entries) ? report.tools.entries : [];
  const tokenUsage =
    params.tokenUsageSource === "live-usage"
      ? params.cell.usage
      : estimateUsage(params.cell, promptStats);

  // Description hash covers only the name and summary fields of each tool.
  const toolDescriptions = toolEntries.map((entry) => ({
    name: entry.name,
    summary: entry.summary,
    summaryHash: entry.summaryHash,
    summaryChars: entry.summaryChars,
  }));
  // Schema hash covers only the name and schema fields of each tool.
  const toolSchemas = toolEntries.map((entry) => ({
    name: entry.name,
    schema: entry.schema,
    schemaHash: entry.schemaHash,
    schemaChars: entry.schemaChars,
    propertiesCount: entry.propertiesCount,
  }));

  return {
    ...params.cell,
    variant: params.variant,
    ...(report ? { systemPromptReport: report } : {}),
    promptStats,
    systemPromptHash: stableHash({
      systemPrompt: report?.systemPrompt ?? null,
      skills: report?.skills ?? null,
    }),
    toolDescriptionHash: stableHash(toolDescriptions),
    toolSchemaHash: stableHash({
      listChars: report?.tools?.listChars,
      schemaChars: report?.tools?.schemaChars,
      entries: toolSchemas,
    }),
    tokenUsage,
    tokenUsageSource: params.tokenUsageSource,
  };
}
|
||||
|
||||
/**
 * Classifies the drift between two parity cells of one scenario.
 *
 * Checks run in a fixed order and the first one that fires decides the result:
 *   1. `failure-mode`      – either side has a transport error class, else a
 *                            runtime error class
 *   2. `system-prompt`     – `systemPromptHash` differs
 *   3. `tool-description`  – `toolDescriptionHash` differs
 *   4. `tool-schema`       – `toolSchemaHash` differs
 *   5. `tool-call-shape`   – reported by `compareToolCallShape`
 *   6. `tool-result-shape` – reported by `compareToolResultShape`
 *   7. `structural`        – comparable transcript record counts differ, or
 *                            only one side has final text
 *   8. `text-only`         – final texts differ after `normalizeTextForParity`
 *   9. `none`
 *
 * Checks 5–7 are skipped when `comparisonMode` is `"codex-native-workspace"`
 * or `"outcome-only"`.
 *
 * Every result carries the prompt-stat deltas (right minus left) and the token
 * delta in percent relative to the left side. When the left total is zero the
 * token delta is 0 if the right total is also zero and 100 otherwise. Every
 * drift result other than `none` also carries `firstDriftTurn`.
 *
 * @param params.scenarioId - Scenario identifier copied into the result.
 * @param params.left - Left parity cell.
 * @param params.right - Right parity cell.
 * @param params.comparisonMode - Optional mode that relaxes checks 5–7.
 * @returns The parity result for this scenario.
 */
export function buildHarnessParityResult(params: {
  scenarioId: string;
  left: HarnessParityCell;
  right: HarnessParityCell;
  comparisonMode?: RuntimeParityComparisonMode;
}): HarnessParityResult {
  const { scenarioId, left, right, comparisonMode } = params;

  const leftStats = left.promptStats;
  const rightStats = right.promptStats;
  const promptDelta = {
    systemPromptChars: rightStats.systemPromptChars - leftStats.systemPromptChars,
    projectContextChars: rightStats.projectContextChars - leftStats.projectContextChars,
    skillPromptChars: rightStats.skillPromptChars - leftStats.skillPromptChars,
    toolSummaryChars: rightStats.toolSummaryChars - leftStats.toolSummaryChars,
    toolSchemaChars: rightStats.toolSchemaChars - leftStats.toolSchemaChars,
    toolCount: rightStats.toolCount - leftStats.toolCount,
  };

  const leftTokens = left.tokenUsage.totalTokens;
  const rightTokens = right.tokenUsage.totalTokens;
  let tokenDeltaPercent: number;
  if (leftTokens === 0) {
    // No baseline to divide by: report 0 when both are empty, otherwise 100.
    tokenDeltaPercent = rightTokens === 0 ? 0 : 100;
  } else {
    tokenDeltaPercent = ((rightTokens - leftTokens) / leftTokens) * 100;
  }

  // Leading and trailing fields shared by every drift result; spreading them
  // around the per-branch fields keeps the property order of the results.
  const identity = { scenarioId, left, right };
  const measurements = () => ({
    promptDelta,
    tokenDeltaPercent,
    firstDriftTurn: firstDriftTurn(left.transcriptBytes, right.transcriptBytes),
  });

  // Transport failures take precedence over runtime failures.
  let failDetails: string | undefined;
  if (left.transportErrorClass || right.transportErrorClass) {
    failDetails = "at least one harness variant hit a transport failure";
  } else if (left.runtimeErrorClass || right.runtimeErrorClass) {
    failDetails = "at least one harness variant hit a runtime failure";
  }
  if (failDetails) {
    return { ...identity, drift: "failure-mode", driftDetails: failDetails, ...measurements() };
  }

  if (left.systemPromptHash !== right.systemPromptHash) {
    return {
      ...identity,
      drift: "system-prompt",
      driftDetails: "system prompt report differs",
      ...measurements(),
    };
  }

  if (left.toolDescriptionHash !== right.toolDescriptionHash) {
    return {
      ...identity,
      drift: "tool-description",
      driftDetails: "tool description summary shape differs",
      ...measurements(),
    };
  }

  if (left.toolSchemaHash !== right.toolSchemaHash) {
    return {
      ...identity,
      drift: "tool-schema",
      driftDetails: "tool schema shape differs",
      ...measurements(),
    };
  }

  // The same mode test gates both the tool-shape and the transcript-structure checks.
  const strictComparison =
    comparisonMode !== "codex-native-workspace" && comparisonMode !== "outcome-only";

  if (strictComparison) {
    const toolCallDrift = compareToolCallShape(left.toolCalls, right.toolCalls);
    if (toolCallDrift) {
      return {
        ...identity,
        drift: "tool-call-shape",
        driftDetails: toolCallDrift,
        ...measurements(),
      };
    }

    const toolResultDrift = compareToolResultShape(left.toolCalls, right.toolCalls);
    if (toolResultDrift) {
      return {
        ...identity,
        drift: "tool-result-shape",
        driftDetails: toolResultDrift,
        ...measurements(),
      };
    }
  }

  const leftTranscriptRecords = countComparableTranscriptRecords(left.transcriptBytes);
  const rightTranscriptRecords = countComparableTranscriptRecords(right.transcriptBytes);
  const recordCountsDiffer = leftTranscriptRecords !== rightTranscriptRecords;
  // True when exactly one side produced a non-empty final text.
  const finalTextPresenceDiffers = !left.finalText !== !right.finalText;
  if (strictComparison && (recordCountsDiffer || finalTextPresenceDiffers)) {
    return {
      ...identity,
      drift: "structural",
      driftDetails: `transcript/final-text structure differs (${leftTranscriptRecords} message records vs ${rightTranscriptRecords} message records)`,
      ...measurements(),
    };
  }

  const leftText = normalizeTextForParity(left.finalText);
  const rightText = normalizeTextForParity(right.finalText);
  if (leftText !== rightText) {
    return {
      ...identity,
      drift: "text-only",
      driftDetails: "final text differs after whitespace normalization",
      ...measurements(),
    };
  }

  return { ...identity, drift: "none", promptDelta, tokenDeltaPercent };
}
|
||||
|
||||
/**
 * Formats a percentage with one decimal place and an explicit `+` sign for
 * positive values.
 *
 * Values whose magnitude is below 0.05 are shown as `0.0%`, so small negative
 * values never render as `-0.0%`.
 *
 * @param value - Percentage to format.
 * @returns The formatted percentage, e.g. `+12.3%`, `-3.2%` or `0.0%`.
 */
function formatPercent(value: number) {
  const magnitude = Math.abs(value);
  const shown = magnitude < 0.05 ? 0 : value;
  const digits = shown.toFixed(1);
  return shown > 0 ? `+${digits}%` : `${digits}%`;
}
|
||||
|
||||
/**
 * Renders a harness parity report as Markdown.
 *
 * The document has a title naming both variants, a short metadata list
 * (generation time, provider mode, verdict), one table row per scenario
 * result and, when the report lists failures, a "Gate Failures" bullet list.
 * The "Prompt chars delta" column shows `promptDelta.systemPromptChars`.
 *
 * @param report - Report to render.
 * @returns Markdown text ending in exactly one trailing newline.
 */
export function renderHarnessParityMarkdownReport(report: HarnessParityReport): string {
  const verdict = report.pass ? "pass" : "fail";
  const header = [
    `# OpenClaw Harness Parity - ${report.left.label} vs ${report.right.label}`,
    "",
    `- Generated at: ${report.generatedAt}`,
    `- Provider mode: ${report.providerMode}`,
    `- Verdict: ${verdict}`,
    "",
    "| Scenario | Drift | First drift turn | Token delta | Prompt chars delta | Tool count delta | Details |",
    "| --- | --- | ---: | ---: | ---: | ---: | --- |",
  ];

  const rows = report.results.map((result) => {
    // Optional cells render as empty strings rather than "undefined".
    const turn = result.firstDriftTurn ?? "";
    const details = result.driftDetails ?? "";
    const tokenDelta = formatPercent(result.tokenDeltaPercent);
    return `| ${result.scenarioId} | ${result.drift} | ${turn} | ${tokenDelta} | ${result.promptDelta.systemPromptChars} | ${result.promptDelta.toolCount} | ${details} |`;
  });

  const failureSection =
    report.failures.length > 0
      ? ["", "## Gate Failures", "", ...report.failures.map((failure) => `- ${failure}`)]
      : [];

  const body = [...header, ...rows, ...failureSection].join("\n");
  return `${body.trimEnd()}\n`;
}
|
||||
@@ -144,4 +144,76 @@ describe("buildSystemPromptReport", () => {
|
||||
expect(report.systemPrompt.projectContextChars).toBe(0);
|
||||
expect(report.systemPrompt.nonProjectContextChars).toBe("custom override".length);
|
||||
});
|
||||
|
||||
it("emits content hashes for prompt and tool parity checks", () => {
|
||||
const file = makeBootstrapFile({ path: "/tmp/workspace/AGENTS.md" });
|
||||
const report = buildSystemPromptReport({
|
||||
source: "run",
|
||||
generatedAt: 0,
|
||||
bootstrapMaxChars: 20_000,
|
||||
systemPrompt: "system",
|
||||
bootstrapFiles: [file],
|
||||
injectedFiles: [],
|
||||
skillsPrompt: "<skill><name>docs</name></skill>",
|
||||
tools: [
|
||||
{
|
||||
name: "read",
|
||||
description: "Read files",
|
||||
parameters: {
|
||||
type: "object",
|
||||
properties: { path: { type: "string" } },
|
||||
},
|
||||
},
|
||||
] as never,
|
||||
});
|
||||
const sameLengthChangedPrompt = buildSystemPromptReport({
|
||||
source: "run",
|
||||
generatedAt: 0,
|
||||
bootstrapMaxChars: 20_000,
|
||||
systemPrompt: "systen",
|
||||
bootstrapFiles: [file],
|
||||
injectedFiles: [],
|
||||
skillsPrompt: "<skill><name>docs</name></skill>",
|
||||
tools: [],
|
||||
});
|
||||
|
||||
expect(report.systemPrompt.hash).toMatch(/^[a-f0-9]{64}$/u);
|
||||
expect(report.skills.hash).toMatch(/^[a-f0-9]{64}$/u);
|
||||
expect(report.tools.entries[0]?.summaryHash).toMatch(/^[a-f0-9]{64}$/u);
|
||||
expect(report.tools.entries[0]?.schemaHash).toMatch(/^[a-f0-9]{64}$/u);
|
||||
expect(sameLengthChangedPrompt.systemPrompt.hash).not.toBe(report.systemPrompt.hash);
|
||||
});
|
||||
|
||||
it("keeps reporting when a tool schema cannot be stringified", () => {
|
||||
const file = makeBootstrapFile({ path: "/tmp/workspace/AGENTS.md" });
|
||||
const circularSchema: Record<string, unknown> = {
|
||||
type: "object",
|
||||
properties: { count: { type: "integer" } },
|
||||
};
|
||||
circularSchema.self = circularSchema;
|
||||
|
||||
const report = buildSystemPromptReport({
|
||||
source: "run",
|
||||
generatedAt: 0,
|
||||
bootstrapMaxChars: 20_000,
|
||||
systemPrompt: "system",
|
||||
bootstrapFiles: [file],
|
||||
injectedFiles: [],
|
||||
skillsPrompt: "",
|
||||
tools: [
|
||||
{
|
||||
name: "broken",
|
||||
description: "Broken schema",
|
||||
parameters: circularSchema,
|
||||
},
|
||||
] as never,
|
||||
});
|
||||
|
||||
expect(report.tools.entries[0]).toMatchObject({
|
||||
name: "broken",
|
||||
schemaChars: 0,
|
||||
propertiesCount: 1,
|
||||
});
|
||||
expect(report.tools.entries[0]?.schemaHash).toMatch(/^[a-f0-9]{64}$/u);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import { createHash } from "node:crypto";
|
||||
import type { AgentTool } from "@earendil-works/pi-agent-core";
|
||||
import type { SessionSystemPromptReport } from "../config/sessions/types.js";
|
||||
import { buildBootstrapInjectionStats } from "./bootstrap-budget.js";
|
||||
@@ -9,9 +10,47 @@ type ToolReportEntry = SessionSystemPromptReport["tools"]["entries"][number];
|
||||
const toolReportEntryCache = new WeakMap<AgentTool, ToolReportEntry>();
|
||||
const toolSchemaStatsCache = new WeakMap<
|
||||
object,
|
||||
Pick<ToolReportEntry, "propertiesCount" | "schemaChars">
|
||||
Pick<ToolReportEntry, "propertiesCount" | "schemaChars" | "schemaHash">
|
||||
>();
|
||||
|
||||
/**
 * Computes the SHA-256 digest of a string.
 *
 * @param value - Text to hash.
 * @returns The digest as a 64-character lowercase hexadecimal string.
 */
function sha256(value: string): string {
  const hasher = createHash("sha256");
  hasher.update(value);
  return hasher.digest("hex");
}
|
||||
|
||||
/**
 * Orders two strings by UTF-16 code unit, independent of the runtime locale.
 */
function compareKeysByCodeUnit(left: string, right: string): number {
  if (left < right) {
    return -1;
  }
  return left > right ? 1 : 0;
}

/**
 * Produces a canonical copy of a value so that serializing it gives the same
 * text regardless of the key insertion order of the objects inside it.
 *
 * - `bigint` values become their decimal text followed by `n` (JSON has no
 *   bigint representation).
 * - An object that is already being visited higher up the current path is
 *   replaced by the string `"[Circular]"`. The same object reached twice via
 *   separate, non-nested paths is expanded both times.
 * - Arrays keep their element order; each element is normalized.
 * - Other objects are rebuilt from their own enumerable string keys, sorted by
 *   UTF-16 code unit. A locale-aware comparison is deliberately avoided: its
 *   order depends on the runtime's locale data and it can treat distinct keys
 *   (for example canonically equivalent Unicode sequences) as equal, which
 *   would let insertion order leak into the result.
 * - Every other value is returned unchanged.
 *
 * @param value - Value to normalize.
 * @param seen - Objects on the current recursion path; callers normally omit it.
 * @returns The normalized value.
 */
function normalizeForStableHash(value: unknown, seen = new WeakSet<object>()): unknown {
  if (typeof value === "bigint") {
    return `${value.toString()}n`;
  }
  if (!value || typeof value !== "object") {
    return value;
  }
  if (seen.has(value)) {
    return "[Circular]";
  }
  seen.add(value);
  try {
    if (Array.isArray(value)) {
      return value.map((entry) => normalizeForStableHash(entry, seen));
    }
    const record = value as Record<string, unknown>;
    // Object.keys returns a fresh array, so sorting it in place is safe.
    const keys = Object.keys(record).sort(compareKeysByCodeUnit);
    return Object.fromEntries(keys.map((key) => [key, normalizeForStableHash(record[key], seen)]));
  } finally {
    // Only ancestors count as circular; drop this object once its subtree is
    // done, even if a property access threw.
    seen.delete(value);
  }
}
|
||||
|
||||
/**
 * Hashes an arbitrary value via its normalized JSON form.
 *
 * The value is passed through `normalizeForStableHash`, serialized with
 * `JSON.stringify` and hashed with `sha256`. When serialization yields
 * `undefined`, the text `"null"` is hashed instead. If any step throws, the
 * hash of the fixed marker `"[unserializable]"` is returned, so this function
 * itself does not fail on such input.
 *
 * @param value - Value to hash.
 * @returns A SHA-256 hex digest.
 */
function stableJsonHash(value: unknown): string {
  try {
    const canonical = normalizeForStableHash(value);
    const serialized = JSON.stringify(canonical);
    return sha256(serialized ?? "null");
  } catch {
    // Best-effort: reporting continues with a fixed marker hash.
    return sha256("[unserializable]");
  }
}
|
||||
|
||||
function extractBetween(input: string, startMarker: string, endMarker: string): string {
|
||||
const start = input.indexOf(startMarker);
|
||||
if (start === -1) {
|
||||
@@ -39,9 +78,9 @@ function parseSkillBlocks(skillsPrompt: string): Array<{ name: string; blockChar
|
||||
|
||||
function buildToolSchemaStats(
|
||||
parameters: AgentTool["parameters"],
|
||||
): Pick<ToolReportEntry, "propertiesCount" | "schemaChars"> {
|
||||
): Pick<ToolReportEntry, "propertiesCount" | "schemaChars" | "schemaHash"> {
|
||||
if (!parameters || typeof parameters !== "object") {
|
||||
return { schemaChars: 0, propertiesCount: null };
|
||||
return { schemaChars: 0, schemaHash: stableJsonHash(null), propertiesCount: null };
|
||||
}
|
||||
const cached = toolSchemaStatsCache.get(parameters);
|
||||
if (cached) {
|
||||
@@ -55,6 +94,7 @@ function buildToolSchemaStats(
|
||||
return 0;
|
||||
}
|
||||
})(),
|
||||
schemaHash: stableJsonHash(parameters),
|
||||
propertiesCount: (() => {
|
||||
const schema = parameters as Record<string, unknown>;
|
||||
const props = typeof schema.properties === "object" ? schema.properties : null;
|
||||
@@ -78,7 +118,7 @@ function buildToolsEntries(tools: AgentTool[]): SessionSystemPromptReport["tools
|
||||
const summary = tool.description?.trim() || tool.label?.trim() || "";
|
||||
const summaryChars = summary.length;
|
||||
const schemaStats = buildToolSchemaStats(tool.parameters);
|
||||
const entry = { name, summaryChars, ...schemaStats };
|
||||
const entry = { name, summaryChars, summaryHash: sha256(summary), ...schemaStats };
|
||||
toolReportEntryCache.set(tool, entry);
|
||||
return entry;
|
||||
});
|
||||
@@ -129,6 +169,7 @@ export function buildSystemPromptReport(params: {
|
||||
chars: systemPromptChars,
|
||||
projectContextChars,
|
||||
nonProjectContextChars: Math.max(0, systemPromptChars - projectContextChars),
|
||||
hash: sha256(params.systemPrompt),
|
||||
},
|
||||
...(params.currentTurn ? { currentTurn: params.currentTurn } : {}),
|
||||
injectedWorkspaceFiles: buildBootstrapInjectionStats({
|
||||
@@ -137,6 +178,7 @@ export function buildSystemPromptReport(params: {
|
||||
}),
|
||||
skills: {
|
||||
promptChars: params.skillsPrompt.length,
|
||||
hash: sha256(params.skillsPrompt),
|
||||
entries: skillsEntries,
|
||||
},
|
||||
tools: {
|
||||
|
||||
@@ -644,6 +644,7 @@ export type SessionSystemPromptReport = {
|
||||
chars: number;
|
||||
projectContextChars: number;
|
||||
nonProjectContextChars: number;
|
||||
hash?: string;
|
||||
};
|
||||
currentTurn?: {
|
||||
kind?: "user_request" | "room_event";
|
||||
@@ -660,6 +661,7 @@ export type SessionSystemPromptReport = {
|
||||
}>;
|
||||
skills: {
|
||||
promptChars: number;
|
||||
hash?: string;
|
||||
entries: Array<{ name: string; blockChars: number }>;
|
||||
};
|
||||
tools: {
|
||||
@@ -668,7 +670,9 @@ export type SessionSystemPromptReport = {
|
||||
entries: Array<{
|
||||
name: string;
|
||||
summaryChars: number;
|
||||
summaryHash?: string;
|
||||
schemaChars: number;
|
||||
schemaHash?: string;
|
||||
propertiesCount?: number | null;
|
||||
}>;
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user