mirror of
https://github.com/openclaw/openclaw.git
synced 2026-06-06 05:51:15 +08:00
feat(qa-lab): add jsonl replay harness
This commit is contained in:
@@ -9,6 +9,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Gateway/plugins: reuse a compatible Gateway startup plugin registry during dispatch so safe plugin dispatches avoid redundant registry loading. (#84324) Thanks @ai-hpc.
|
||||
- Dependencies: refresh provider, plugin, UI, and tooling packages, update `protobufjs` to 8.4.0 to clear the current npm advisory, and carry the Claude ACP completion patch forward to `@agentclientprotocol/claude-agent-acp` 0.36.1.
|
||||
- Agents/tools: remove the old sender-owner tool gating path so configured tools stay visible for trusted sessions while command and channel-action auth still carry real sender identity.
|
||||
- QA-Lab: add curated mock JSONL replay fixtures and first-drift reporting for runtime-parity audits. (#80323, refs #80176) Thanks @100yenadmin.
|
||||
- Tests/perf: isolate doctor core health check unit coverage from real skills/workspace discovery so `doctor-core-checks` no longer dominates unit perf while keeping one real skills-readiness smoke. (#84493) Thanks @frankekn.
|
||||
|
||||
### Fixes
|
||||
|
||||
@@ -76,6 +76,7 @@ import {
|
||||
runQaDockerUpCommand,
|
||||
runQaCharacterEvalCommand,
|
||||
runQaCoverageReportCommand,
|
||||
runQaJsonlReplayCommand,
|
||||
runQaManualLaneCommand,
|
||||
runQaParityReportCommand,
|
||||
runQaSuiteCommand,
|
||||
@@ -1084,6 +1085,44 @@ describe("qa cli runtime", () => {
|
||||
expectWriteContains(stdoutWrite, "codex-native-workspace");
|
||||
});
|
||||
|
||||
// End-to-end check: runs the command against the curated fixtures and inspects both
// artifacts written below a throwaway repo root.
it("writes a curated mock JSONL replay report and summary", async () => {
  const repoRoot = await fs.mkdtemp(path.join(os.tmpdir(), "qa-jsonl-replay-cli-"));
  try {
    await runQaJsonlReplayCommand({
      repoRoot,
      transcripts: path.resolve("qa/scenarios/jsonl-replay"),
      outputDir: "jsonl-output",
      runtimePair: "pi,codex",
    });

    const report = await fs.readFile(
      path.join(repoRoot, "jsonl-output", "qa-jsonl-replay-report.md"),
      "utf8",
    );
    const summary = JSON.parse(
      await fs.readFile(
        path.join(repoRoot, "jsonl-output", "qa-jsonl-replay-summary.json"),
        "utf8",
      ),
    ) as { transcripts?: Array<{ userTurnCount?: number }> };

    expect(report).toContain("# OpenClaw JSONL Replay Report - pi vs codex");
    // A drift-free transcript renders an empty "First drift turn" cell, which leaves two
    // spaces between the surrounding pipes in the table row.
    expect(report).toContain("| plan-mode-boundaries.jsonl | 3 |  | none, none, none |");
    expect(summary.transcripts).toHaveLength(7);
  } finally {
    // Always remove the temp repo root, even when an assertion fails.
    await fs.rm(repoRoot, { recursive: true, force: true });
  }
});
|
||||
|
||||
// Any provider mode other than mock-openai must be rejected by the command.
it("keeps JSONL replay mock-only until real runtime cell replay is wired", async () => {
  await expect(
    runQaJsonlReplayCommand({
      repoRoot: process.cwd(),
      providerMode: "live-frontier",
    }),
  ).rejects.toThrow("qa jsonl-replay currently supports mock-openai curated fixtures only.");
});
|
||||
|
||||
it("exits nonzero when tool coverage summary is missing a required runtime tool call", async () => {
|
||||
const priorExitCode = process.exitCode;
|
||||
const repoRoot = await fs.mkdtemp(path.join(os.tmpdir(), "qa-tool-coverage-"));
|
||||
|
||||
@@ -16,6 +16,12 @@ import { buildQaCoverageInventory, renderQaCoverageMarkdownReport } from "./cove
|
||||
import { buildQaDockerHarnessImage, writeQaDockerHarnessFiles } from "./docker-harness.js";
|
||||
import { runQaDockerUp } from "./docker-up.runtime.js";
|
||||
import type { QaCliBackendAuthMode } from "./gateway-child.js";
|
||||
import {
|
||||
createMockJsonlReplayCellRunner,
|
||||
renderJsonlReplayMarkdownReport,
|
||||
runJsonlReplay,
|
||||
type JsonlReplayInput,
|
||||
} from "./jsonl-replay.js";
|
||||
import { startQaLabServer } from "./lab-server.js";
|
||||
import { runQaManualLane } from "./manual-lane.runtime.js";
|
||||
import { runQaMultipass } from "./multipass.runtime.js";
|
||||
@@ -820,6 +826,50 @@ export async function runQaCoverageReportCommand(opts: {
|
||||
process.stdout.write(body);
|
||||
}
|
||||
|
||||
/**
 * Runs the JSONL replay harness with the mock cell runner and writes two artifacts:
 * `qa-jsonl-replay-report.md` (Markdown table) and `qa-jsonl-replay-summary.json`
 * (the raw replay result). Both artifact paths are printed to stdout.
 *
 * @param opts.repoRoot Repository root; defaults to the current working directory.
 * @param opts.transcripts Transcript directory, resolved against `repoRoot`; defaults to
 *   `qa/scenarios/jsonl-replay`.
 * @param opts.outputDir Artifact directory handed to `resolveRepoRelativeOutputDir`; when
 *   that yields nothing, a timestamped directory under `.artifacts/qa-e2e` is used.
 * @param opts.runtimePair Runtime pair label; anything other than `pi,codex` is rejected.
 * @param opts.providerMode Provider mode; anything other than `mock-openai` is rejected.
 * @throws Error when the runtime pair or provider mode is unsupported. Both checks run
 *   before the output directory is created.
 */
export async function runQaJsonlReplayCommand(opts: {
  repoRoot?: string;
  transcripts?: string;
  outputDir?: string;
  runtimePair?: string;
  providerMode?: QaProviderModeInput;
}) {
  const repoRoot = path.resolve(opts.repoRoot ?? process.cwd());

  const requestedPair = parseQaRuntimePair(opts.runtimePair) ?? ["pi", "codex"];
  if (requestedPair[0] !== "pi" || requestedPair[1] !== "codex") {
    throw new Error('--runtime-pair for jsonl-replay must be "pi,codex".');
  }
  // The guard above proves the pair is exactly pi/codex, so a typed literal tuple
  // replaces the repeated type assertions on the parsed value.
  const runtimePair: JsonlReplayInput["runtimePair"] = ["pi", "codex"];

  const providerMode = normalizeQaProviderMode(opts.providerMode ?? "mock-openai");
  if (providerMode !== "mock-openai") {
    throw new Error("qa jsonl-replay currently supports mock-openai curated fixtures only.");
  }

  const transcriptDir = path.resolve(repoRoot, opts.transcripts ?? "qa/scenarios/jsonl-replay");
  const outputDir =
    resolveRepoRelativeOutputDir(repoRoot, opts.outputDir) ??
    path.join(repoRoot, ".artifacts", "qa-e2e", `jsonl-replay-${Date.now().toString(36)}`);
  await fs.mkdir(outputDir, { recursive: true });

  const result = await runJsonlReplay(
    {
      directory: transcriptDir,
      runtimePair,
      providerMode,
    },
    { runCell: createMockJsonlReplayCellRunner() },
  );

  const report = renderJsonlReplayMarkdownReport({
    generatedAt: new Date().toISOString(),
    providerMode,
    runtimePair,
    transcripts: result.transcripts,
  });

  const reportPath = path.join(outputDir, "qa-jsonl-replay-report.md");
  const summaryPath = path.join(outputDir, "qa-jsonl-replay-summary.json");
  await fs.writeFile(reportPath, report, "utf8");
  // The summary is the raw replay result (transcripts only), pretty-printed.
  await fs.writeFile(summaryPath, `${JSON.stringify(result, null, 2)}\n`, "utf8");

  process.stdout.write(`QA JSONL replay report: ${reportPath}\n`);
  process.stdout.write(`QA JSONL replay summary: ${summaryPath}\n`);
}
|
||||
|
||||
export async function runQaCharacterEvalCommand(opts: {
|
||||
repoRoot?: string;
|
||||
outputDir?: string;
|
||||
|
||||
@@ -45,6 +45,7 @@ const {
|
||||
runQaCredentialsListCommand,
|
||||
runQaCredentialsRemoveCommand,
|
||||
runQaCoverageReportCommand,
|
||||
runQaJsonlReplayCommand,
|
||||
runQaProviderServerCommand,
|
||||
runQaSuiteCommand,
|
||||
runQaTelegramCommand,
|
||||
@@ -58,6 +59,7 @@ const {
|
||||
runQaCredentialsListCommand: vi.fn(),
|
||||
runQaCredentialsRemoveCommand: vi.fn(),
|
||||
runQaCoverageReportCommand: vi.fn(),
|
||||
runQaJsonlReplayCommand: vi.fn(),
|
||||
runQaProviderServerCommand: vi.fn(),
|
||||
runQaSuiteCommand: vi.fn(),
|
||||
runQaTelegramCommand: vi.fn(),
|
||||
@@ -113,6 +115,7 @@ vi.mock("./cli.runtime.js", () => ({
|
||||
runQaCredentialsListCommand,
|
||||
runQaCredentialsRemoveCommand,
|
||||
runQaCoverageReportCommand,
|
||||
runQaJsonlReplayCommand,
|
||||
runQaProviderServerCommand,
|
||||
runQaSuiteCommand,
|
||||
}));
|
||||
@@ -128,6 +131,7 @@ describe("qa cli registration", () => {
|
||||
runQaCredentialsListCommand.mockReset();
|
||||
runQaCredentialsRemoveCommand.mockReset();
|
||||
runQaCoverageReportCommand.mockReset();
|
||||
runQaJsonlReplayCommand.mockReset();
|
||||
runQaProviderServerCommand.mockReset();
|
||||
runQaSuiteCommand.mockReset();
|
||||
runQaTelegramCommand.mockReset();
|
||||
@@ -480,6 +484,33 @@ describe("qa cli registration", () => {
|
||||
});
|
||||
});
|
||||
|
||||
// Every jsonl-replay flag must reach the (mocked) runtime command under its camelCase key.
it("routes JSONL replay flags into the qa runtime command", async () => {
  await program.parseAsync([
    "node",
    "openclaw",
    "qa",
    "jsonl-replay",
    "--repo-root",
    "/tmp/openclaw-repo",
    "--transcripts",
    "qa/scenarios/jsonl-replay",
    "--runtime-pair",
    "pi,codex",
    "--provider-mode",
    "mock-openai",
    "--output-dir",
    ".artifacts/qa-e2e/jsonl-replay-test",
  ]);

  expect(runQaJsonlReplayCommand).toHaveBeenCalledWith({
    repoRoot: "/tmp/openclaw-repo",
    transcripts: "qa/scenarios/jsonl-replay",
    runtimePair: "pi,codex",
    providerMode: "mock-openai",
    outputDir: ".artifacts/qa-e2e/jsonl-replay-test",
  });
});
|
||||
|
||||
it("delegates discovered qa runner registration through the generic host seam", () => {
|
||||
const [{ registration }] = listQaRunnerCliContributions.mock.results[0]?.value;
|
||||
expect(registration.register).toHaveBeenCalledTimes(1);
|
||||
|
||||
@@ -83,6 +83,17 @@ async function runQaCoverageReport(opts: {
|
||||
await runtime.runQaCoverageReportCommand(opts);
|
||||
}
|
||||
|
||||
/**
 * CLI-side entry for `qa jsonl-replay`: loads the QA lab runtime module on demand and
 * forwards the parsed options unchanged to `runQaJsonlReplayCommand`.
 */
async function runQaJsonlReplay(opts: {
  repoRoot?: string;
  transcripts?: string;
  outputDir?: string;
  runtimePair?: string;
  providerMode?: QaProviderModeInput;
}) {
  const runtime = await loadQaLabCliRuntime();
  await runtime.runQaJsonlReplayCommand(opts);
}
|
||||
|
||||
async function runQaCharacterEval(opts: {
|
||||
repoRoot?: string;
|
||||
outputDir?: string;
|
||||
@@ -402,6 +413,33 @@ export function registerQaLabCli(program: Command) {
|
||||
},
|
||||
);
|
||||
|
||||
// `qa jsonl-replay`: replays curated JSONL transcripts. The transcript directory, runtime
// pair and provider mode carry CLI-level defaults; --repo-root and --output-dir do not.
qa.command("jsonl-replay")
  .description("Replay curated JSONL transcripts through the runtime parity replay harness")
  .option("--repo-root <path>", "Repository root to target when running from a neutral cwd")
  .option(
    "--transcripts <path>",
    "Directory of curated JSONL transcripts",
    "qa/scenarios/jsonl-replay",
  )
  .option("--runtime-pair <pair>", "Runtime pair label, e.g. pi,codex", "pi,codex")
  .option(
    "--provider-mode <mode>",
    `Provider mode (${formatQaProviderModeHelp()})`,
    "mock-openai",
  )
  .option("--output-dir <path>", "Artifact directory for the JSONL replay report")
  .action(
    async (opts: {
      repoRoot?: string;
      transcripts?: string;
      runtimePair?: string;
      providerMode?: QaProviderModeInput;
      outputDir?: string;
    }) => {
      await runQaJsonlReplay(opts);
    },
  );
|
||||
|
||||
qa.command("character-eval")
|
||||
.description("Run the character QA scenario across live models and write a judged report")
|
||||
.option("--repo-root <path>", "Repository root to target when running from a neutral cwd")
|
||||
|
||||
169
extensions/qa-lab/src/jsonl-replay.test.ts
Normal file
169
extensions/qa-lab/src/jsonl-replay.test.ts
Normal file
@@ -0,0 +1,169 @@
|
||||
import fs from "node:fs/promises";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { afterEach, describe, expect, it } from "vitest";
|
||||
import {
|
||||
createMockJsonlReplayCellRunner,
|
||||
extractJsonlReplayUserTurns,
|
||||
renderJsonlReplayMarkdownReport,
|
||||
runJsonlReplay,
|
||||
type JsonlReplayCellRunner,
|
||||
} from "./jsonl-replay.js";
|
||||
import type { RuntimeId, RuntimeParityCell, RuntimeParityToolCall } from "./runtime-parity.js";
|
||||
|
||||
// Temp directories registered by makeTempDir; the afterEach hook drains and deletes them.
const tempRoots: string[] = [];
|
||||
|
||||
/**
 * Builds a runtime parity cell fixture for `runtime`. Defaults describe a tool-free turn
 * with the final text "same reply"; any field in `overrides` replaces its default.
 */
function makeCell(
  runtime: RuntimeId,
  overrides: Partial<RuntimeParityCell> = {},
): RuntimeParityCell {
  return {
    runtime,
    transcriptBytes: `{"message":{"role":"assistant","content":"${runtime} reply"}}\n`,
    toolCalls: [],
    finalText: "same reply",
    usage: {
      inputTokens: 10,
      outputTokens: 5,
      totalTokens: 15,
    },
    wallClockMs: 12,
    bootStateLines: [],
    // Spread last so callers can override any default above.
    ...overrides,
  };
}
|
||||
|
||||
/** Builds a `read` tool-call fixture; any field in `overrides` replaces its default. */
function makeToolCall(overrides: Partial<RuntimeParityToolCall> = {}): RuntimeParityToolCall {
  return {
    tool: "read",
    argsHash: "args-a",
    resultHash: "result-a",
    ...overrides,
  };
}
|
||||
|
||||
/** Creates a unique temp directory and records it in `tempRoots` for later cleanup. */
async function makeTempDir() {
  const tempRoot = await fs.mkdtemp(path.join(os.tmpdir(), "jsonl-replay-"));
  tempRoots.push(tempRoot);
  return tempRoot;
}
|
||||
|
||||
// Remove every temp directory created during the test. `splice(0)` empties the shared
// list in the same step, so a directory is never removed twice.
afterEach(async () => {
  await Promise.all(
    tempRoots.splice(0).map((tempRoot) => fs.rm(tempRoot, { recursive: true, force: true })),
  );
});
|
||||
|
||||
describe("jsonl replay", () => {
  it("extracts user-turn boundaries while ignoring system, tool-only, empty, and malformed rows", () => {
    const turns = extractJsonlReplayUserTurns(
      [
        `{"message":{"role":"system","content":"System setup"}}`,
        `{"message":{"role":"tool","content":"tool-only prelude"}}`,
        `{"message":{"role":"user","content":" "}}`,
        `{not-json`,
        `{"message":{"role":"assistant","content":"Ready."}}`,
        `{"message":{"role":"user","content":[{"type":"text","text":"Plan the release"},{"type":"tool_result","content":"ignored"}]}}`,
        `{"role":"user","content":[{"type":"input_text","text":"Check the follow-up"}]}`,
      ].join("\n"),
    );

    // Line numbers count raw input lines, including the skipped blank-user and malformed rows.
    expect(turns).toEqual([
      expect.objectContaining({
        turn: 1,
        lineNumber: 6,
        userText: "Plan the release",
      }),
      expect.objectContaining({
        turn: 2,
        lineNumber: 7,
        userText: "Check the follow-up",
      }),
    ]);
    // Valid non-user rows stay in the prefix; unparseable rows are dropped from it.
    expect(turns[0]?.transcriptPrefix).toContain(`"role":"system"`);
    expect(turns[0]?.transcriptPrefix).not.toContain("{not-json");
  });

  it("reports the earliest divergent turn using runtime parity drift classes", async () => {
    const transcriptDir = await makeTempDir();
    await fs.writeFile(
      path.join(transcriptDir, "three-turns.jsonl"),
      [
        `{"message":{"role":"user","content":"Turn one"}}`,
        `{"message":{"role":"assistant","content":"Ready"}}`,
        `{"message":{"role":"user","content":"Turn two"}}`,
        `{"message":{"role":"assistant","content":"Using a tool"}}`,
        `{"message":{"role":"user","content":"Turn three"}}`,
      ].join("\n"),
      "utf8",
    );

    // Turn 1 matches, turn 2 differs in tool-call arguments, turn 3 differs in final text only.
    const runCell: JsonlReplayCellRunner = async ({ runtime, turn }) => {
      if (turn.turn === 2) {
        return {
          scenarioStatus: "pass",
          cell: makeCell(runtime, {
            toolCalls: [makeToolCall(runtime === "pi" ? {} : { argsHash: "args-codex" })],
          }),
        };
      }
      if (turn.turn === 3) {
        return {
          scenarioStatus: "pass",
          cell: makeCell(runtime, {
            finalText: runtime === "pi" ? "pi wording" : "codex wording",
          }),
        };
      }
      return {
        scenarioStatus: "pass",
        cell: makeCell(runtime),
      };
    };

    const result = await runJsonlReplay(
      {
        directory: transcriptDir,
        runtimePair: ["pi", "codex"],
        providerMode: "mock-openai",
      },
      { runCell },
    );

    expect(result.transcripts).toHaveLength(1);
    expect(result.transcripts[0]).toEqual(
      expect.objectContaining({
        userTurnCount: 3,
        drift: ["none", "tool-call-shape", "text-only"],
        firstDriftAtTurn: 2,
      }),
    );
    expect(result.transcripts[0]?.cells.pi).toHaveLength(3);
    expect(result.transcripts[0]?.cells.codex).toHaveLength(3);
  });

  it("runs the curated replay fixture set in mock-openai mode", async () => {
    const fixtureDir = path.resolve("qa/scenarios/jsonl-replay");

    const result = await runJsonlReplay(
      {
        directory: fixtureDir,
        runtimePair: ["pi", "codex"],
        providerMode: "mock-openai",
      },
      { runCell: createMockJsonlReplayCellRunner() },
    );

    expect(result.transcripts).toHaveLength(7);
    expect(result.transcripts.map((entry) => entry.userTurnCount)).toEqual([2, 2, 3, 2, 2, 2, 2]);
    expect(result.transcripts.every((entry) => entry.firstDriftAtTurn === undefined)).toBe(true);
    // The empty "First drift turn" cell leaves two spaces between its surrounding pipes.
    expect(
      renderJsonlReplayMarkdownReport({
        generatedAt: "2026-05-10T00:00:00.000Z",
        providerMode: "mock-openai",
        runtimePair: ["pi", "codex"],
        transcripts: result.transcripts,
      }),
    ).toContain("| plan-mode-boundaries.jsonl | 3 |  | none, none, none |");
  });
});
|
||||
270
extensions/qa-lab/src/jsonl-replay.ts
Normal file
270
extensions/qa-lab/src/jsonl-replay.ts
Normal file
@@ -0,0 +1,270 @@
|
||||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import {
|
||||
runRuntimeParityScenario,
|
||||
type RuntimeId,
|
||||
type RuntimeParityCell,
|
||||
type RuntimeParityResult,
|
||||
type RuntimeParityScenarioExecution,
|
||||
} from "./runtime-parity.js";
|
||||
|
||||
/** Input accepted by `runJsonlReplay`. */
export type JsonlReplayInput = {
  /** Directory searched recursively for `.jsonl` transcripts. */
  directory: string;
  /** Runtimes to compare; the type only admits `pi` versus `codex`. */
  runtimePair: ["pi", "codex"];
  /** Provider mode passed through unchanged to the cell runner. */
  providerMode: "mock-openai" | "live-frontier";
};
|
||||
|
||||
/** One user turn extracted from a transcript by `extractJsonlReplayUserTurns`. */
export type JsonlReplayTurn = {
  /** 1-based position among the extracted user turns. */
  turn: number;
  /** 1-based line number of the user row in the raw transcript text. */
  lineNumber: number;
  /** Trimmed text of the user message. */
  userText: string;
  /** Accepted transcript rows up to and including this turn, newline-terminated. */
  transcriptPrefix: string;
};
|
||||
|
||||
/**
 * Callback that executes one user turn of one transcript on one runtime.
 * `runJsonlReplay` invokes it through `runRuntimeParityScenario` for every turn.
 */
export type JsonlReplayCellRunner = (params: {
  /** Runtime that should execute the turn. */
  runtime: RuntimeId;
  /** Path of the transcript file the turn was extracted from. */
  transcriptPath: string;
  /** The turn to replay. */
  turn: JsonlReplayTurn;
  /** All user turns extracted from the same transcript. */
  turns: readonly JsonlReplayTurn[];
  /** Provider mode taken from the replay input. */
  providerMode: JsonlReplayInput["providerMode"];
}) => Promise<RuntimeParityScenarioExecution>;
|
||||
|
||||
/** Result of `runJsonlReplay`: one entry per transcript file, in listing order. */
export type JsonlReplayResult = {
  transcripts: Array<{
    /** Path of the replayed transcript file. */
    transcriptPath: string;
    /** Number of user turns extracted from the transcript. */
    userTurnCount: number;
    /** Per-runtime cells, one per user turn, in turn order. */
    cells: { pi: RuntimeParityCell[]; codex: RuntimeParityCell[] };
    /** Drift classification per user turn, in turn order. */
    drift: Array<RuntimeParityResult["drift"]>;
    /** Turn number of the first drift other than `"none"`; absent when no turn drifted. */
    firstDriftAtTurn?: number;
  }>;
};
|
||||
|
||||
/** Optional collaborators for `runJsonlReplay`. */
export type JsonlReplayOptions = {
  /** Cell runner to use; when omitted, the replay fails with an explanatory error. */
  runCell?: JsonlReplayCellRunner;
};
|
||||
|
||||
/** Data rendered by `renderJsonlReplayMarkdownReport`. */
export type JsonlReplayMarkdownReport = {
  /** Timestamp printed verbatim in the report header. */
  generatedAt: string;
  /** Provider mode printed in the report header. */
  providerMode: JsonlReplayInput["providerMode"];
  /** Runtime pair named in the report title. */
  runtimePair: JsonlReplayInput["runtimePair"];
  /** Per-transcript replay results, one table row each. */
  transcripts: JsonlReplayResult["transcripts"];
};
|
||||
|
||||
/** Type guard: true for any non-null object that is not an array. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}
|
||||
|
||||
/**
 * Returns the trimmed string when `value` is a string with non-whitespace content,
 * otherwise `undefined`.
 */
function readString(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : undefined;
}
|
||||
|
||||
/**
 * Locates the message object in a transcript row. Two row shapes are accepted: a wrapper
 * with an object-valued `message` field, or a flat row that itself carries a non-empty
 * string `role`. Any other row yields `undefined`.
 */
function readReplayMessage(record: Record<string, unknown>): Record<string, unknown> | undefined {
  const wrapped = record.message;
  if (isRecord(wrapped)) {
    return wrapped;
  }
  return readString(record.role) === undefined ? undefined : record;
}
|
||||
|
||||
/** Returns the message role, trimmed and lower-cased, or `undefined` when it is absent or blank. */
function readRole(message: Record<string, unknown>) {
  const role = readString(message.role);
  return role?.toLowerCase();
}
|
||||
|
||||
/** Lower-cased content-block `type` values treated as plain text. */
const TEXT_LIKE_CONTENT_BLOCK_TYPES: ReadonlySet<string> = new Set([
  "text",
  "input_text",
  "message",
  "output_text",
  "user_text",
]);

/**
 * Decides whether a content block contributes text. Blocks without a usable `type` count
 * as text; typed blocks must match one of the known text types (case-insensitive).
 */
function isTextLikeContentBlock(block: Record<string, unknown>) {
  const type = readString(block.type)?.toLowerCase();
  // readString never returns an empty string, so `undefined` is the only "no type" case.
  return type === undefined || TEXT_LIKE_CONTENT_BLOCK_TYPES.has(type);
}
|
||||
|
||||
/**
 * Flattens message content into plain text.
 *
 * - A string is returned trimmed.
 * - An array contributes its non-blank string items and, for text-like object blocks,
 *   the block's `text` (falling back to `content`); parts are joined with newlines.
 * - Anything else yields an empty string.
 */
function extractTextContent(content: unknown): string {
  if (typeof content === "string") {
    return content.trim();
  }
  if (!Array.isArray(content)) {
    return "";
  }

  const blocks: unknown[] = content;
  const parts: string[] = [];
  for (const block of blocks) {
    if (typeof block === "string") {
      const trimmed = block.trim();
      if (trimmed) {
        parts.push(trimmed);
      }
      continue;
    }
    // Non-text blocks (for example tool results) are ignored.
    if (!isRecord(block) || !isTextLikeContentBlock(block)) {
      continue;
    }
    const text = readString(block.text) ?? readString(block.content);
    if (text) {
      parts.push(text);
    }
  }
  return parts.join("\n").trim();
}
|
||||
|
||||
/**
 * Splits raw JSONL transcript text into replayable user turns.
 *
 * Blank lines, lines that are not valid JSON, and JSON values that are not objects are
 * skipped and never appear in a prefix. Every other row is appended to the running
 * prefix, whether or not it is a user turn. A row becomes a turn when its message role
 * is `user` and its extracted text is non-empty.
 *
 * @param transcriptBytes Transcript text with LF or CRLF line endings.
 * @returns Turns in file order. `lineNumber` counts raw lines, including skipped ones.
 */
export function extractJsonlReplayUserTurns(transcriptBytes: string): JsonlReplayTurn[] {
  const turns: JsonlReplayTurn[] = [];
  const acceptedLines: string[] = [];
  const rawLines = transcriptBytes.split(/\r?\n/u);

  for (const [lineIndex, rawLine] of rawLines.entries()) {
    const trimmed = rawLine.trim();
    if (!trimmed) {
      continue;
    }

    let parsed: unknown;
    try {
      parsed = JSON.parse(trimmed) as unknown;
    } catch {
      // Malformed rows are tolerated: they are dropped rather than failing the replay.
      continue;
    }
    if (!isRecord(parsed)) {
      continue;
    }

    // Record the row before the role check so each prefix carries the preceding
    // system, assistant and tool rows as context.
    acceptedLines.push(trimmed);

    const message = readReplayMessage(parsed);
    if (!message || readRole(message) !== "user") {
      continue;
    }
    const userText = extractTextContent(message.content);
    if (!userText) {
      continue;
    }

    turns.push({
      turn: turns.length + 1,
      lineNumber: lineIndex + 1,
      userText,
      transcriptPrefix: `${acceptedLines.join("\n")}\n`,
    });
  }

  return turns;
}
|
||||
|
||||
/**
 * Recursively collects the `.jsonl` files below `directory`.
 *
 * Entries of each directory are visited in `localeCompare` order of their names, and a
 * subdirectory's files are emitted at the position of that subdirectory.
 *
 * @returns File paths built by joining `directory` with each entry name.
 */
async function listJsonlFiles(directory: string): Promise<string[]> {
  const entries = await fs.readdir(directory, { withFileTypes: true });
  // Sort a copy so the array returned by readdir is left untouched.
  const ordered = [...entries].sort((left, right) => left.name.localeCompare(right.name));

  const files: string[] = [];
  for (const entry of ordered) {
    const entryPath = path.join(directory, entry.name);
    if (entry.isDirectory()) {
      files.push(...(await listJsonlFiles(entryPath)));
    } else if (entry.isFile() && entry.name.endsWith(".jsonl")) {
      files.push(entryPath);
    }
  }
  return files;
}
|
||||
|
||||
/**
 * Fallback cell runner used when the caller supplies none. It always fails.
 *
 * Declared `async` so the failure arrives as a rejected promise, matching the declared
 * `Promise` return type, instead of a synchronous throw.
 *
 * @throws Error (as a rejection) explaining that a runtime cell runner is required.
 */
async function defaultRunCell(): Promise<RuntimeParityScenarioExecution> {
  throw new Error(
    "jsonl replay requires a runtime cell runner; CLI/suite wiring should provide the Phase 1 runtime parity runner",
  );
}
|
||||
|
||||
/**
 * Runtime guard for the replay pair. The static type already restricts it to pi/codex;
 * this check catches values that bypassed the type system.
 *
 * @throws Error naming the offending pair when it is not exactly `pi,codex`.
 */
function assertSupportedRuntimePair(runtimePair: JsonlReplayInput["runtimePair"]) {
  const [first, second] = runtimePair;
  if (first === "pi" && second === "codex") {
    return;
  }
  throw new Error(`unsupported jsonl replay runtime pair: ${runtimePair.join(",")}`);
}
|
||||
|
||||
/**
 * Creates a cell runner that fabricates a passing result without running any runtime.
 *
 * The output depends only on the turn, never on the runtime, so both runtimes produce
 * the same final text, tool calls and usage for a given turn. Input tokens are estimated
 * as one per four prefix characters (minimum 1); output tokens are fixed at 8.
 */
export function createMockJsonlReplayCellRunner(): JsonlReplayCellRunner {
  return async ({ runtime, turn }) => {
    const outputTokens = 8;
    const inputTokens = Math.max(1, Math.ceil(turn.transcriptPrefix.length / 4));
    return {
      scenarioStatus: "pass",
      cell: {
        runtime,
        transcriptBytes: turn.transcriptPrefix,
        toolCalls: [],
        finalText: `Replayed curated turn ${turn.turn}.`,
        usage: {
          inputTokens,
          outputTokens,
          totalTokens: inputTokens + outputTokens,
        },
        wallClockMs: 1,
        bootStateLines: [],
      },
    };
  };
}
|
||||
|
||||
/**
 * Replays every `.jsonl` transcript below `input.directory`, turn by turn, and records
 * the drift between the two runtimes.
 *
 * Transcripts and their turns are processed sequentially. Each user turn is run through
 * `runRuntimeParityScenario`, which calls the cell runner per runtime and classifies the
 * drift. The first turn whose drift is not `"none"` is recorded as `firstDriftAtTurn`.
 *
 * @param input Transcript directory, runtime pair and provider mode.
 * @param options Optional cell runner; without one, the first replayed turn fails.
 * @returns One result entry per transcript file, in listing order. A transcript with no
 *   user turns yields an entry with empty `cells` and `drift`.
 * @throws Error when the runtime pair is not `pi,codex`.
 */
export async function runJsonlReplay(
  input: JsonlReplayInput,
  options: JsonlReplayOptions = {},
): Promise<JsonlReplayResult> {
  assertSupportedRuntimePair(input.runtimePair);
  const directory = path.resolve(input.directory);
  const transcriptPaths = await listJsonlFiles(directory);
  const runCell = options.runCell ?? defaultRunCell;
  const transcripts: JsonlReplayResult["transcripts"] = [];

  for (const transcriptPath of transcriptPaths) {
    const transcriptBytes = await fs.readFile(transcriptPath, "utf8");
    const turns = extractJsonlReplayUserTurns(transcriptBytes);
    const cells: { pi: RuntimeParityCell[]; codex: RuntimeParityCell[] } = {
      pi: [],
      codex: [],
    };
    const drift: Array<RuntimeParityResult["drift"]> = [];
    let firstDriftAtTurn: number | undefined;

    for (const turn of turns) {
      const parity = await runRuntimeParityScenario({
        // The scenario id uses only the file name, not the path below `directory`.
        scenarioId: `${path.basename(transcriptPath)}#turn-${turn.turn}`,
        runCell: async (runtime) =>
          runCell({
            runtime,
            transcriptPath,
            turn,
            turns,
            providerMode: input.providerMode,
          }),
      });
      cells.pi.push(parity.cells.pi);
      cells.codex.push(parity.cells.codex);
      drift.push(parity.drift);
      if (firstDriftAtTurn === undefined && parity.drift !== "none") {
        firstDriftAtTurn = turn.turn;
      }
    }

    transcripts.push({
      transcriptPath,
      userTurnCount: turns.length,
      cells,
      drift,
      // Omit the key entirely when nothing drifted, so it is absent from JSON output.
      ...(firstDriftAtTurn !== undefined ? { firstDriftAtTurn } : {}),
    });
  }

  return { transcripts };
}
|
||||
|
||||
/**
 * Renders the replay result as Markdown: a title, a bullet summary, and a table with one
 * row per transcript.
 *
 * Each row shows the transcript's file name, its user-turn count, the first drifting
 * turn (an empty cell when nothing drifted) and the comma-separated drift sequence.
 *
 * @returns The report text, ending in exactly one newline.
 */
export function renderJsonlReplayMarkdownReport(report: JsonlReplayMarkdownReport): string {
  const [leftRuntime, rightRuntime] = report.runtimePair;
  const totalTurns = report.transcripts.reduce((sum, entry) => sum + entry.userTurnCount, 0);
  const driftedCount = report.transcripts.filter(
    (entry) => entry.firstDriftAtTurn !== undefined,
  ).length;

  const header = [
    `# OpenClaw JSONL Replay Report - ${leftRuntime} vs ${rightRuntime}`,
    "",
    `- Generated at: ${report.generatedAt}`,
    `- Provider mode: ${report.providerMode}`,
    `- Transcripts: ${report.transcripts.length}`,
    `- User turns: ${totalTurns}`,
    `- Drifted transcripts: ${driftedCount}`,
    "",
    "| Transcript | User turns | First drift turn | Drift sequence |",
    "| --- | ---: | ---: | --- |",
  ];

  const rows = report.transcripts.map(
    (transcript) =>
      `| ${path.basename(transcript.transcriptPath)} | ${transcript.userTurnCount} | ${transcript.firstDriftAtTurn ?? ""} | ${transcript.drift.join(", ")} |`,
  );

  return `${[...header, ...rows].join("\n").trimEnd()}\n`;
}
|
||||
6
qa/scenarios/jsonl-replay/approval-denial-retry.jsonl
Normal file
6
qa/scenarios/jsonl-replay/approval-denial-retry.jsonl
Normal file
@@ -0,0 +1,6 @@
|
||||
{"message":{"role":"system","content":"Curated replay fixture: approval denial then safe retry."}}
|
||||
{"message":{"role":"user","content":"Read the guarded config file, but do not modify anything without approval."}}
|
||||
{"message":{"role":"assistant","content":"I will request the read and stop before any write."}}
|
||||
{"message":{"role":"tool","content":"approval denied for write attempt; read-only path remains allowed"}}
|
||||
{"message":{"role":"user","content":"Retry with read-only inspection and summarize the safe path."}}
|
||||
{"message":{"role":"assistant","content":"The safe path is read-only inspection followed by a summary."}}
|
||||
6
qa/scenarios/jsonl-replay/gateway-restart-recovery.jsonl
Normal file
6
qa/scenarios/jsonl-replay/gateway-restart-recovery.jsonl
Normal file
@@ -0,0 +1,6 @@
|
||||
{"message":{"role":"system","content":"Curated replay fixture: gateway restart recovery."}}
|
||||
{"message":{"role":"user","content":"Start a long workspace check and record the marker RESTART-RECOVERY-A."}}
|
||||
{"message":{"role":"assistant","content":"Started the check and recorded RESTART-RECOVERY-A."}}
|
||||
{"message":{"role":"tool","content":"gateway restarted before final delivery; session state recovered"}}
|
||||
{"message":{"role":"user","content":"Continue after the restart and confirm no duplicate marker was sent."}}
|
||||
{"message":{"role":"assistant","content":"Recovered after restart and did not duplicate the marker."}}
|
||||
8
qa/scenarios/jsonl-replay/plan-mode-boundaries.jsonl
Normal file
8
qa/scenarios/jsonl-replay/plan-mode-boundaries.jsonl
Normal file
@@ -0,0 +1,8 @@
|
||||
{"message":{"role":"system","content":"Curated JSONL replay fixture: plan-mode boundary smoke. Synthetic data only."}}
|
||||
{"message":{"role":"tool","toolName":"status","content":"prelude status row before any user turn"}}
|
||||
{"message":{"role":"user","content":"Enter plan mode and report the mode fields only."}}
|
||||
{"message":{"role":"assistant","content":"inPlanMode=true mode=plan"}}
|
||||
{"message":{"role":"user","content":[{"type":"text","text":"Attempt a write while plan approval is pending."},{"type":"tool_result","content":"ignored tool result block"}]}}
|
||||
{"message":{"role":"assistant","content":"The write is blocked while plan approval is pending."}}
|
||||
{"message":{"role":"user","content":"Exit plan mode and stop after reporting status."}}
|
||||
{"message":{"role":"assistant","content":"inPlanMode=false mode=normal"}}
|
||||
@@ -0,0 +1,6 @@
|
||||
{"message":{"role":"system","content":"Curated replay fixture: plugin lifecycle with searchable tools."}}
|
||||
{"message":{"role":"user","content":"Install the demo plugin and discover its OpenClaw integration tool before calling it."}}
|
||||
{"message":{"role":"assistant","content":"The plugin is installed; I will discover the integration tool before use."}}
|
||||
{"message":{"role":"tool","content":"searchable tool discovery returned openclaw.demo.status"}}
|
||||
{"message":{"role":"user","content":"Call the discovered status tool and summarize the result."}}
|
||||
{"message":{"role":"assistant","content":"The discovered plugin status tool returned healthy."}}
|
||||
4
qa/scenarios/jsonl-replay/recovery-partial-session.jsonl
Normal file
4
qa/scenarios/jsonl-replay/recovery-partial-session.jsonl
Normal file
@@ -0,0 +1,4 @@
|
||||
{"message":{"role":"system","content":"Curated JSONL replay fixture: partial recovery session. Synthetic data only."}}
|
||||
{"message":{"role":"user","content":"A runtime produced an empty assistant response after a safe read. Summarize the recovery plan."}}
|
||||
{"message":{"role":"assistant","content":"Retry once with the same safe read, preserve tool results, and avoid mutating actions."}}
|
||||
{"message":{"role":"user","content":"Now replay the follow-up with only the safe read context and no network dependency."}}
|
||||
7
qa/scenarios/jsonl-replay/repo-triage-tool-loop.jsonl
Normal file
7
qa/scenarios/jsonl-replay/repo-triage-tool-loop.jsonl
Normal file
@@ -0,0 +1,7 @@
|
||||
{"message":{"role":"system","content":"Curated JSONL replay fixture: repository triage. Synthetic data only; no private transcript content."}}
|
||||
{"message":{"role":"user","content":"Review the QA fixture index and identify one missing runtime coverage row."}}
|
||||
{"message":{"role":"assistant","content":[{"type":"tool_use","id":"fixture_tool_1","name":"read","input":{"path":"qa/scenarios/index.md"}}]}}
|
||||
{"message":{"role":"tool","toolName":"read","content":"Runtime coverage index includes basic channel and model rows."}}
|
||||
{"message":{"role":"assistant","content":"The index has channel and model rows; runtime replay coverage is a good follow-up."}}
|
||||
{"message":{"role":"user","content":"Draft the smallest next test without editing production wiring."}}
|
||||
{"message":{"role":"assistant","content":"Add a fixture-only replay test and leave CLI wiring for the parent branch."}}
|
||||
6
qa/scenarios/jsonl-replay/workspace-edit-loop.jsonl
Normal file
6
qa/scenarios/jsonl-replay/workspace-edit-loop.jsonl
Normal file
@@ -0,0 +1,6 @@
|
||||
{"message":{"role":"system","content":"Curated replay fixture: workspace edit loop."}}
|
||||
{"message":{"role":"user","content":"Read README.md, patch the typo in the local draft, and show the diff summary."}}
|
||||
{"message":{"role":"assistant","content":"I read README.md, applied the local typo patch, and summarized the diff."}}
|
||||
{"message":{"role":"tool","content":"patch applied to draft.md with one changed line"}}
|
||||
{"message":{"role":"user","content":"Run the lightweight check and tell me whether the patch is safe."}}
|
||||
{"message":{"role":"assistant","content":"The lightweight check passed and the patch is safe."}}
|
||||
Reference in New Issue
Block a user