feat(qa-lab): add jsonl replay harness

2026-06-06 05:51:15 +08:00 · 2026-05-21 15:27:24 +08:00
parent 66dcc4ee8f
commit cf0657852f
14 changed files with 641 additions and 0 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ Docs: https://docs.openclaw.ai
 - Gateway/plugins: reuse a compatible Gateway startup plugin registry during dispatch so safe plugin dispatches avoid redundant registry loading. (#84324) Thanks @ai-hpc.
 - Dependencies: refresh provider, plugin, UI, and tooling packages, update `protobufjs` to 8.4.0 to clear the current npm advisory, and carry the Claude ACP completion patch forward to `@agentclientprotocol/claude-agent-acp` 0.36.1.
 - Agents/tools: remove the old sender-owner tool gating path so configured tools stay visible for trusted sessions while command and channel-action auth still carry real sender identity.
+- QA-Lab: add curated mock JSONL replay fixtures and first-drift reporting for runtime-parity audits. (#80323, refs #80176) Thanks @100yenadmin.
 - Tests/perf: isolate doctor core health check unit coverage from real skills/workspace discovery so `doctor-core-checks` no longer dominates unit perf while keeping one real skills-readiness smoke. (#84493) Thanks @frankekn.

 ### Fixes
--- a/extensions/qa-lab/src/cli.runtime.test.ts
+++ b/extensions/qa-lab/src/cli.runtime.test.ts
@@ -76,6 +76,7 @@ import {
  runQaDockerUpCommand,
  runQaCharacterEvalCommand,
  runQaCoverageReportCommand,
+  runQaJsonlReplayCommand,
  runQaManualLaneCommand,
  runQaParityReportCommand,
  runQaSuiteCommand,
@@ -1084,6 +1085,44 @@ describe("qa cli runtime", () => {
    expectWriteContains(stdoutWrite, "codex-native-workspace");
  });

+  it("writes a curated mock JSONL replay report and summary", async () => {
+    const repoRoot = await fs.mkdtemp(path.join(os.tmpdir(), "qa-jsonl-replay-cli-"));
+    try {
+      await runQaJsonlReplayCommand({
+        repoRoot,
+        transcripts: path.resolve("qa/scenarios/jsonl-replay"),
+        outputDir: "jsonl-output",
+        runtimePair: "pi,codex",
+      });
+
+      const report = await fs.readFile(
+        path.join(repoRoot, "jsonl-output", "qa-jsonl-replay-report.md"),
+        "utf8",
+      );
+      const summary = JSON.parse(
+        await fs.readFile(
+          path.join(repoRoot, "jsonl-output", "qa-jsonl-replay-summary.json"),
+          "utf8",
+        ),
+      ) as { transcripts?: Array<{ userTurnCount?: number }> };
+
+      expect(report).toContain("# OpenClaw JSONL Replay Report - pi vs codex");
+      expect(report).toContain("| plan-mode-boundaries.jsonl | 3 |  | none, none, none |");
+      expect(summary.transcripts).toHaveLength(7);
+    } finally {
+      await fs.rm(repoRoot, { recursive: true, force: true });
+    }
+  });
+
+  it("keeps JSONL replay mock-only until real runtime cell replay is wired", async () => {
+    await expect(
+      runQaJsonlReplayCommand({
+        repoRoot: process.cwd(),
+        providerMode: "live-frontier",
+      }),
+    ).rejects.toThrow("qa jsonl-replay currently supports mock-openai curated fixtures only.");
+  });
+
  it("exits nonzero when tool coverage summary is missing a required runtime tool call", async () => {
    const priorExitCode = process.exitCode;
    const repoRoot = await fs.mkdtemp(path.join(os.tmpdir(), "qa-tool-coverage-"));
--- a/extensions/qa-lab/src/cli.runtime.ts
+++ b/extensions/qa-lab/src/cli.runtime.ts
@@ -16,6 +16,12 @@ import { buildQaCoverageInventory, renderQaCoverageMarkdownReport } from "./cove
 import { buildQaDockerHarnessImage, writeQaDockerHarnessFiles } from "./docker-harness.js";
 import { runQaDockerUp } from "./docker-up.runtime.js";
 import type { QaCliBackendAuthMode } from "./gateway-child.js";
+import {
+  createMockJsonlReplayCellRunner,
+  renderJsonlReplayMarkdownReport,
+  runJsonlReplay,
+  type JsonlReplayInput,
+} from "./jsonl-replay.js";
 import { startQaLabServer } from "./lab-server.js";
 import { runQaManualLane } from "./manual-lane.runtime.js";
 import { runQaMultipass } from "./multipass.runtime.js";
@@ -820,6 +826,50 @@ export async function runQaCoverageReportCommand(opts: {
  process.stdout.write(body);
 }

+/**
+ * Runs `qa jsonl-replay`: replays curated transcripts through the mock cell runner, then writes
+ * a Markdown report and a JSON summary to the output directory and prints both paths.
+ * @throws when the runtime pair is not "pi,codex" or the provider mode is not mock-openai.
+ */
+export async function runQaJsonlReplayCommand(opts: {
+  repoRoot?: string;
+  transcripts?: string;
+  outputDir?: string;
+  runtimePair?: string;
+  providerMode?: QaProviderModeInput;
+}) {
+  const repoRoot = path.resolve(opts.repoRoot ?? process.cwd());
+  const requestedPair = parseQaRuntimePair(opts.runtimePair) ?? ["pi", "codex"];
+  if (requestedPair[0] !== "pi" || requestedPair[1] !== "codex") {
+    throw new Error('--runtime-pair for jsonl-replay must be "pi,codex".');
+  }
+  // Validated above, so the typed tuple is built directly instead of asserting the parsed value.
+  const runtimePair: JsonlReplayInput["runtimePair"] = ["pi", "codex"];
+  const providerMode = normalizeQaProviderMode(opts.providerMode ?? "mock-openai");
+  if (providerMode !== "mock-openai") {
+    throw new Error("qa jsonl-replay currently supports mock-openai curated fixtures only.");
+  }
+  const directory = path.resolve(repoRoot, opts.transcripts ?? "qa/scenarios/jsonl-replay");
+  const outputDir =
+    resolveRepoRelativeOutputDir(repoRoot, opts.outputDir) ??
+    path.join(repoRoot, ".artifacts", "qa-e2e", `jsonl-replay-${Date.now().toString(36)}`);
+  await fs.mkdir(outputDir, { recursive: true });
+  const runCell = createMockJsonlReplayCellRunner();
+  const result = await runJsonlReplay({ directory, runtimePair, providerMode }, { runCell });
+  const report = renderJsonlReplayMarkdownReport({
+    generatedAt: new Date().toISOString(),
+    providerMode,
+    runtimePair,
+    transcripts: result.transcripts,
+  });
+  const reportPath = path.join(outputDir, "qa-jsonl-replay-report.md");
+  const summaryPath = path.join(outputDir, "qa-jsonl-replay-summary.json");
+  await fs.writeFile(reportPath, report, "utf8");
+  await fs.writeFile(summaryPath, `${JSON.stringify(result, null, 2)}\n`, "utf8");
+  process.stdout.write(`QA JSONL replay report: ${reportPath}\n`);
+  process.stdout.write(`QA JSONL replay summary: ${summaryPath}\n`);
+}
+
 export async function runQaCharacterEvalCommand(opts: {
  repoRoot?: string;
  outputDir?: string;
--- a/extensions/qa-lab/src/cli.test.ts
+++ b/extensions/qa-lab/src/cli.test.ts
@@ -45,6 +45,7 @@ const {
  runQaCredentialsListCommand,
  runQaCredentialsRemoveCommand,
  runQaCoverageReportCommand,
+  runQaJsonlReplayCommand,
  runQaProviderServerCommand,
  runQaSuiteCommand,
  runQaTelegramCommand,
@@ -58,6 +59,7 @@ const {
  runQaCredentialsListCommand: vi.fn(),
  runQaCredentialsRemoveCommand: vi.fn(),
  runQaCoverageReportCommand: vi.fn(),
+  runQaJsonlReplayCommand: vi.fn(),
  runQaProviderServerCommand: vi.fn(),
  runQaSuiteCommand: vi.fn(),
  runQaTelegramCommand: vi.fn(),
@@ -113,6 +115,7 @@ vi.mock("./cli.runtime.js", () => ({
  runQaCredentialsListCommand,
  runQaCredentialsRemoveCommand,
  runQaCoverageReportCommand,
+  runQaJsonlReplayCommand,
  runQaProviderServerCommand,
  runQaSuiteCommand,
 }));
@@ -128,6 +131,7 @@ describe("qa cli registration", () => {
    runQaCredentialsListCommand.mockReset();
    runQaCredentialsRemoveCommand.mockReset();
    runQaCoverageReportCommand.mockReset();
+    runQaJsonlReplayCommand.mockReset();
    runQaProviderServerCommand.mockReset();
    runQaSuiteCommand.mockReset();
    runQaTelegramCommand.mockReset();
@@ -480,6 +484,33 @@ describe("qa cli registration", () => {
    });
  });

+  it("routes JSONL replay flags into the qa runtime command", async () => {
+    await program.parseAsync([
+      "node",
+      "openclaw",
+      "qa",
+      "jsonl-replay",
+      "--repo-root",
+      "/tmp/openclaw-repo",
+      "--transcripts",
+      "qa/scenarios/jsonl-replay",
+      "--runtime-pair",
+      "pi,codex",
+      "--provider-mode",
+      "mock-openai",
+      "--output-dir",
+      ".artifacts/qa-e2e/jsonl-replay-test",
+    ]);
+
+    expect(runQaJsonlReplayCommand).toHaveBeenCalledWith({
+      repoRoot: "/tmp/openclaw-repo",
+      transcripts: "qa/scenarios/jsonl-replay",
+      runtimePair: "pi,codex",
+      providerMode: "mock-openai",
+      outputDir: ".artifacts/qa-e2e/jsonl-replay-test",
+    });
+  });
+
  it("delegates discovered qa runner registration through the generic host seam", () => {
    const [{ registration }] = listQaRunnerCliContributions.mock.results[0]?.value;
    expect(registration.register).toHaveBeenCalledTimes(1);
--- a/extensions/qa-lab/src/cli.ts
+++ b/extensions/qa-lab/src/cli.ts
@@ -83,6 +83,17 @@ async function runQaCoverageReport(opts: {
  await runtime.runQaCoverageReportCommand(opts);
 }

+/** Loads the QA lab CLI runtime and forwards the `jsonl-replay` options to its command. */
+async function runQaJsonlReplay(opts: {
+  repoRoot?: string;
+  transcripts?: string;
+  outputDir?: string;
+  runtimePair?: string;
+  providerMode?: QaProviderModeInput;
+}) {
+  await (await loadQaLabCliRuntime()).runQaJsonlReplayCommand(opts);
+}
+
 async function runQaCharacterEval(opts: {
  repoRoot?: string;
  outputDir?: string;
@@ -402,6 +413,33 @@ export function registerQaLabCli(program: Command) {
      },
    );

+  qa.command("jsonl-replay")
+    .description("Replay curated JSONL transcripts through the runtime parity replay harness")
+    .option("--repo-root <path>", "Repository root to target when running from a neutral cwd")
+    .option(
+      "--transcripts <path>",
+      "Directory of curated JSONL transcripts",
+      "qa/scenarios/jsonl-replay",
+    )
+    .option("--runtime-pair <pair>", "Runtime pair label, e.g. pi,codex", "pi,codex")
+    .option(
+      "--provider-mode <mode>",
+      `Provider mode (${formatQaProviderModeHelp()})`,
+      "mock-openai",
+    )
+    .option("--output-dir <path>", "Artifact directory for the JSONL replay report")
+    .action(
+      async (opts: {
+        repoRoot?: string;
+        transcripts?: string;
+        runtimePair?: string;
+        providerMode?: QaProviderModeInput;
+        outputDir?: string;
+      }) => {
+        await runQaJsonlReplay(opts);
+      },
+    );
+
  qa.command("character-eval")
    .description("Run the character QA scenario across live models and write a judged report")
    .option("--repo-root <path>", "Repository root to target when running from a neutral cwd")
--- a/extensions/qa-lab/src/jsonl-replay.test.ts
+++ b/extensions/qa-lab/src/jsonl-replay.test.ts
@@ -0,0 +1,169 @@
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+import { afterEach, describe, expect, it } from "vitest";
+import {
+  createMockJsonlReplayCellRunner,
+  extractJsonlReplayUserTurns,
+  renderJsonlReplayMarkdownReport,
+  runJsonlReplay,
+  type JsonlReplayCellRunner,
+} from "./jsonl-replay.js";
+import type { RuntimeId, RuntimeParityCell, RuntimeParityToolCall } from "./runtime-parity.js";
+
+const tempRoots: string[] = [];
+
+function makeCell(
+  runtime: RuntimeId,
+  overrides: Partial<RuntimeParityCell> = {},
+): RuntimeParityCell {
+  return {
+    runtime,
+    transcriptBytes: `{"message":{"role":"assistant","content":"${runtime} reply"}}\n`,
+    toolCalls: [],
+    finalText: "same reply",
+    usage: {
+      inputTokens: 10,
+      outputTokens: 5,
+      totalTokens: 15,
+    },
+    wallClockMs: 12,
+    bootStateLines: [],
+    ...overrides,
+  };
+}
+
+function makeToolCall(overrides: Partial<RuntimeParityToolCall> = {}): RuntimeParityToolCall {
+  return {
+    tool: "read",
+    argsHash: "args-a",
+    resultHash: "result-a",
+    ...overrides,
+  };
+}
+
+async function makeTempDir() {
+  const tempRoot = await fs.mkdtemp(path.join(os.tmpdir(), "jsonl-replay-"));
+  tempRoots.push(tempRoot);
+  return tempRoot;
+}
+
+afterEach(async () => {
+  await Promise.all(
+    tempRoots.splice(0).map((tempRoot) => fs.rm(tempRoot, { recursive: true, force: true })),
+  );
+});
+
+describe("jsonl replay", () => {
+  it("extracts user-turn boundaries while ignoring system, tool-only, empty, and malformed rows", () => {
+    const turns = extractJsonlReplayUserTurns(
+      [
+        `{"message":{"role":"system","content":"System setup"}}`,
+        `{"message":{"role":"tool","content":"tool-only prelude"}}`,
+        `{"message":{"role":"user","content":"   "}}`,
+        `{not-json`,
+        `{"message":{"role":"assistant","content":"Ready."}}`,
+        `{"message":{"role":"user","content":[{"type":"text","text":"Plan the release"},{"type":"tool_result","content":"ignored"}]}}`,
+        `{"role":"user","content":[{"type":"input_text","text":"Check the follow-up"}]}`,
+      ].join("\n"),
+    );
+
+    expect(turns).toEqual([
+      expect.objectContaining({
+        turn: 1,
+        lineNumber: 6,
+        userText: "Plan the release",
+      }),
+      expect.objectContaining({
+        turn: 2,
+        lineNumber: 7,
+        userText: "Check the follow-up",
+      }),
+    ]);
+    expect(turns[0]?.transcriptPrefix).toContain(`"role":"system"`);
+    expect(turns[0]?.transcriptPrefix).not.toContain("{not-json");
+  });
+
+  it("reports the earliest divergent turn using runtime parity drift classes", async () => {
+    const transcriptDir = await makeTempDir();
+    await fs.writeFile(
+      path.join(transcriptDir, "three-turns.jsonl"),
+      [
+        `{"message":{"role":"user","content":"Turn one"}}`,
+        `{"message":{"role":"assistant","content":"Ready"}}`,
+        `{"message":{"role":"user","content":"Turn two"}}`,
+        `{"message":{"role":"assistant","content":"Using a tool"}}`,
+        `{"message":{"role":"user","content":"Turn three"}}`,
+      ].join("\n"),
+      "utf8",
+    );
+
+    const runCell: JsonlReplayCellRunner = async ({ runtime, turn }) => {
+      if (turn.turn === 2) {
+        return {
+          scenarioStatus: "pass",
+          cell: makeCell(runtime, {
+            toolCalls: [makeToolCall(runtime === "pi" ? {} : { argsHash: "args-codex" })],
+          }),
+        };
+      }
+      if (turn.turn === 3) {
+        return {
+          scenarioStatus: "pass",
+          cell: makeCell(runtime, {
+            finalText: runtime === "pi" ? "pi wording" : "codex wording",
+          }),
+        };
+      }
+      return {
+        scenarioStatus: "pass",
+        cell: makeCell(runtime),
+      };
+    };
+
+    const result = await runJsonlReplay(
+      {
+        directory: transcriptDir,
+        runtimePair: ["pi", "codex"],
+        providerMode: "mock-openai",
+      },
+      { runCell },
+    );
+
+    expect(result.transcripts).toHaveLength(1);
+    expect(result.transcripts[0]).toEqual(
+      expect.objectContaining({
+        userTurnCount: 3,
+        drift: ["none", "tool-call-shape", "text-only"],
+        firstDriftAtTurn: 2,
+      }),
+    );
+    expect(result.transcripts[0]?.cells.pi).toHaveLength(3);
+    expect(result.transcripts[0]?.cells.codex).toHaveLength(3);
+  });
+
+  it("runs the curated replay fixture set in mock-openai mode", async () => {
+    const fixtureDir = path.resolve("qa/scenarios/jsonl-replay");
+
+    const result = await runJsonlReplay(
+      {
+        directory: fixtureDir,
+        runtimePair: ["pi", "codex"],
+        providerMode: "mock-openai",
+      },
+      { runCell: createMockJsonlReplayCellRunner() },
+    );
+
+    expect(result.transcripts).toHaveLength(7);
+    expect(result.transcripts.map((entry) => entry.userTurnCount)).toEqual([2, 2, 3, 2, 2, 2, 2]);
+    expect(result.transcripts.every((entry) => entry.firstDriftAtTurn === undefined)).toBe(true);
+    expect(
+      renderJsonlReplayMarkdownReport({
+        generatedAt: "2026-05-10T00:00:00.000Z",
+        providerMode: "mock-openai",
+        runtimePair: ["pi", "codex"],
+        transcripts: result.transcripts,
+      }),
+    ).toContain("| plan-mode-boundaries.jsonl | 3 |  | none, none, none |");
+  });
+});
--- a/extensions/qa-lab/src/jsonl-replay.ts
+++ b/extensions/qa-lab/src/jsonl-replay.ts
@@ -0,0 +1,270 @@
+import fs from "node:fs/promises";
+import path from "node:path";
+import {
+  runRuntimeParityScenario,
+  type RuntimeId,
+  type RuntimeParityCell,
+  type RuntimeParityResult,
+  type RuntimeParityScenarioExecution,
+} from "./runtime-parity.js";
+
+export type JsonlReplayInput = {
+  directory: string;
+  runtimePair: ["pi", "codex"];
+  providerMode: "mock-openai" | "live-frontier";
+};
+
+export type JsonlReplayTurn = {
+  turn: number;
+  lineNumber: number;
+  userText: string;
+  transcriptPrefix: string;
+};
+
+export type JsonlReplayCellRunner = (params: {
+  runtime: RuntimeId;
+  transcriptPath: string;
+  turn: JsonlReplayTurn;
+  turns: readonly JsonlReplayTurn[];
+  providerMode: JsonlReplayInput["providerMode"];
+}) => Promise<RuntimeParityScenarioExecution>;
+
+export type JsonlReplayResult = {
+  transcripts: Array<{
+    transcriptPath: string;
+    userTurnCount: number;
+    cells: { pi: RuntimeParityCell[]; codex: RuntimeParityCell[] };
+    drift: Array<RuntimeParityResult["drift"]>;
+    firstDriftAtTurn?: number;
+  }>;
+};
+
+export type JsonlReplayOptions = {
+  runCell?: JsonlReplayCellRunner;
+};
+
+export type JsonlReplayMarkdownReport = {
+  generatedAt: string;
+  providerMode: JsonlReplayInput["providerMode"];
+  runtimePair: JsonlReplayInput["runtimePair"];
+  transcripts: JsonlReplayResult["transcripts"];
+};
+
+/** Type guard: true only for non-null, non-array objects. */
+const isRecord = (value: unknown): value is Record<string, unknown> =>
+  typeof value === "object" && value !== null && !Array.isArray(value);
+
+/** Returns the trimmed string, or undefined for non-strings and blank strings. */
+const readString = (value: unknown): string | undefined =>
+  typeof value === "string" ? value.trim() || undefined : undefined;
+
+/** Returns the nested `message` record, else the row itself when it carries a role. */
+function readReplayMessage(record: Record<string, unknown>): Record<string, unknown> | undefined {
+  const nested = record.message;
+  if (isRecord(nested)) return nested;
+  return readString(record.role) === undefined ? undefined : record;
+}
+
+/** Lower-cased, trimmed `role` of a message, or undefined when it is missing or blank. */
+const readRole = (message: Record<string, unknown>): string | undefined =>
+  readString(message.role)?.toLowerCase();
+
+/** True when a content block has no usable `type`, or a type that carries plain text. */
+function isTextLikeContentBlock(block: Record<string, unknown>) {
+  const type = readString(block.type)?.toLowerCase();
+  if (type === undefined) {
+    // Untyped blocks are accepted as text.
+    return true;
+  }
+  // Any other type (e.g. `tool_result`) is rejected.
+  const textLikeTypes = ["text", "input_text", "message", "output_text", "user_text"];
+  return textLikeTypes.includes(type);
+}
+
+/**
+ * Flattens message `content` into trimmed text: a string is trimmed as-is; an array contributes
+ * its non-blank string items and the `text` (else `content`) of text-like blocks, joined by
+ * newlines. Any other shape yields "".
+ */
+function extractTextContent(content: unknown): string {
+  if (typeof content === "string") {
+    return content.trim();
+  }
+  if (!Array.isArray(content)) {
+    return "";
+  }
+  const pieces = content.flatMap((item: unknown): string[] => {
+    if (typeof item === "string") {
+      const trimmedItem = item.trim();
+      return trimmedItem ? [trimmedItem] : [];
+    }
+    if (isRecord(item) && isTextLikeContentBlock(item)) {
+      const blockText = readString(item.text) ?? readString(item.content);
+      return blockText ? [blockText] : [];
+    }
+    return [];
+  });
+  return pieces.join("\n").trim();
+}
+
+/**
+ * Splits a JSONL transcript into replayable user turns. Blank lines, unparseable lines, and
+ * rows that are not JSON objects are dropped; every other row is kept in the running prefix.
+ * A turn is emitted for each user-role row with non-empty text; its `transcriptPrefix` holds
+ * all kept rows up to and including that row, and `lineNumber` is the 1-based source line.
+ */
+export function extractJsonlReplayUserTurns(transcriptBytes: string): JsonlReplayTurn[] {
+  const keptRows: string[] = [];
+  const userTurns: JsonlReplayTurn[] = [];
+  transcriptBytes.split(/\r?\n/u).forEach((rawLine, index) => {
+    const row = rawLine.trim();
+    if (row.length === 0) {
+      return;
+    }
+    let value: unknown;
+    try {
+      value = JSON.parse(row);
+    } catch {
+      return; // Malformed rows are skipped and never reach the prefix.
+    }
+    if (!isRecord(value)) {
+      return;
+    }
+    keptRows.push(row);
+    const message = readReplayMessage(value);
+    const userText =
+      message && readRole(message) === "user" ? extractTextContent(message.content) : "";
+    if (userText) {
+      const lineNumber = index + 1;
+      const transcriptPrefix = `${keptRows.join("\n")}\n`;
+      userTurns.push({ turn: userTurns.length + 1, lineNumber, userText, transcriptPrefix });
+    }
+  });
+  return userTurns;
+}
+
+/** Recursively collects `*.jsonl` files under `directory`, entries sorted by `localeCompare`. */
+async function listJsonlFiles(directory: string): Promise<string[]> {
+  const dirents = await fs.readdir(directory, { withFileTypes: true });
+  const ordered = dirents.toSorted((a, b) => a.name.localeCompare(b.name));
+  const found: string[] = [];
+  for (const dirent of ordered) {
+    const fullPath = path.join(directory, dirent.name);
+    if (dirent.isDirectory()) {
+      found.push(...(await listJsonlFiles(fullPath)));
+    } else if (dirent.isFile() && dirent.name.endsWith(".jsonl")) {
+      found.push(fullPath);
+    }
+  }
+  return found;
+}
+
+/** Fallback cell runner used when none is injected; it always rejects with a wiring hint. */
+function defaultRunCell(): Promise<RuntimeParityScenarioExecution> {
+  const hint = "CLI/suite wiring should provide the Phase 1 runtime parity runner";
+  return Promise.reject(new Error(`jsonl replay requires a runtime cell runner; ${hint}`));
+}
+
+/** Throws unless the runtime pair is exactly ["pi", "codex"], in that order. */
+function assertSupportedRuntimePair(runtimePair: JsonlReplayInput["runtimePair"]) {
+  if (runtimePair[0] === "pi" && runtimePair[1] === "codex") return;
+  throw new Error(`unsupported jsonl replay runtime pair: ${runtimePair.join(",")}`);
+}
+
+/**
+ * Builds a deterministic runner that performs no runtime work: every cell passes, echoes the
+ * turn's transcript prefix, and reports usage derived from the prefix length.
+ */
+export function createMockJsonlReplayCellRunner(): JsonlReplayCellRunner {
+  return async ({ runtime, turn }) => {
+    const transcriptBytes = turn.transcriptPrefix;
+    // Input tokens: a quarter of the prefix length, rounded up, never below one.
+    const inputTokens = Math.max(1, Math.ceil(transcriptBytes.length / 4));
+    const usage = { inputTokens, outputTokens: 8, totalTokens: inputTokens + 8 };
+    const finalText = `Replayed curated turn ${turn.turn}.`;
+    const cell = { runtime, transcriptBytes, toolCalls: [], finalText, usage };
+    return {
+      scenarioStatus: "pass",
+      cell: { ...cell, wallClockMs: 1, bootStateLines: [] },
+    };
+  };
+}
+
+/**
+ * Replays every `*.jsonl` transcript under `input.directory`, one user turn at a time, through
+ * `runRuntimeParityScenario`, recording each turn's cells and drift plus the first turn whose
+ * drift is not "none". When `options.runCell` is omitted, `defaultRunCell` is used.
+ *
+ * @throws when `input.runtimePair` is not ["pi", "codex"].
+ */
+export async function runJsonlReplay(
+  input: JsonlReplayInput,
+  options: JsonlReplayOptions = {},
+): Promise<JsonlReplayResult> {
+  assertSupportedRuntimePair(input.runtimePair);
+  const { providerMode } = input;
+  const cellRunner = options.runCell ?? defaultRunCell;
+  const directory = path.resolve(input.directory);
+  const transcriptPaths = await listJsonlFiles(directory);
+  const transcripts: JsonlReplayResult["transcripts"] = [];
+
+  // Transcripts and turns run strictly in sequence so results keep listing and turn order.
+  for (const transcriptPath of transcriptPaths) {
+    const transcriptBytes = await fs.readFile(transcriptPath, "utf8");
+    const turns = extractJsonlReplayUserTurns(transcriptBytes);
+    const scenarioPrefix = path.basename(transcriptPath);
+    const piCells: RuntimeParityCell[] = [];
+    const codexCells: RuntimeParityCell[] = [];
+    const drift: Array<RuntimeParityResult["drift"]> = [];
+
+    for (const turn of turns) {
+      const parity = await runRuntimeParityScenario({
+        scenarioId: `${scenarioPrefix}#turn-${turn.turn}`,
+        runCell: async (runtime) =>
+          cellRunner({ runtime, transcriptPath, turn, turns, providerMode }),
+      });
+      piCells.push(parity.cells.pi);
+      codexCells.push(parity.cells.codex);
+      drift.push(parity.drift);
+    }
+
+    // Turn numbers are 1-based positions, so the first drifting index maps to index + 1.
+    const firstDriftIndex = drift.findIndex((entry) => entry !== "none");
+    transcripts.push({
+      transcriptPath,
+      userTurnCount: turns.length,
+      cells: { pi: piCells, codex: codexCells },
+      drift,
+      ...(firstDriftIndex >= 0 ? { firstDriftAtTurn: firstDriftIndex + 1 } : {}),
+    });
+  }
+
+  return { transcripts };
+}
+
+/**
+ * Renders the replay result as Markdown: a title, summary bullets, and one table row per
+ * transcript. Pipes in transcript file names are escaped so they cannot split a table cell.
+ */
+export function renderJsonlReplayMarkdownReport(report: JsonlReplayMarkdownReport): string {
+  const totalTurns = report.transcripts.reduce((sum, entry) => sum + entry.userTurnCount, 0);
+  const drifted = report.transcripts.filter((t) => t.firstDriftAtTurn !== undefined).length;
+  const rows = report.transcripts.map((entry) => {
+    const fileName = path.basename(entry.transcriptPath).replace(/\|/gu, "\\|");
+    return `| ${fileName} | ${entry.userTurnCount} | ${entry.firstDriftAtTurn ?? ""} | ${entry.drift.join(", ")} |`;
+  });
+  const lines = [
+    `# OpenClaw JSONL Replay Report - ${report.runtimePair[0]} vs ${report.runtimePair[1]}`,
+    "",
+    `- Generated at: ${report.generatedAt}`,
+    `- Provider mode: ${report.providerMode}`,
+    `- Transcripts: ${report.transcripts.length}`,
+    `- User turns: ${totalTurns}`,
+    `- Drifted transcripts: ${drifted}`,
+    "",
+    "| Transcript | User turns | First drift turn | Drift sequence |",
+    "| --- | ---: | ---: | --- |",
+    ...rows,
+  ];
+  return `${lines.join("\n").trimEnd()}\n`;
+}
--- a/qa/scenarios/jsonl-replay/approval-denial-retry.jsonl
+++ b/qa/scenarios/jsonl-replay/approval-denial-retry.jsonl
@@ -0,0 +1,6 @@
+{"message":{"role":"system","content":"Curated replay fixture: approval denial then safe retry."}}
+{"message":{"role":"user","content":"Read the guarded config file, but do not modify anything without approval."}}
+{"message":{"role":"assistant","content":"I will request the read and stop before any write."}}
+{"message":{"role":"tool","content":"approval denied for write attempt; read-only path remains allowed"}}
+{"message":{"role":"user","content":"Retry with read-only inspection and summarize the safe path."}}
+{"message":{"role":"assistant","content":"The safe path is read-only inspection followed by a summary."}}
--- a/qa/scenarios/jsonl-replay/gateway-restart-recovery.jsonl
+++ b/qa/scenarios/jsonl-replay/gateway-restart-recovery.jsonl
@@ -0,0 +1,6 @@
+{"message":{"role":"system","content":"Curated replay fixture: gateway restart recovery."}}
+{"message":{"role":"user","content":"Start a long workspace check and record the marker RESTART-RECOVERY-A."}}
+{"message":{"role":"assistant","content":"Started the check and recorded RESTART-RECOVERY-A."}}
+{"message":{"role":"tool","content":"gateway restarted before final delivery; session state recovered"}}
+{"message":{"role":"user","content":"Continue after the restart and confirm no duplicate marker was sent."}}
+{"message":{"role":"assistant","content":"Recovered after restart and did not duplicate the marker."}}
--- a/qa/scenarios/jsonl-replay/plan-mode-boundaries.jsonl
+++ b/qa/scenarios/jsonl-replay/plan-mode-boundaries.jsonl
@@ -0,0 +1,8 @@
+{"message":{"role":"system","content":"Curated JSONL replay fixture: plan-mode boundary smoke. Synthetic data only."}}
+{"message":{"role":"tool","toolName":"status","content":"prelude status row before any user turn"}}
+{"message":{"role":"user","content":"Enter plan mode and report the mode fields only."}}
+{"message":{"role":"assistant","content":"inPlanMode=true mode=plan"}}
+{"message":{"role":"user","content":[{"type":"text","text":"Attempt a write while plan approval is pending."},{"type":"tool_result","content":"ignored tool result block"}]}}
+{"message":{"role":"assistant","content":"The write is blocked while plan approval is pending."}}
+{"message":{"role":"user","content":"Exit plan mode and stop after reporting status."}}
+{"message":{"role":"assistant","content":"inPlanMode=false mode=normal"}}
--- a/qa/scenarios/jsonl-replay/plugin-lifecycle-searchable-tools.jsonl
+++ b/qa/scenarios/jsonl-replay/plugin-lifecycle-searchable-tools.jsonl
@@ -0,0 +1,6 @@
+{"message":{"role":"system","content":"Curated replay fixture: plugin lifecycle with searchable tools."}}
+{"message":{"role":"user","content":"Install the demo plugin and discover its OpenClaw integration tool before calling it."}}
+{"message":{"role":"assistant","content":"The plugin is installed; I will discover the integration tool before use."}}
+{"message":{"role":"tool","content":"searchable tool discovery returned openclaw.demo.status"}}
+{"message":{"role":"user","content":"Call the discovered status tool and summarize the result."}}
+{"message":{"role":"assistant","content":"The discovered plugin status tool returned healthy."}}
--- a/qa/scenarios/jsonl-replay/recovery-partial-session.jsonl
+++ b/qa/scenarios/jsonl-replay/recovery-partial-session.jsonl
@@ -0,0 +1,4 @@
+{"message":{"role":"system","content":"Curated JSONL replay fixture: partial recovery session. Synthetic data only."}}
+{"message":{"role":"user","content":"A runtime produced an empty assistant response after a safe read. Summarize the recovery plan."}}
+{"message":{"role":"assistant","content":"Retry once with the same safe read, preserve tool results, and avoid mutating actions."}}
+{"message":{"role":"user","content":"Now replay the follow-up with only the safe read context and no network dependency."}}
--- a/qa/scenarios/jsonl-replay/repo-triage-tool-loop.jsonl
+++ b/qa/scenarios/jsonl-replay/repo-triage-tool-loop.jsonl
@@ -0,0 +1,7 @@
+{"message":{"role":"system","content":"Curated JSONL replay fixture: repository triage. Synthetic data only; no private transcript content."}}
+{"message":{"role":"user","content":"Review the QA fixture index and identify one missing runtime coverage row."}}
+{"message":{"role":"assistant","content":[{"type":"tool_use","id":"fixture_tool_1","name":"read","input":{"path":"qa/scenarios/index.md"}}]}}
+{"message":{"role":"tool","toolName":"read","content":"Runtime coverage index includes basic channel and model rows."}}
+{"message":{"role":"assistant","content":"The index has channel and model rows; runtime replay coverage is a good follow-up."}}
+{"message":{"role":"user","content":"Draft the smallest next test without editing production wiring."}}
+{"message":{"role":"assistant","content":"Add a fixture-only replay test and leave CLI wiring for the parent branch."}}
--- a/qa/scenarios/jsonl-replay/workspace-edit-loop.jsonl
+++ b/qa/scenarios/jsonl-replay/workspace-edit-loop.jsonl
@@ -0,0 +1,6 @@
+{"message":{"role":"system","content":"Curated replay fixture: workspace edit loop."}}
+{"message":{"role":"user","content":"Read README.md, patch the typo in the local draft, and show the diff summary."}}
+{"message":{"role":"assistant","content":"I read README.md, applied the local typo patch, and summarized the diff."}}
+{"message":{"role":"tool","content":"patch applied to draft.md with one changed line"}}
+{"message":{"role":"user","content":"Run the lightweight check and tell me whether the patch is safe."}}
+{"message":{"role":"assistant","content":"The lightweight check passed and the patch is safe."}}