feat(qa-lab): select runtime parity tiers

2026-06-06 05:51:15 +08:00 · 2026-05-18 00:16:04 +08:00
parent ea72414e1c
commit 79212f9869
8 changed files with 177 additions and 3 deletions
--- a/.github/workflows/openclaw-release-checks.yml
+++ b/.github/workflows/openclaw-release-checks.yml
@@ -897,6 +897,7 @@ jobs:
        run: pnpm build

      - name: Run runtime parity lane
+        id: runtime_parity_lane
        run: |
          set -euo pipefail
          pnpm openclaw qa suite \
@@ -908,6 +909,19 @@ jobs:
            --runtime-pair pi,codex \
            --output-dir ".artifacts/qa-e2e/runtime-parity"

+      - name: Run standard runtime parity tier
+        if: ${{ always() && steps.runtime_parity_lane.outcome != 'skipped' && steps.runtime_parity_lane.outcome != 'cancelled' }}
+        run: |
+          set -euo pipefail
+          pnpm openclaw qa suite \
+            --provider-mode mock-openai \
+            --runtime-parity-tier standard \
+            --concurrency "${QA_PARITY_CONCURRENCY}" \
+            --model "${OPENCLAW_CI_OPENAI_MODEL}" \
+            --alt-model "openai/gpt-5.5-alt" \
+            --runtime-pair pi,codex \
+            --output-dir ".artifacts/qa-e2e/runtime-parity-standard"
+
      - name: Generate runtime parity report
        if: always()
        run: |
@@ -918,6 +932,16 @@ jobs:
            --summary .artifacts/qa-e2e/runtime-parity/qa-suite-summary.json \
            --output-dir .artifacts/qa-e2e/runtime-parity-report

+      - name: Generate standard runtime parity report
+        if: always()
+        run: |
+          set -euo pipefail
+          pnpm openclaw qa parity-report \
+            --repo-root . \
+            --runtime-axis \
+            --summary .artifacts/qa-e2e/runtime-parity-standard/qa-suite-summary.json \
+            --output-dir .artifacts/qa-e2e/runtime-parity-standard-report
+
      - name: Upload runtime parity artifacts
        if: always()
        uses: actions/upload-artifact@v4
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,7 @@ Docs: https://docs.openclaw.ai
 - Plugins/messages: add presentation capability limits for channel renderers, adapt rich message controls before native rendering, and mark legacy `interactive`/Slack directive producer APIs as deprecated.
 - Proxy: support HTTPS managed forward-proxy endpoints and scoped `proxy.tls.caFile` CA trust for proxy endpoint TLS. (#79171) Thanks @jesse-merhi.
 - QA-Lab: add first-hour 20-turn and optional 100-turn runtime parity scenarios, with tier metadata for standard and soak QA gates. Fixes #80338; refs #80337. Thanks @100yenadmin.
+- QA-Lab: add `openclaw qa suite --runtime-parity-tier` and wire the standard Codex-vs-Pi tier into release checks separately from optional/live-only/soak lanes. Fixes #80337. Thanks @100yenadmin.
 - QA-Lab: add a live-only Codex Pi-shaped Read vocabulary canary so runtime parity catches native workspace-read prompt compatibility drift. (#80323) Thanks @100yenadmin.
 - QA-Lab: add live-only harness self-health scenarios for plugin hook crashes, manifest contract errors, and WebChat direct-reply self-message routing. (#80323) Thanks @100yenadmin.
 - QA-Lab: add runtime tool fixture scenarios and coverage reporting for Codex-native workspace tools, OpenClaw dynamic tools, and optional plugin-backed tools. Fixes #80173. Thanks @100yenadmin.
--- a/extensions/qa-lab/src/cli.runtime.test.ts
+++ b/extensions/qa-lab/src/cli.runtime.test.ts
@@ -782,6 +782,66 @@ describe("qa cli runtime", () => {
    });
  });

+  it("expands runtime parity tier selections onto the suite scenario list", async () => {
+    await runQaSuiteCommand({
+      repoRoot: "/tmp/openclaw-repo",
+      runtimeParityTier: ["standard"],
+      scenarioIds: ["channel-chat-baseline", "runtime-tool-bash"],
+    });
+
+    expectFields(mockFirstObjectArg(runQaSuiteFromRuntime), {
+      repoRoot: path.resolve("/tmp/openclaw-repo"),
+      scenarioIds: [
+        "channel-chat-baseline",
+        "runtime-tool-bash",
+        "runtime-first-hour-20-turn",
+        "runtime-tool-apply-patch",
+        "runtime-tool-edit",
+        "runtime-tool-exec",
+        "runtime-tool-fs-list",
+        "runtime-tool-fs-read",
+        "runtime-tool-fs-write",
+        "runtime-tool-grep",
+        "runtime-tool-image-generate",
+        "runtime-tool-session-status",
+        "runtime-tool-sessions-spawn",
+        "runtime-tool-web-fetch",
+        "runtime-tool-web-search",
+      ],
+    });
+  });
+
+  it("accepts comma-separated runtime parity tier filters", async () => {
+    await runQaSuiteCommand({
+      repoRoot: "/tmp/openclaw-repo",
+      runtimeParityTier: ["optional,soak"],
+    });
+
+    expectFields(mockFirstObjectArg(runQaSuiteFromRuntime), {
+      scenarioIds: [
+        "runtime-soak-100-turn",
+        "runtime-tool-memory-add",
+        "runtime-tool-memory-recall",
+        "runtime-tool-message-tool",
+        "runtime-tool-skill-invocation",
+        "runtime-tool-tavily-extract",
+        "runtime-tool-tavily-search",
+        "runtime-tool-tts",
+      ],
+    });
+  });
+
+  it("rejects unknown runtime parity tier filters", async () => {
+    await expect(
+      runQaSuiteCommand({
+        repoRoot: "/tmp/openclaw-repo",
+        runtimeParityTier: ["standardish"],
+      }),
+    ).rejects.toThrow(
+      '--runtime-parity-tier must be one of standard, optional, live-only, soak, got "standardish".',
+    );
+  });
+
  it("rejects unknown suite packs", async () => {
    await expect(
      runQaSuiteCommand({
--- a/extensions/qa-lab/src/cli.runtime.ts
+++ b/extensions/qa-lab/src/cli.runtime.ts
@@ -42,7 +42,11 @@ import {
  type QaProviderModeInput,
 } from "./run-config.js";
 import type { RuntimeId } from "./runtime-parity.js";
-import { readQaScenarioPack } from "./scenario-catalog.js";
+import {
+  QA_RUNTIME_PARITY_TIERS,
+  readQaScenarioPack,
+  type QaRuntimeParityTier,
+} from "./scenario-catalog.js";
 import { resolveQaScenarioPackScenarioIds } from "./scenario-packs.js";
 import { runQaSuiteFromRuntime } from "./suite-launch.runtime.js";
 import { readQaSuiteFailedScenarioCountFromSummary } from "./suite-summary.js";
@@ -163,6 +167,47 @@ function parseQaRuntimePair(value: string | undefined): [RuntimeId, RuntimeId] |
  return ["pi", "codex"];
 }

+/**
+ * Normalizes raw `--runtime-parity-tier` values into a list of tiers.
+ *
+ * Each entry may hold several comma-separated tiers. Pieces are trimmed and
+ * lowercased, blank pieces are ignored, and duplicates collapse onto their
+ * first occurrence so the returned order follows the order given.
+ *
+ * @param input - Collected option values; `undefined` is treated as empty.
+ * @returns The distinct tiers in first-seen order (empty when none were given).
+ * @throws Error when a normalized piece is not listed in `QA_RUNTIME_PARITY_TIERS`.
+ */
+function parseQaRuntimeParityTierFilters(input: string[] | undefined): QaRuntimeParityTier[] {
+  const knownTiers: readonly string[] = QA_RUNTIME_PARITY_TIERS;
+  // A Set keeps insertion order, which gives de-duplication and stable ordering.
+  const selected = new Set<string>();
+  for (const entry of input ?? []) {
+    for (const piece of entry.split(",")) {
+      const tier = piece.trim().toLowerCase();
+      if (tier.length === 0) {
+        continue;
+      }
+      if (!knownTiers.includes(tier)) {
+        throw new Error(
+          `--runtime-parity-tier must be one of ${QA_RUNTIME_PARITY_TIERS.join(", ")}, got "${tier}".`,
+        );
+      }
+      selected.add(tier);
+    }
+  }
+  // Every member was checked against the known tier list above.
+  return [...selected] as QaRuntimeParityTier[];
+}
+
+/**
+ * Extends a scenario id list with every scenario tagged with a selected tier.
+ *
+ * With no tiers selected the incoming `scenarioIds` array is returned as-is.
+ * Otherwise the scenarios from `readQaScenarioPack()` whose `runtimeParityTier`
+ * is among the selected tiers are appended after the incoming ids, and
+ * duplicates are dropped in favor of their first occurrence.
+ *
+ * @param params.scenarioIds - Scenario ids that were already selected.
+ * @param params.runtimeParityTiers - Tiers whose scenarios should be added.
+ * @returns The merged, de-duplicated scenario id list.
+ * @throws Error when tiers were selected but no scenario carries any of them.
+ */
+function resolveQaRuntimeParityTierScenarioIds(params: {
+  scenarioIds: string[];
+  runtimeParityTiers: readonly QaRuntimeParityTier[];
+}): string[] {
+  const { scenarioIds, runtimeParityTiers } = params;
+  if (runtimeParityTiers.length === 0) {
+    return scenarioIds;
+  }
+  const wantedTiers = new Set(runtimeParityTiers);
+  // Seeding with the incoming ids keeps them ahead of tier matches.
+  const mergedIds = new Set(scenarioIds);
+  let matchCount = 0;
+  for (const scenario of readQaScenarioPack().scenarios) {
+    const tier = scenario.runtimeParityTier;
+    if (tier && wantedTiers.has(tier)) {
+      matchCount += 1;
+      mergedIds.add(scenario.id);
+    }
+  }
+  if (matchCount === 0) {
+    throw new Error(
+      `--runtime-parity-tier matched no scenarios for ${runtimeParityTiers.join(", ")}.`,
+    );
+  }
+  return [...mergedIds];
+}
+
 async function readQaFailedScenarioCountFromSummary(summaryPath: string) {
  let summaryText: string;
  try {
@@ -513,17 +558,23 @@ export async function runQaSuiteCommand(opts: {
  disk?: string;
  preflight?: boolean;
  runtimePair?: string;
+  runtimeParityTier?: string[];
 }) {
  const repoRoot = path.resolve(opts.repoRoot ?? process.cwd());
  const transportId = normalizeQaTransportId(opts.transportId);
  const runner = (opts.runner ?? "host").trim().toLowerCase();
-  const scenarioIds = resolveQaScenarioPackScenarioIds({
+  const explicitScenarioIds = resolveQaScenarioPackScenarioIds({
    pack: opts.pack,
    scenarioIds: resolveQaParityPackScenarioIds({
      parityPack: opts.parityPack,
      scenarioIds: opts.scenarioIds,
    }),
  });
+  const runtimeParityTiers = parseQaRuntimeParityTierFilters(opts.runtimeParityTier);
+  const scenarioIds = resolveQaRuntimeParityTierScenarioIds({
+    scenarioIds: explicitScenarioIds,
+    runtimeParityTiers,
+  });
  const allowFailures = opts.allowFailures === true;
  if (runner !== "host" && runner !== "multipass") {
    throw new Error(`--runner must be one of host or multipass, got "${opts.runner}".`);
--- a/extensions/qa-lab/src/cli.test.ts
+++ b/extensions/qa-lab/src/cli.test.ts
@@ -576,6 +576,22 @@ describe("qa cli registration", () => {
    expect(options.pack).toBe("personal-agent");
  });

+  it("forwards --runtime-parity-tier for suite runs", async () => {
+    await program.parseAsync([
+      "node",
+      "openclaw",
+      "qa",
+      "suite",
+      "--runtime-parity-tier",
+      "standard",
+      "--runtime-parity-tier",
+      "optional,soak",
+    ]);
+
+    const options = requireQaSuiteOptions();
+    expect(options.runtimeParityTier).toEqual(["standard", "optional,soak"]);
+  });
+
  it("routes credential add flags into the qa runtime command", async () => {
    await program.parseAsync([
      "node",
--- a/extensions/qa-lab/src/cli.ts
+++ b/extensions/qa-lab/src/cli.ts
@@ -51,6 +51,7 @@ async function runQaSuite(opts: {
  disk?: string;
  preflight?: boolean;
  runtimePair?: string;
+  runtimeParityTier?: string[];
 }) {
  const runtime = await loadQaLabCliRuntime();
  await runtime.runQaSuiteCommand(opts);
@@ -286,6 +287,12 @@ export function registerQaLabCli(program: Command) {
    .option("--memory <size>", "Multipass memory size")
    .option("--disk <size>", "Multipass disk size")
    .option("--runtime-pair <pair>", "Run each scenario under both runtimes, e.g. pi,codex")
+    .option(
+      "--runtime-parity-tier <tier>",
+      "Add scenarios tagged with runtimeParityTier (standard, optional, live-only, soak; repeatable or comma-separated)",
+      collectString,
+      [],
+    )
    .action(
      async (opts: {
        repoRoot?: string;
@@ -310,6 +317,7 @@ export function registerQaLabCli(program: Command) {
        disk?: string;
        preflight?: boolean;
        runtimePair?: string;
+        runtimeParityTier?: string[];
      }) => {
        await runQaSuite({
          repoRoot: opts.repoRoot,
@@ -334,6 +342,7 @@ export function registerQaLabCli(program: Command) {
          disk: opts.disk,
          preflight: opts.preflight,
          runtimePair: opts.runtimePair,
+          runtimeParityTier: opts.runtimeParityTier,
        });
      },
    );
--- a/extensions/qa-lab/src/scenario-catalog.ts
+++ b/extensions/qa-lab/src/scenario-catalog.ts
@@ -93,7 +93,8 @@ const qaScenarioGatewayRuntimeSchema = z.object({
  forwardHostHome: z.boolean().optional(),
 });

-const qaRuntimeParityTierSchema = z.enum(["standard", "optional", "live-only", "soak"]);
+export const QA_RUNTIME_PARITY_TIERS = ["standard", "optional", "live-only", "soak"] as const;
+const qaRuntimeParityTierSchema = z.enum(QA_RUNTIME_PARITY_TIERS);

 const qaFlowCallActionSchema = z.object({
  call: z.string().trim().min(1),
--- a/qa/scenarios/index.md
+++ b/qa/scenarios/index.md
@@ -25,6 +25,18 @@ Coverage tracking:
 - treat the old `coverage: ["id"]` / `coverage: - id` list shape as invalid
 - keep source-path tracking in the report, not in the scenario schema

+Runtime parity tiers:
+
+- `standard`: required Codex-vs-Pi mock gate coverage for first-hour depth and
+  default runtime-tool fixtures; selected with
+  `openclaw qa suite --runtime-pair pi,codex --runtime-parity-tier standard`
+- `optional`: profile-, plugin-, or external-service-dependent runtime-tool
+  fixtures that stay out of the default release gate
+- `live-only`: scenarios that need real provider/runtime behavior rather than
+  mock-openai fixtures
+- `soak`: long-running scheduled or Testbox lanes such as the 100-turn parity
+  soak
+
 Theme directories:

 - `agents/` - agent behavior, instructions, subagent flows, and persisted child-link regressions