feat(qa-lab): select runtime parity tiers

This commit is contained in:
Vincent Koc
2026-05-18 00:16:04 +08:00
parent ea72414e1c
commit 79212f9869
8 changed files with 177 additions and 3 deletions

View File

@@ -897,6 +897,7 @@ jobs:
run: pnpm build
- name: Run runtime parity lane
id: runtime_parity_lane
run: |
set -euo pipefail
pnpm openclaw qa suite \
@@ -908,6 +909,19 @@ jobs:
--runtime-pair pi,codex \
--output-dir ".artifacts/qa-e2e/runtime-parity"
- name: Run standard runtime parity tier
if: ${{ always() && steps.runtime_parity_lane.outcome != 'skipped' && steps.runtime_parity_lane.outcome != 'cancelled' }}
run: |
set -euo pipefail
pnpm openclaw qa suite \
--provider-mode mock-openai \
--runtime-parity-tier standard \
--concurrency "${QA_PARITY_CONCURRENCY}" \
--model "${OPENCLAW_CI_OPENAI_MODEL}" \
--alt-model "openai/gpt-5.5-alt" \
--runtime-pair pi,codex \
--output-dir ".artifacts/qa-e2e/runtime-parity-standard"
- name: Generate runtime parity report
if: always()
run: |
@@ -918,6 +932,16 @@ jobs:
--summary .artifacts/qa-e2e/runtime-parity/qa-suite-summary.json \
--output-dir .artifacts/qa-e2e/runtime-parity-report
- name: Generate standard runtime parity report
if: always()
run: |
set -euo pipefail
pnpm openclaw qa parity-report \
--repo-root . \
--runtime-axis \
--summary .artifacts/qa-e2e/runtime-parity-standard/qa-suite-summary.json \
--output-dir .artifacts/qa-e2e/runtime-parity-standard-report
- name: Upload runtime parity artifacts
if: always()
uses: actions/upload-artifact@v4

View File

@@ -18,6 +18,7 @@ Docs: https://docs.openclaw.ai
- Plugins/messages: add presentation capability limits for channel renderers, adapt rich message controls before native rendering, and mark legacy `interactive`/Slack directive producer APIs as deprecated.
- Proxy: support HTTPS managed forward-proxy endpoints and scoped `proxy.tls.caFile` CA trust for proxy endpoint TLS. (#79171) Thanks @jesse-merhi.
- QA-Lab: add first-hour 20-turn and optional 100-turn runtime parity scenarios, with tier metadata for standard and soak QA gates. Fixes #80338; refs #80337. Thanks @100yenadmin.
- QA-Lab: add `openclaw qa suite --runtime-parity-tier` and wire the standard Codex-vs-Pi tier into release checks separately from optional/live-only/soak lanes. Fixes #80337. Thanks @100yenadmin.
- QA-Lab: add a live-only Codex Pi-shaped Read vocabulary canary so runtime parity catches native workspace-read prompt compatibility drift. (#80323) Thanks @100yenadmin.
- QA-Lab: add live-only harness self-health scenarios for plugin hook crashes, manifest contract errors, and WebChat direct-reply self-message routing. (#80323) Thanks @100yenadmin.
- QA-Lab: add runtime tool fixture scenarios and coverage reporting for Codex-native workspace tools, OpenClaw dynamic tools, and optional plugin-backed tools. Fixes #80173. Thanks @100yenadmin.

View File

@@ -782,6 +782,66 @@ describe("qa cli runtime", () => {
});
});
// Verifies that --runtime-parity-tier adds scenarios to an explicit scenario selection
// rather than replacing it.
it("expands runtime parity tier selections onto the suite scenario list", async () => {
await runQaSuiteCommand({
repoRoot: "/tmp/openclaw-repo",
runtimeParityTier: ["standard"],
scenarioIds: ["channel-chat-baseline", "runtime-tool-bash"],
});
expectFields(mockFirstObjectArg(runQaSuiteFromRuntime), {
repoRoot: path.resolve("/tmp/openclaw-repo"),
// The two explicitly requested ids stay first, in the order given; the remaining ids
// are the ones contributed by the "standard" tier.
// NOTE(review): the tier-contributed list presumably mirrors the scenario catalog's
// `standard`-tagged entries — update it when the catalog changes.
scenarioIds: [
"channel-chat-baseline",
"runtime-tool-bash",
"runtime-first-hour-20-turn",
"runtime-tool-apply-patch",
"runtime-tool-edit",
"runtime-tool-exec",
"runtime-tool-fs-list",
"runtime-tool-fs-read",
"runtime-tool-fs-write",
"runtime-tool-grep",
"runtime-tool-image-generate",
"runtime-tool-session-status",
"runtime-tool-sessions-spawn",
"runtime-tool-web-fetch",
"runtime-tool-web-search",
],
});
});
// Verifies that a single option value may carry several tiers separated by commas.
// No explicit scenario ids are passed, so the forwarded list comes from the tiers alone.
it("accepts comma-separated runtime parity tier filters", async () => {
await runQaSuiteCommand({
repoRoot: "/tmp/openclaw-repo",
runtimeParityTier: ["optional,soak"],
});
expectFields(mockFirstObjectArg(runQaSuiteFromRuntime), {
// NOTE(review): expected ids presumably track the catalog's `optional`- and
// `soak`-tagged scenarios, in catalog order — confirm against the scenario pack.
scenarioIds: [
"runtime-soak-100-turn",
"runtime-tool-memory-add",
"runtime-tool-memory-recall",
"runtime-tool-message-tool",
"runtime-tool-skill-invocation",
"runtime-tool-tavily-extract",
"runtime-tool-tavily-search",
"runtime-tool-tts",
],
});
});
// Verifies that an unrecognized tier name rejects the command. The error message must
// list the allowed tiers and echo the offending value.
it("rejects unknown runtime parity tier filters", async () => {
await expect(
runQaSuiteCommand({
repoRoot: "/tmp/openclaw-repo",
runtimeParityTier: ["standardish"],
}),
).rejects.toThrow(
'--runtime-parity-tier must be one of standard, optional, live-only, soak, got "standardish".',
);
});
it("rejects unknown suite packs", async () => {
await expect(
runQaSuiteCommand({

View File

@@ -42,7 +42,11 @@ import {
type QaProviderModeInput,
} from "./run-config.js";
import type { RuntimeId } from "./runtime-parity.js";
import { readQaScenarioPack } from "./scenario-catalog.js";
import {
QA_RUNTIME_PARITY_TIERS,
readQaScenarioPack,
type QaRuntimeParityTier,
} from "./scenario-catalog.js";
import { resolveQaScenarioPackScenarioIds } from "./scenario-packs.js";
import { runQaSuiteFromRuntime } from "./suite-launch.runtime.js";
import { readQaSuiteFailedScenarioCountFromSummary } from "./suite-summary.js";
@@ -163,6 +167,47 @@ function parseQaRuntimePair(value: string | undefined): [RuntimeId, RuntimeId] |
return ["pi", "codex"];
}
/**
 * Normalizes raw `--runtime-parity-tier` option values into a unique tier list.
 *
 * Every entry may contain several comma-separated tiers. Tokens are trimmed and
 * lower-cased, blank tokens are dropped, and duplicates collapse onto their first
 * occurrence so the returned order follows first appearance in the input.
 *
 * @param input - Raw option values; `undefined` behaves like an empty list.
 * @returns The distinct tier names, in first-seen order.
 * @throws Error when any token is not listed in `QA_RUNTIME_PARITY_TIERS`.
 */
function parseQaRuntimeParityTierFilters(input: string[] | undefined): QaRuntimeParityTier[] {
  // Phase 1: collect normalized tokens. A Set iterates in insertion order, which
  // gives de-duplication while preserving first-seen order.
  const tokens = new Set<string>();
  for (const entry of input ?? []) {
    for (const piece of entry.split(",")) {
      const token = piece.trim().toLowerCase();
      if (token) {
        tokens.add(token);
      }
    }
  }

  // Phase 2: validate only after all normalization is done, so the first invalid
  // token (in input order) is the one reported.
  const allowed: readonly string[] = QA_RUNTIME_PARITY_TIERS;
  const tiers: QaRuntimeParityTier[] = [];
  for (const token of tokens) {
    if (!allowed.includes(token)) {
      throw new Error(
        `--runtime-parity-tier must be one of ${QA_RUNTIME_PARITY_TIERS.join(", ")}, got "${token}".`,
      );
    }
    tiers.push(token as QaRuntimeParityTier);
  }
  return tiers;
}
/**
 * Extends a scenario id selection with every pack scenario tagged with one of the
 * requested runtime parity tiers.
 *
 * @param params.scenarioIds - Ids already selected; returned as-is (same array) when
 *   no tiers are requested.
 * @param params.runtimeParityTiers - Tiers whose scenarios should be added.
 * @returns The already-selected ids followed by the tier-matched ids, de-duplicated
 *   with first occurrence winning.
 * @throws Error when tiers were requested but no scenario in the pack carries any of them.
 */
function resolveQaRuntimeParityTierScenarioIds(params: {
  scenarioIds: string[];
  runtimeParityTiers: readonly QaRuntimeParityTier[];
}): string[] {
  const { scenarioIds, runtimeParityTiers } = params;
  if (runtimeParityTiers.length === 0) {
    return scenarioIds;
  }

  const wantedTiers = new Set(runtimeParityTiers);
  // Seed with the caller's ids so they keep their positions ahead of tier matches.
  const merged = new Set(scenarioIds);
  let matchCount = 0;
  for (const scenario of readQaScenarioPack().scenarios) {
    const tier = scenario.runtimeParityTier;
    if (tier && wantedTiers.has(tier)) {
      // Counted separately from `merged.size`: a match that was already selected
      // explicitly still counts as "the tier matched something".
      matchCount += 1;
      merged.add(scenario.id);
    }
  }

  if (matchCount === 0) {
    throw new Error(
      `--runtime-parity-tier matched no scenarios for ${params.runtimeParityTiers.join(", ")}.`,
    );
  }
  return [...merged];
}
async function readQaFailedScenarioCountFromSummary(summaryPath: string) {
let summaryText: string;
try {
@@ -513,17 +558,23 @@ export async function runQaSuiteCommand(opts: {
disk?: string;
preflight?: boolean;
runtimePair?: string;
runtimeParityTier?: string[];
}) {
const repoRoot = path.resolve(opts.repoRoot ?? process.cwd());
const transportId = normalizeQaTransportId(opts.transportId);
const runner = (opts.runner ?? "host").trim().toLowerCase();
const scenarioIds = resolveQaScenarioPackScenarioIds({
const explicitScenarioIds = resolveQaScenarioPackScenarioIds({
pack: opts.pack,
scenarioIds: resolveQaParityPackScenarioIds({
parityPack: opts.parityPack,
scenarioIds: opts.scenarioIds,
}),
});
const runtimeParityTiers = parseQaRuntimeParityTierFilters(opts.runtimeParityTier);
const scenarioIds = resolveQaRuntimeParityTierScenarioIds({
scenarioIds: explicitScenarioIds,
runtimeParityTiers,
});
const allowFailures = opts.allowFailures === true;
if (runner !== "host" && runner !== "multipass") {
throw new Error(`--runner must be one of host or multipass, got "${opts.runner}".`);

View File

@@ -576,6 +576,22 @@ describe("qa cli registration", () => {
expect(options.pack).toBe("personal-agent");
});
// Verifies that the flag is repeatable and that the CLI layer forwards each occurrence
// verbatim: the comma-separated value is passed through unsplit.
it("forwards --runtime-parity-tier for suite runs", async () => {
await program.parseAsync([
"node",
"openclaw",
"qa",
"suite",
"--runtime-parity-tier",
"standard",
"--runtime-parity-tier",
"optional,soak",
]);
const options = requireQaSuiteOptions();
expect(options.runtimeParityTier).toEqual(["standard", "optional,soak"]);
});
it("routes credential add flags into the qa runtime command", async () => {
await program.parseAsync([
"node",

View File

@@ -51,6 +51,7 @@ async function runQaSuite(opts: {
disk?: string;
preflight?: boolean;
runtimePair?: string;
runtimeParityTier?: string[];
}) {
const runtime = await loadQaLabCliRuntime();
await runtime.runQaSuiteCommand(opts);
@@ -286,6 +287,12 @@ export function registerQaLabCli(program: Command) {
.option("--memory <size>", "Multipass memory size")
.option("--disk <size>", "Multipass disk size")
.option("--runtime-pair <pair>", "Run each scenario under both runtimes, e.g. pi,codex")
.option(
"--runtime-parity-tier <tier>",
"Add scenarios tagged with runtimeParityTier (standard, optional, live-only, soak; repeatable or comma-separated)",
collectString,
[],
)
.action(
async (opts: {
repoRoot?: string;
@@ -310,6 +317,7 @@ export function registerQaLabCli(program: Command) {
disk?: string;
preflight?: boolean;
runtimePair?: string;
runtimeParityTier?: string[];
}) => {
await runQaSuite({
repoRoot: opts.repoRoot,
@@ -334,6 +342,7 @@ export function registerQaLabCli(program: Command) {
disk: opts.disk,
preflight: opts.preflight,
runtimePair: opts.runtimePair,
runtimeParityTier: opts.runtimeParityTier,
});
},
);

View File

@@ -93,7 +93,8 @@ const qaScenarioGatewayRuntimeSchema = z.object({
forwardHostHome: z.boolean().optional(),
});
const qaRuntimeParityTierSchema = z.enum(["standard", "optional", "live-only", "soak"]);
export const QA_RUNTIME_PARITY_TIERS = ["standard", "optional", "live-only", "soak"] as const;
const qaRuntimeParityTierSchema = z.enum(QA_RUNTIME_PARITY_TIERS);
const qaFlowCallActionSchema = z.object({
call: z.string().trim().min(1),

View File

@@ -25,6 +25,18 @@ Coverage tracking:
- treat the old `coverage: ["id"]` / `coverage: - id` list shape as invalid
- keep source-path tracking in the report, not in the scenario schema
Runtime parity tiers:
- `standard`: required Codex-vs-Pi mock gate coverage for first-hour depth and
default runtime-tool fixtures; selected with
`openclaw qa suite --runtime-pair pi,codex --runtime-parity-tier standard`
- `optional`: profile-, plugin-, or external-service-dependent runtime-tool
fixtures that stay out of the default release gate
- `live-only`: scenarios that need real provider/runtime behavior rather than
mock-openai fixtures
- `soak`: long-running scheduled or Testbox lanes such as the 100-turn parity
soak
Theme directories:
- `agents/` - agent behavior, instructions, subagent flows, and persisted child-link regressions