mirror of
https://github.com/openclaw/openclaw.git
synced 2026-06-06 05:51:15 +08:00
feat(qa-lab): select runtime parity tiers
This commit is contained in:
24
.github/workflows/openclaw-release-checks.yml
vendored
24
.github/workflows/openclaw-release-checks.yml
vendored
@@ -897,6 +897,7 @@ jobs:
|
||||
run: pnpm build
|
||||
|
||||
- name: Run runtime parity lane
|
||||
id: runtime_parity_lane
|
||||
run: |
|
||||
set -euo pipefail
|
||||
pnpm openclaw qa suite \
|
||||
@@ -908,6 +909,19 @@ jobs:
|
||||
--runtime-pair pi,codex \
|
||||
--output-dir ".artifacts/qa-e2e/runtime-parity"
|
||||
|
||||
- name: Run standard runtime parity tier
|
||||
if: ${{ always() && steps.runtime_parity_lane.outcome != 'skipped' && steps.runtime_parity_lane.outcome != 'cancelled' }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
pnpm openclaw qa suite \
|
||||
--provider-mode mock-openai \
|
||||
--runtime-parity-tier standard \
|
||||
--concurrency "${QA_PARITY_CONCURRENCY}" \
|
||||
--model "${OPENCLAW_CI_OPENAI_MODEL}" \
|
||||
--alt-model "openai/gpt-5.5-alt" \
|
||||
--runtime-pair pi,codex \
|
||||
--output-dir ".artifacts/qa-e2e/runtime-parity-standard"
|
||||
|
||||
- name: Generate runtime parity report
|
||||
if: always()
|
||||
run: |
|
||||
@@ -918,6 +932,16 @@ jobs:
|
||||
--summary .artifacts/qa-e2e/runtime-parity/qa-suite-summary.json \
|
||||
--output-dir .artifacts/qa-e2e/runtime-parity-report
|
||||
|
||||
- name: Generate standard runtime parity report
|
||||
if: always()
|
||||
run: |
|
||||
set -euo pipefail
|
||||
pnpm openclaw qa parity-report \
|
||||
--repo-root . \
|
||||
--runtime-axis \
|
||||
--summary .artifacts/qa-e2e/runtime-parity-standard/qa-suite-summary.json \
|
||||
--output-dir .artifacts/qa-e2e/runtime-parity-standard-report
|
||||
|
||||
- name: Upload runtime parity artifacts
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4
|
||||
|
||||
@@ -18,6 +18,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Plugins/messages: add presentation capability limits for channel renderers, adapt rich message controls before native rendering, and mark legacy `interactive`/Slack directive producer APIs as deprecated.
|
||||
- Proxy: support HTTPS managed forward-proxy endpoints and scoped `proxy.tls.caFile` CA trust for proxy endpoint TLS. (#79171) Thanks @jesse-merhi.
|
||||
- QA-Lab: add first-hour 20-turn and optional 100-turn runtime parity scenarios, with tier metadata for standard and soak QA gates. Fixes #80338; refs #80337. Thanks @100yenadmin.
|
||||
- QA-Lab: add `openclaw qa suite --runtime-parity-tier` and wire the standard Codex-vs-Pi tier into release checks separately from optional/live-only/soak lanes. Fixes #80337. Thanks @100yenadmin.
|
||||
- QA-Lab: add a live-only Codex Pi-shaped Read vocabulary canary so runtime parity catches native workspace-read prompt compatibility drift. (#80323) Thanks @100yenadmin.
|
||||
- QA-Lab: add live-only harness self-health scenarios for plugin hook crashes, manifest contract errors, and WebChat direct-reply self-message routing. (#80323) Thanks @100yenadmin.
|
||||
- QA-Lab: add runtime tool fixture scenarios and coverage reporting for Codex-native workspace tools, OpenClaw dynamic tools, and optional plugin-backed tools. Fixes #80173. Thanks @100yenadmin.
|
||||
|
||||
@@ -782,6 +782,66 @@ describe("qa cli runtime", () => {
|
||||
});
|
||||
});
|
||||
|
||||
it("expands runtime parity tier selections onto the suite scenario list", async () => {
  // Scenario ids passed explicitly alongside the tier filter.
  const explicitScenarioIds = ["channel-chat-baseline", "runtime-tool-bash"];
  const repoRoot = "/tmp/openclaw-repo";

  await runQaSuiteCommand({
    repoRoot,
    runtimeParityTier: ["standard"],
    scenarioIds: explicitScenarioIds,
  });

  // The explicit ids are expected first, followed by the ids added for the
  // selected tier; "runtime-tool-bash" must appear only once.
  const expectedScenarioIds = [
    "channel-chat-baseline",
    "runtime-tool-bash",
    "runtime-first-hour-20-turn",
    "runtime-tool-apply-patch",
    "runtime-tool-edit",
    "runtime-tool-exec",
    "runtime-tool-fs-list",
    "runtime-tool-fs-read",
    "runtime-tool-fs-write",
    "runtime-tool-grep",
    "runtime-tool-image-generate",
    "runtime-tool-session-status",
    "runtime-tool-sessions-spawn",
    "runtime-tool-web-fetch",
    "runtime-tool-web-search",
  ];
  const forwardedOptions = mockFirstObjectArg(runQaSuiteFromRuntime);
  expectFields(forwardedOptions, {
    repoRoot: path.resolve(repoRoot),
    scenarioIds: expectedScenarioIds,
  });
});
|
||||
|
||||
it("accepts comma-separated runtime parity tier filters", async () => {
  // A single flag value carrying two tiers separated by a comma.
  const tierFlagValues = ["optional,soak"];

  await runQaSuiteCommand({
    repoRoot: "/tmp/openclaw-repo",
    runtimeParityTier: tierFlagValues,
  });

  const expectedScenarioIds = [
    "runtime-soak-100-turn",
    "runtime-tool-memory-add",
    "runtime-tool-memory-recall",
    "runtime-tool-message-tool",
    "runtime-tool-skill-invocation",
    "runtime-tool-tavily-extract",
    "runtime-tool-tavily-search",
    "runtime-tool-tts",
  ];
  const forwardedOptions = mockFirstObjectArg(runQaSuiteFromRuntime);
  expectFields(forwardedOptions, { scenarioIds: expectedScenarioIds });
});
|
||||
|
||||
it("rejects unknown runtime parity tier filters", async () => {
  // The message lists every valid tier and echoes the offending value.
  const expectedMessage =
    '--runtime-parity-tier must be one of standard, optional, live-only, soak, got "standardish".';

  const pendingRun = runQaSuiteCommand({
    repoRoot: "/tmp/openclaw-repo",
    runtimeParityTier: ["standardish"],
  });

  await expect(pendingRun).rejects.toThrow(expectedMessage);
});
|
||||
|
||||
it("rejects unknown suite packs", async () => {
|
||||
await expect(
|
||||
runQaSuiteCommand({
|
||||
|
||||
@@ -42,7 +42,11 @@ import {
|
||||
type QaProviderModeInput,
|
||||
} from "./run-config.js";
|
||||
import type { RuntimeId } from "./runtime-parity.js";
|
||||
import { readQaScenarioPack } from "./scenario-catalog.js";
|
||||
import {
|
||||
QA_RUNTIME_PARITY_TIERS,
|
||||
readQaScenarioPack,
|
||||
type QaRuntimeParityTier,
|
||||
} from "./scenario-catalog.js";
|
||||
import { resolveQaScenarioPackScenarioIds } from "./scenario-packs.js";
|
||||
import { runQaSuiteFromRuntime } from "./suite-launch.runtime.js";
|
||||
import { readQaSuiteFailedScenarioCountFromSummary } from "./suite-summary.js";
|
||||
@@ -163,6 +167,47 @@ function parseQaRuntimePair(value: string | undefined): [RuntimeId, RuntimeId] |
|
||||
return ["pi", "codex"];
|
||||
}
|
||||
|
||||
/**
 * Normalizes the raw `--runtime-parity-tier` flag values into a list of tiers.
 *
 * Each raw value is split on commas; every piece is trimmed and lowercased,
 * empty pieces are dropped, and duplicates are removed while keeping the
 * order of first occurrence.
 *
 * @param input - Raw flag values, or `undefined` when the flag was not given.
 * @returns The distinct tiers that were requested (empty when none were).
 * @throws Error when a normalized value is not listed in
 *   `QA_RUNTIME_PARITY_TIERS`; the message names the first such value.
 */
function parseQaRuntimeParityTierFilters(input: string[] | undefined): QaRuntimeParityTier[] {
  // A Set keeps insertion order, which gives de-duplication with stable ordering.
  const requested = new Set<string>();
  for (const rawValue of input ?? []) {
    for (const piece of rawValue.split(",")) {
      const normalized = piece.trim().toLowerCase();
      if (normalized) {
        requested.add(normalized);
      }
    }
  }

  const knownTiers = new Set<string>(QA_RUNTIME_PARITY_TIERS);
  for (const candidate of requested) {
    if (knownTiers.has(candidate)) {
      continue;
    }
    throw new Error(
      `--runtime-parity-tier must be one of ${QA_RUNTIME_PARITY_TIERS.join(", ")}, got "${candidate}".`,
    );
  }

  // Every entry was checked against the known tiers above.
  return [...requested] as QaRuntimeParityTier[];
}
|
||||
|
||||
/**
 * Adds the scenarios tagged with any of the requested runtime parity tiers to
 * an existing scenario id list.
 *
 * @param params.scenarioIds - Scenario ids that were already selected.
 * @param params.runtimeParityTiers - Tiers whose scenarios should be added.
 * @returns `params.scenarioIds` itself when no tier was requested; otherwise a
 *   new de-duplicated list with the already-selected ids first, followed by
 *   the matching scenario ids in the order the scenario pack lists them.
 * @throws Error when tiers were requested but no scenario in the pack carries
 *   any of them.
 */
function resolveQaRuntimeParityTierScenarioIds(params: {
  scenarioIds: string[];
  runtimeParityTiers: readonly QaRuntimeParityTier[];
}): string[] {
  const { scenarioIds, runtimeParityTiers } = params;
  if (runtimeParityTiers.length === 0) {
    return scenarioIds;
  }

  const requestedTiers = new Set(runtimeParityTiers);
  // Seeding with the already-selected ids keeps them ahead of the tier matches.
  const combinedIds = new Set(scenarioIds);
  let matchCount = 0;
  for (const scenario of readQaScenarioPack().scenarios) {
    const tier = scenario.runtimeParityTier;
    if (tier && requestedTiers.has(tier)) {
      matchCount += 1;
      combinedIds.add(scenario.id);
    }
  }

  if (matchCount === 0) {
    throw new Error(
      `--runtime-parity-tier matched no scenarios for ${params.runtimeParityTiers.join(", ")}.`,
    );
  }
  return [...combinedIds];
}
|
||||
|
||||
async function readQaFailedScenarioCountFromSummary(summaryPath: string) {
|
||||
let summaryText: string;
|
||||
try {
|
||||
@@ -513,17 +558,23 @@ export async function runQaSuiteCommand(opts: {
|
||||
disk?: string;
|
||||
preflight?: boolean;
|
||||
runtimePair?: string;
|
||||
runtimeParityTier?: string[];
|
||||
}) {
|
||||
const repoRoot = path.resolve(opts.repoRoot ?? process.cwd());
|
||||
const transportId = normalizeQaTransportId(opts.transportId);
|
||||
const runner = (opts.runner ?? "host").trim().toLowerCase();
|
||||
const scenarioIds = resolveQaScenarioPackScenarioIds({
|
||||
const explicitScenarioIds = resolveQaScenarioPackScenarioIds({
|
||||
pack: opts.pack,
|
||||
scenarioIds: resolveQaParityPackScenarioIds({
|
||||
parityPack: opts.parityPack,
|
||||
scenarioIds: opts.scenarioIds,
|
||||
}),
|
||||
});
|
||||
const runtimeParityTiers = parseQaRuntimeParityTierFilters(opts.runtimeParityTier);
|
||||
const scenarioIds = resolveQaRuntimeParityTierScenarioIds({
|
||||
scenarioIds: explicitScenarioIds,
|
||||
runtimeParityTiers,
|
||||
});
|
||||
const allowFailures = opts.allowFailures === true;
|
||||
if (runner !== "host" && runner !== "multipass") {
|
||||
throw new Error(`--runner must be one of host or multipass, got "${opts.runner}".`);
|
||||
|
||||
@@ -576,6 +576,22 @@ describe("qa cli registration", () => {
|
||||
expect(options.pack).toBe("personal-agent");
|
||||
});
|
||||
|
||||
it("forwards --runtime-parity-tier for suite runs", async () => {
  // The flag is given twice; the second occurrence carries a comma-separated value.
  const argv = [
    "node",
    "openclaw",
    "qa",
    "suite",
    "--runtime-parity-tier",
    "standard",
    "--runtime-parity-tier",
    "optional,soak",
  ];

  await program.parseAsync(argv);

  // The CLI layer is expected to pass the raw values through unsplit.
  const forwarded = requireQaSuiteOptions();
  expect(forwarded.runtimeParityTier).toEqual(["standard", "optional,soak"]);
});
|
||||
|
||||
it("routes credential add flags into the qa runtime command", async () => {
|
||||
await program.parseAsync([
|
||||
"node",
|
||||
|
||||
@@ -51,6 +51,7 @@ async function runQaSuite(opts: {
|
||||
disk?: string;
|
||||
preflight?: boolean;
|
||||
runtimePair?: string;
|
||||
runtimeParityTier?: string[];
|
||||
}) {
|
||||
const runtime = await loadQaLabCliRuntime();
|
||||
await runtime.runQaSuiteCommand(opts);
|
||||
@@ -286,6 +287,12 @@ export function registerQaLabCli(program: Command) {
|
||||
.option("--memory <size>", "Multipass memory size")
|
||||
.option("--disk <size>", "Multipass disk size")
|
||||
.option("--runtime-pair <pair>", "Run each scenario under both runtimes, e.g. pi,codex")
|
||||
.option(
|
||||
"--runtime-parity-tier <tier>",
|
||||
"Add scenarios tagged with runtimeParityTier (standard, optional, live-only, soak; repeatable or comma-separated)",
|
||||
collectString,
|
||||
[],
|
||||
)
|
||||
.action(
|
||||
async (opts: {
|
||||
repoRoot?: string;
|
||||
@@ -310,6 +317,7 @@ export function registerQaLabCli(program: Command) {
|
||||
disk?: string;
|
||||
preflight?: boolean;
|
||||
runtimePair?: string;
|
||||
runtimeParityTier?: string[];
|
||||
}) => {
|
||||
await runQaSuite({
|
||||
repoRoot: opts.repoRoot,
|
||||
@@ -334,6 +342,7 @@ export function registerQaLabCli(program: Command) {
|
||||
disk: opts.disk,
|
||||
preflight: opts.preflight,
|
||||
runtimePair: opts.runtimePair,
|
||||
runtimeParityTier: opts.runtimeParityTier,
|
||||
});
|
||||
},
|
||||
);
|
||||
|
||||
@@ -93,7 +93,8 @@ const qaScenarioGatewayRuntimeSchema = z.object({
|
||||
forwardHostHome: z.boolean().optional(),
|
||||
});
|
||||
|
||||
const qaRuntimeParityTierSchema = z.enum(["standard", "optional", "live-only", "soak"]);
|
||||
export const QA_RUNTIME_PARITY_TIERS = ["standard", "optional", "live-only", "soak"] as const;
|
||||
const qaRuntimeParityTierSchema = z.enum(QA_RUNTIME_PARITY_TIERS);
|
||||
|
||||
const qaFlowCallActionSchema = z.object({
|
||||
call: z.string().trim().min(1),
|
||||
|
||||
@@ -25,6 +25,18 @@ Coverage tracking:
|
||||
- treat the old `coverage: ["id"]` / `coverage: - id` list shape as invalid
|
||||
- keep source-path tracking in the report, not in the scenario schema
|
||||
|
||||
Runtime parity tiers:
|
||||
|
||||
- `standard`: required Codex-vs-Pi mock gate coverage for first-hour depth and
|
||||
default runtime-tool fixtures; selected with
|
||||
`openclaw qa suite --runtime-pair pi,codex --runtime-parity-tier standard`
|
||||
- `optional`: profile-, plugin-, or external-service-dependent runtime-tool
|
||||
fixtures that stay out of the default release gate
|
||||
- `live-only`: scenarios that need real provider/runtime behavior rather than
|
||||
mock-openai fixtures
|
||||
- `soak`: long-running scheduled or Testbox lanes such as the 100-turn parity
|
||||
soak
|
||||
|
||||
Theme directories:
|
||||
|
||||
- `agents/` - agent behavior, instructions, subagent flows, and persisted child-link regressions
|
||||
|
||||
Reference in New Issue
Block a user