mirror of
https://github.com/openclaw/openclaw.git
synced 2026-06-06 05:51:15 +08:00
test(qa-lab): trace scenario issue evidence
This commit is contained in:
@@ -12,6 +12,7 @@ Docs: https://docs.openclaw.ai
|
||||
- QA-Lab: add curated mock JSONL replay fixtures and first-drift reporting for runtime-parity audits. (#80323, refs #80176) Thanks @100yenadmin.
|
||||
- QA-Lab: include the optional 100-turn runtime parity soak in release-soak artifacts so long-run Codex/Pi transcript drift stays visible outside the default gate. (#80395) Thanks @100yenadmin.
|
||||
- QA-Lab: add a personal-agent failure recovery scenario that checks honest partial status, retry boundaries, and local recovery artifacts. (#83872) Thanks @iFiras-Max1.
|
||||
- QA-Lab: add GitHub issue evidence metadata to audited runtime scenarios so parity and tool-fixture coverage links back to the source threads.
|
||||
- Tests/perf: isolate doctor core health check unit coverage from real skills/workspace discovery so `doctor-core-checks` no longer dominates unit perf while keeping one real skills-readiness smoke. (#84493) Thanks @frankekn.
|
||||
|
||||
### Fixes
|
||||
|
||||
@@ -108,14 +108,39 @@ describe("qa scenario catalog", () => {
|
||||
const soak = readQaScenarioById("runtime-soak-100-turn");
|
||||
|
||||
expect(firstHour.runtimeParityTier).toBe("standard");
|
||||
expect(firstHour.evidence?.github).toContain(
|
||||
"https://github.com/openclaw/openclaw/issues/80364",
|
||||
);
|
||||
expect(readQaScenarioExecutionConfig(firstHour.id)).toMatchObject({
|
||||
runtimeParityComparison: "outcome-only",
|
||||
turnCount: 20,
|
||||
});
|
||||
expect(soak.runtimeParityTier).toBe("soak");
|
||||
expect(soak.evidence?.github).toContain(
|
||||
"https://github.com/openclaw/openclaw/issues/80395",
|
||||
);
|
||||
expect(readQaScenarioExecutionConfig(soak.id)).toMatchObject({ turnCount: 100 });
|
||||
});
|
||||
|
||||
// Catalog-level check: evidence links declared in scenario markdown must
// survive loading and must all point at openclaw/openclaw issues or PRs.
it("loads audited GitHub evidence metadata from scenario markdown", () => {
  const { scenarios } = readQaScenarioPack();

  // Keep only scenarios that declare at least one evidence.github link.
  const evidenced = scenarios.filter((entry) => {
    const linkCount = entry.evidence?.github?.length ?? 0;
    return linkCount > 0;
  });
  const evidencedIds = evidenced.map((entry) => entry.id);
  const evidenceUrls = evidenced.flatMap((entry) => entry.evidence?.github ?? []);

  // Spot-check one known scenario and two known links.
  expect(evidencedIds).toContain("codex-pi-shaped-read-vocabulary");
  expect(evidenceUrls).toContain("https://github.com/openclaw/openclaw/pull/80323");
  expect(evidenceUrls).toContain("https://github.com/openclaw/openclaw/issues/80312");

  // Every collected link must be a bare issue/PR URL in this repository.
  const issueOrPullUrl = /^https:\/\/github\.com\/openclaw\/openclaw\/(?:issues|pull)\/\d+$/;
  for (const link of evidenceUrls) {
    expect(link).toMatch(issueOrPullUrl);
  }
});
|
||||
|
||||
it("loads runtime tool fixture metadata for standard and optional lanes", () => {
|
||||
const applyPatch = readQaScenarioById("runtime-tool-apply-patch");
|
||||
const messageTool = readQaScenarioById("runtime-tool-message-tool");
|
||||
|
||||
@@ -93,6 +93,41 @@ const qaScenarioGatewayRuntimeSchema = z.object({
|
||||
forwardHostHome: z.boolean().optional(),
|
||||
});
|
||||
|
||||
/**
 * Reports whether `value` is a bare issue or pull-request link in the
 * openclaw/openclaw GitHub repository, i.e.
 * `https://github.com/openclaw/openclaw/issues/<n>` or
 * `https://github.com/openclaw/openclaw/pull/<n>` with `<n>` a positive
 * integer without leading zeros.
 *
 * @param value - Candidate URL string.
 * @returns `true` only for an https link with no credentials, no explicit
 *   non-default port, no query string and no fragment; `false` otherwise,
 *   including when `value` cannot be parsed as a URL.
 */
function isOpenClawGitHubIssueOrPullUrl(value: string): boolean {
  let parsed: URL;
  try {
    parsed = new URL(value);
  } catch {
    // Unparseable input is simply not an evidence URL.
    return false;
  }
  if (!/^\/openclaw\/openclaw\/(?:issues|pull)\/[1-9]\d*$/.test(parsed.pathname)) {
    return false;
  }
  // Checking hostname and pathname alone would still admit http:// links,
  // user:pass@ credentials, custom ports, ?query strings and #fragments.
  // Requiring the normalized href to be exactly origin + pathname rejects all
  // of those in one comparison.
  return parsed.href === `https://github.com${parsed.pathname}`;
}
|
||||
|
||||
/**
 * Schema for a single `evidence.github` entry: a trimmed string that is a
 * well-formed URL and passes `isOpenClawGitHubIssueOrPullUrl`.
 */
const qaScenarioEvidenceGithubUrlSchema = z
  .string()
  .trim()
  .url()
  .refine((candidate) => isOpenClawGitHubIssueOrPullUrl(candidate), {
    message: "evidence.github entries must be openclaw/openclaw issue or PR URLs",
  });
|
||||
|
||||
/**
 * Schema for a scenario's `evidence` block.
 *
 * `github` is optional in the object shape, but the refinement below reports
 * a custom issue on the `github` path whenever the list is missing or empty,
 * so an `evidence` block without any link does not validate.
 */
const qaScenarioEvidenceSchema = z
  .object({
    github: z.array(qaScenarioEvidenceGithubUrlSchema).min(1).optional(),
  })
  .superRefine(({ github }, ctx) => {
    const linkCount = github?.length ?? 0;
    if (linkCount === 0) {
      ctx.addIssue({
        code: z.ZodIssueCode.custom,
        path: ["github"],
        message: "evidence.github must include at least one URL",
      });
    }
  });
|
||||
|
||||
/** Allowed runtime parity tier names, kept as a readonly literal tuple so the enum schema below can be built from it. */
export const QA_RUNTIME_PARITY_TIERS = ["standard", "optional", "live-only", "soak"] as const;
|
||||
/** Zod enum that accepts exactly the entries of `QA_RUNTIME_PARITY_TIERS`. */
const qaRuntimeParityTierSchema = z.enum(QA_RUNTIME_PARITY_TIERS);
|
||||
|
||||
@@ -181,6 +216,7 @@ const qaSeedScenarioSchema = z.object({
|
||||
category: z.string().trim().min(1).optional(),
|
||||
runtimeParityTier: qaRuntimeParityTierSchema.optional(),
|
||||
coverage: qaScenarioCoverageSchema.optional(),
|
||||
evidence: qaScenarioEvidenceSchema.optional(),
|
||||
surfaces: z.array(z.string().trim().min(1)).min(1).optional(),
|
||||
risk: z.enum(["low", "medium", "high"]).optional(),
|
||||
capabilities: z.array(z.string().trim().min(1)).optional(),
|
||||
|
||||
@@ -5,8 +5,8 @@ Single source of truth for repo-backed QA suite bootstrap data.
|
||||
|
||||
- `index.md` defines pack-level bootstrap data
|
||||
- each nested `*.md` scenario defines one runnable test via `qa-scenario` + `qa-flow`
|
||||
- scenario markdown may also define coverage IDs, category metadata, required plugins,
|
||||
lane filters, runtime parity tiers, and gateway config patching
|
||||
- scenario markdown may also define coverage IDs, evidence links, category metadata,
|
||||
required plugins, lane filters, runtime parity tiers, and gateway config patching
|
||||
|
||||
- kickoff mission
|
||||
- QA operator identity
|
||||
@@ -20,6 +20,9 @@ Coverage tracking:
|
||||
- prefer reusing an existing feature ID over minting a scenario-shaped ID
|
||||
- avoid copying the scenario title into coverage IDs
|
||||
- use `pnpm openclaw qa coverage` to render the current inventory
|
||||
- use `evidence.github` for full `https://github.com/openclaw/openclaw/issues/<n>` or
|
||||
`https://github.com/openclaw/openclaw/pull/<n>` links when a scenario directly protects
|
||||
a reported regression, RFC, or accepted PR behavior
|
||||
- use `runtimeParityTier` for runtime-pair gate membership: `standard`,
|
||||
`optional`, `live-only`, or `soak`
|
||||
- treat the old `coverage: ["id"]` / `coverage: - id` list shape as invalid
|
||||
|
||||
@@ -11,6 +11,10 @@ coverage:
|
||||
secondary:
|
||||
- runtime.prompt-compatibility
|
||||
- tools.fs.read
|
||||
evidence:
|
||||
github:
|
||||
- https://github.com/openclaw/openclaw/pull/80323
|
||||
- https://github.com/openclaw/openclaw/issues/81734
|
||||
objective: Verify Codex-mode agents can satisfy legacy Pi-shaped "Read tool" wording through the native Codex workspace-read capability instead of stopping because duplicate OpenClaw dynamic read is intentionally filtered.
|
||||
successCriteria:
|
||||
- Agent reads the seeded workspace file and replies with the exact marker line.
|
||||
|
||||
@@ -10,6 +10,11 @@ coverage:
|
||||
- runtime.first-hour-20
|
||||
secondary:
|
||||
- runtime.long-context
|
||||
evidence:
|
||||
github:
|
||||
- https://github.com/openclaw/openclaw/issues/80171
|
||||
- https://github.com/openclaw/openclaw/issues/80337
|
||||
- https://github.com/openclaw/openclaw/issues/80364
|
||||
objective: Verify both runtimes preserve a same-session conversation across the required 20-turn maintainer gate.
|
||||
successCriteria:
|
||||
- The same QA session accepts 20 sequential user turns.
|
||||
|
||||
@@ -10,6 +10,11 @@ coverage:
|
||||
- runtime.soak-100
|
||||
secondary:
|
||||
- runtime.long-context
|
||||
evidence:
|
||||
github:
|
||||
- https://github.com/openclaw/openclaw/issues/80171
|
||||
- https://github.com/openclaw/openclaw/issues/80338
|
||||
- https://github.com/openclaw/openclaw/issues/80395
|
||||
objective: Provide an optional long-run soak that can be scheduled or run in Testbox without entering the maintainer default gate.
|
||||
successCriteria:
|
||||
- The same QA session accepts 100 sequential user turns.
|
||||
|
||||
@@ -8,6 +8,9 @@ runtimeParityTier: standard
|
||||
coverage:
|
||||
primary:
|
||||
- tools.apply-patch
|
||||
evidence:
|
||||
github:
|
||||
- https://github.com/openclaw/openclaw/issues/80320
|
||||
objective: Verify apply_patch behavior is tracked across Pi and Codex while Codex owns patching natively.
|
||||
successCriteria:
|
||||
- Pi may expose OpenClaw apply_patch while Codex app-server mode may omit duplicate OpenClaw dynamic apply_patch.
|
||||
|
||||
@@ -8,6 +8,9 @@ runtimeParityTier: standard
|
||||
coverage:
|
||||
primary:
|
||||
- tools.fs.read
|
||||
evidence:
|
||||
github:
|
||||
- https://github.com/openclaw/openclaw/issues/80312
|
||||
objective: Verify file read behavior is tracked across Pi and Codex while Codex owns read natively.
|
||||
successCriteria:
|
||||
- Pi may expose OpenClaw read while Codex app-server mode may omit duplicate OpenClaw dynamic read.
|
||||
|
||||
Reference in New Issue
Block a user