From efb7e4742fa67da81959cd54a224c324577fd128 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 22 May 2026 00:51:32 +0800 Subject: [PATCH] test(qa-lab): trace scenario issue evidence --- CHANGELOG.md | 1 + .../qa-lab/src/scenario-catalog.test.ts | 25 +++++++++++++ extensions/qa-lab/src/scenario-catalog.ts | 36 +++++++++++++++++++ qa/scenarios/index.md | 7 ++-- .../codex-pi-shaped-read-vocabulary.md | 4 +++ qa/scenarios/runtime/first-hour-20-turn.md | 5 +++ qa/scenarios/runtime/soak-100-turn.md | 5 +++ qa/scenarios/runtime/tools/apply-patch.md | 3 ++ qa/scenarios/runtime/tools/fs-read.md | 3 ++ 9 files changed, 87 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b02bfe2e2c8..118c6eb0872e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ Docs: https://docs.openclaw.ai - QA-Lab: add curated mock JSONL replay fixtures and first-drift reporting for runtime-parity audits. (#80323, refs #80176) Thanks @100yenadmin. - QA-Lab: include the optional 100-turn runtime parity soak in release-soak artifacts so long-run Codex/Pi transcript drift stays visible outside the default gate. (#80395) Thanks @100yenadmin. - QA-Lab: add a personal-agent failure recovery scenario that checks honest partial status, retry boundaries, and local recovery artifacts. (#83872) Thanks @iFiras-Max1. +- QA-Lab: add GitHub issue evidence metadata to audited runtime scenarios so parity and tool-fixture coverage links back to the source threads. - Tests/perf: isolate doctor core health check unit coverage from real skills/workspace discovery so `doctor-core-checks` no longer dominates unit perf while keeping one real skills-readiness smoke. (#84493) Thanks @frankekn. 
### Fixes
diff --git a/extensions/qa-lab/src/scenario-catalog.test.ts b/extensions/qa-lab/src/scenario-catalog.test.ts
index 9d9e49e4b00d..ecbe4a10da57 100644
--- a/extensions/qa-lab/src/scenario-catalog.test.ts
+++ b/extensions/qa-lab/src/scenario-catalog.test.ts
@@ -108,14 +108,39 @@ describe("qa scenario catalog", () => {
     const soak = readQaScenarioById("runtime-soak-100-turn");
 
     expect(firstHour.runtimeParityTier).toBe("standard");
+    expect(firstHour.evidence?.github).toContain(
+      "https://github.com/openclaw/openclaw/issues/80364",
+    );
     expect(readQaScenarioExecutionConfig(firstHour.id)).toMatchObject({
       runtimeParityComparison: "outcome-only",
       turnCount: 20,
     });
     expect(soak.runtimeParityTier).toBe("soak");
+    expect(soak.evidence?.github).toContain(
+      "https://github.com/openclaw/openclaw/issues/80395",
+    );
     expect(readQaScenarioExecutionConfig(soak.id)).toMatchObject({ turnCount: 100 });
   });
 
+  it("loads audited GitHub evidence metadata from scenario markdown", () => {
+    const pack = readQaScenarioPack();
+    const scenariosWithEvidence = pack.scenarios.filter(
+      (scenario) => (scenario.evidence?.github?.length ?? 0) > 0,
+    );
+    const evidenceUrls = scenariosWithEvidence.flatMap(
+      (scenario) => scenario.evidence?.github ?? [],
+    );
+
+    expect(scenariosWithEvidence.map((scenario) => scenario.id)).toContain(
+      "codex-pi-shaped-read-vocabulary",
+    );
+    expect(evidenceUrls).toContain("https://github.com/openclaw/openclaw/pull/80323");
+    expect(evidenceUrls).toContain("https://github.com/openclaw/openclaw/issues/80312");
+    for (const url of evidenceUrls) {
+      expect(url).toMatch(/^https:\/\/github\.com\/openclaw\/openclaw\/(?:issues|pull)\/\d+$/);
+    }
+  });
+
   it("loads runtime tool fixture metadata for standard and optional lanes", () => {
     const applyPatch = readQaScenarioById("runtime-tool-apply-patch");
     const messageTool = readQaScenarioById("runtime-tool-message-tool");
diff --git a/extensions/qa-lab/src/scenario-catalog.ts b/extensions/qa-lab/src/scenario-catalog.ts
index ab0a82c05934..f3eed7243826 100644
--- a/extensions/qa-lab/src/scenario-catalog.ts
+++ b/extensions/qa-lab/src/scenario-catalog.ts
@@ -93,6 +93,41 @@ const qaScenarioGatewayRuntimeSchema = z.object({
   forwardHostHome: z.boolean().optional(),
 });
 
+/**
+ * Reports whether `value` is a canonical openclaw/openclaw issue or PR link.
+ *
+ * The whole string is matched, not just a parsed hostname and pathname, so
+ * `http:` links, credentials, ports, query strings, and `#fragment` suffixes
+ * are all rejected; only `https://github.com/openclaw/openclaw/...` passes.
+ */
+function isOpenClawGitHubIssueOrPullUrl(value: string): boolean {
+  const canonical = /^https:\/\/github\.com\/openclaw\/openclaw\/(?:issues|pull)\/[1-9]\d*$/;
+  return canonical.test(value);
+}
+
+const qaScenarioEvidenceGithubUrlSchema = z
+  .string()
+  .trim()
+  .url()
+  .refine(isOpenClawGitHubIssueOrPullUrl, {
+    message: "evidence.github entries must be openclaw/openclaw issue or PR URLs",
+  });
+
+const qaScenarioEvidenceSchema = z
+  .object({
+    github: z.array(qaScenarioEvidenceGithubUrlSchema).min(1).optional(),
+  })
+  .superRefine((evidence, ctx) => {
+    if (evidence.github?.length) {
+      return;
+    }
+    ctx.addIssue({
+      code: z.ZodIssueCode.custom,
+      path: ["github"],
+      message: "evidence.github must include at least one URL",
+    });
+  });
+
 export const QA_RUNTIME_PARITY_TIERS = ["standard", "optional", "live-only", "soak"] as const;
 const qaRuntimeParityTierSchema =
z.enum(QA_RUNTIME_PARITY_TIERS); @@ -181,6 +216,7 @@ const qaSeedScenarioSchema = z.object({ category: z.string().trim().min(1).optional(), runtimeParityTier: qaRuntimeParityTierSchema.optional(), coverage: qaScenarioCoverageSchema.optional(), + evidence: qaScenarioEvidenceSchema.optional(), surfaces: z.array(z.string().trim().min(1)).min(1).optional(), risk: z.enum(["low", "medium", "high"]).optional(), capabilities: z.array(z.string().trim().min(1)).optional(), diff --git a/qa/scenarios/index.md b/qa/scenarios/index.md index 8f0ee210377f..03d9869d3564 100644 --- a/qa/scenarios/index.md +++ b/qa/scenarios/index.md @@ -5,8 +5,8 @@ Single source of truth for repo-backed QA suite bootstrap data. - `index.md` defines pack-level bootstrap data - each nested `*.md` scenario defines one runnable test via `qa-scenario` + `qa-flow` -- scenario markdown may also define coverage IDs, category metadata, required plugins, - lane filters, runtime parity tiers, and gateway config patching +- scenario markdown may also define coverage IDs, evidence links, category metadata, + required plugins, lane filters, runtime parity tiers, and gateway config patching - kickoff mission - QA operator identity @@ -20,6 +20,9 @@ Coverage tracking: - prefer reusing an existing feature ID over minting a scenario-shaped ID - avoid copying the scenario title into coverage IDs - use `pnpm openclaw qa coverage` to render the current inventory +- use `evidence.github` for full `https://github.com/openclaw/openclaw/issues/<number>` or + `https://github.com/openclaw/openclaw/pull/<number>` links when a scenario directly protects + a reported regression, RFC, or accepted PR behavior - use `runtimeParityTier` for runtime-pair gate membership: `standard`, `optional`, `live-only`, or `soak` - treat the old `coverage: ["id"]` / `coverage: - id` list shape as invalid diff --git a/qa/scenarios/runtime/codex-pi-shaped-read-vocabulary.md b/qa/scenarios/runtime/codex-pi-shaped-read-vocabulary.md index c44a227bafcd..cd6d486ef7b9 
100644 --- a/qa/scenarios/runtime/codex-pi-shaped-read-vocabulary.md +++ b/qa/scenarios/runtime/codex-pi-shaped-read-vocabulary.md @@ -11,6 +11,10 @@ coverage: secondary: - runtime.prompt-compatibility - tools.fs.read +evidence: + github: + - https://github.com/openclaw/openclaw/pull/80323 + - https://github.com/openclaw/openclaw/issues/81734 objective: Verify Codex-mode agents can satisfy legacy Pi-shaped "Read tool" wording through the native Codex workspace-read capability instead of stopping because duplicate OpenClaw dynamic read is intentionally filtered. successCriteria: - Agent reads the seeded workspace file and replies with the exact marker line. diff --git a/qa/scenarios/runtime/first-hour-20-turn.md b/qa/scenarios/runtime/first-hour-20-turn.md index d9041c45eecb..0e59e20774c0 100644 --- a/qa/scenarios/runtime/first-hour-20-turn.md +++ b/qa/scenarios/runtime/first-hour-20-turn.md @@ -10,6 +10,11 @@ coverage: - runtime.first-hour-20 secondary: - runtime.long-context +evidence: + github: + - https://github.com/openclaw/openclaw/issues/80171 + - https://github.com/openclaw/openclaw/issues/80337 + - https://github.com/openclaw/openclaw/issues/80364 objective: Verify both runtimes preserve a same-session conversation across the required 20-turn maintainer gate. successCriteria: - The same QA session accepts 20 sequential user turns. diff --git a/qa/scenarios/runtime/soak-100-turn.md b/qa/scenarios/runtime/soak-100-turn.md index d4f4caa8b806..d0600da5ae9f 100644 --- a/qa/scenarios/runtime/soak-100-turn.md +++ b/qa/scenarios/runtime/soak-100-turn.md @@ -10,6 +10,11 @@ coverage: - runtime.soak-100 secondary: - runtime.long-context +evidence: + github: + - https://github.com/openclaw/openclaw/issues/80171 + - https://github.com/openclaw/openclaw/issues/80338 + - https://github.com/openclaw/openclaw/issues/80395 objective: Provide an optional long-run soak that can be scheduled or run in Testbox without entering the maintainer default gate. 
successCriteria: - The same QA session accepts 100 sequential user turns. diff --git a/qa/scenarios/runtime/tools/apply-patch.md b/qa/scenarios/runtime/tools/apply-patch.md index 3ce8c3fd5f3d..e526aa5df6e5 100644 --- a/qa/scenarios/runtime/tools/apply-patch.md +++ b/qa/scenarios/runtime/tools/apply-patch.md @@ -8,6 +8,9 @@ runtimeParityTier: standard coverage: primary: - tools.apply-patch +evidence: + github: + - https://github.com/openclaw/openclaw/issues/80320 objective: Verify apply_patch behavior is tracked across Pi and Codex while Codex owns patching natively. successCriteria: - Pi may expose OpenClaw apply_patch while Codex app-server mode may omit duplicate OpenClaw dynamic apply_patch. diff --git a/qa/scenarios/runtime/tools/fs-read.md b/qa/scenarios/runtime/tools/fs-read.md index 24d2ca03c1a9..e1344292fc9f 100644 --- a/qa/scenarios/runtime/tools/fs-read.md +++ b/qa/scenarios/runtime/tools/fs-read.md @@ -8,6 +8,9 @@ runtimeParityTier: standard coverage: primary: - tools.fs.read +evidence: + github: + - https://github.com/openclaw/openclaw/issues/80312 objective: Verify file read behavior is tracked across Pi and Codex while Codex owns read natively. successCriteria: - Pi may expose OpenClaw read while Codex app-server mode may omit duplicate OpenClaw dynamic read.