diff --git a/CHANGELOG.md b/CHANGELOG.md index 99228af47b8f..ea08dc5c48ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,6 @@ Docs: https://docs.openclaw.ai - QA-Lab: add curated mock JSONL replay fixtures and first-drift reporting for runtime-parity audits. (#80323, refs #80176) Thanks @100yenadmin. - QA-Lab: include the optional 100-turn runtime parity soak in release-soak artifacts so long-run Codex/Pi transcript drift stays visible outside the default gate. (#80395) Thanks @100yenadmin. - QA-Lab: add a personal-agent failure recovery scenario that checks honest partial status, retry boundaries, and local recovery artifacts. (#83872) Thanks @iFiras-Max1. -- QA-Lab: add GitHub issue evidence metadata to audited runtime scenarios so parity and tool-fixture coverage links back to the source threads. - QA-Lab: include an opt-in `update.run` package self-upgrade sentinel for destructive latest-package recovery checks. - Tests/perf: isolate doctor core health check unit coverage from real skills/workspace discovery so `doctor-core-checks` no longer dominates unit perf while keeping one real skills-readiness smoke. (#84493) Thanks @frankekn. diff --git a/extensions/qa-lab/src/scenario-catalog.test.ts b/extensions/qa-lab/src/scenario-catalog.test.ts index dc711ca648ef..82e94bcf4cd0 100644 --- a/extensions/qa-lab/src/scenario-catalog.test.ts +++ b/extensions/qa-lab/src/scenario-catalog.test.ts @@ -108,39 +108,14 @@ describe("qa scenario catalog", () => { const soak = readQaScenarioById("runtime-soak-100-turn"); expect(firstHour.runtimeParityTier).toBe("standard"); - expect(firstHour.evidence?.github).toContain( - "https://github.com/openclaw/openclaw/issues/80364", - ); expect(readQaScenarioExecutionConfig(firstHour.id)).toMatchObject({ runtimeParityComparison: "outcome-only", turnCount: 20, }); expect(soak.runtimeParityTier).toBe("soak"); - expect(soak.evidence?.github).toContain( - "https://github.com/openclaw/openclaw/issues/80395", - ); expect(readQaScenarioExecutionConfig(soak.id)).toMatchObject({ turnCount: 100 }); }); - it("loads audited GitHub evidence metadata from scenario markdown", () => { - const pack = readQaScenarioPack(); - const scenariosWithEvidence = pack.scenarios.filter( - (scenario) => (scenario.evidence?.github?.length ?? 0) > 0, - ); - const evidenceUrls = scenariosWithEvidence.flatMap( - (scenario) => scenario.evidence?.github ?? [], - ); - - expect(scenariosWithEvidence.map((scenario) => scenario.id)).toContain( - "codex-pi-shaped-read-vocabulary", - ); - expect(evidenceUrls).toContain("https://github.com/openclaw/openclaw/pull/80323"); - expect(evidenceUrls).toContain("https://github.com/openclaw/openclaw/issues/80312"); - for (const url of evidenceUrls) { - expect(url).toMatch(/^https:\/\/github\.com\/openclaw\/openclaw\/(?:issues|pull)\/\d+$/); - } - }); - it("loads runtime tool fixture metadata for standard and optional lanes", () => { const applyPatch = readQaScenarioById("runtime-tool-apply-patch"); const messageTool = readQaScenarioById("runtime-tool-message-tool"); diff --git a/extensions/qa-lab/src/scenario-catalog.ts b/extensions/qa-lab/src/scenario-catalog.ts index f3eed7243826..ab0a82c05934 100644 --- a/extensions/qa-lab/src/scenario-catalog.ts +++ b/extensions/qa-lab/src/scenario-catalog.ts @@ -93,41 +93,6 @@ const qaScenarioGatewayRuntimeSchema = z.object({ forwardHostHome: z.boolean().optional(), }); -function isOpenClawGitHubIssueOrPullUrl(value: string): boolean { - try { - const parsed = new URL(value); - return ( - parsed.hostname === "github.com" && - /^\/openclaw\/openclaw\/(?:issues|pull)\/[1-9]\d*$/.test(parsed.pathname) - ); - } catch { - return false; - } -} - -const qaScenarioEvidenceGithubUrlSchema = z - .string() - .trim() - .url() - .refine(isOpenClawGitHubIssueOrPullUrl, { - message: "evidence.github entries must be openclaw/openclaw issue or PR URLs", - }); - -const qaScenarioEvidenceSchema = z - .object({ - github: z.array(qaScenarioEvidenceGithubUrlSchema).min(1).optional(), - }) - .superRefine((evidence, ctx) => { - if (evidence.github?.length) { - return; - } - ctx.addIssue({ - code: z.ZodIssueCode.custom, - path: ["github"], - message: "evidence.github must include at least one URL", - }); - }); - export const QA_RUNTIME_PARITY_TIERS = ["standard", "optional", "live-only", "soak"] as const; const qaRuntimeParityTierSchema = z.enum(QA_RUNTIME_PARITY_TIERS); @@ -216,7 +181,6 @@ const qaSeedScenarioSchema = z.object({ category: z.string().trim().min(1).optional(), runtimeParityTier: qaRuntimeParityTierSchema.optional(), coverage: qaScenarioCoverageSchema.optional(), - evidence: qaScenarioEvidenceSchema.optional(), surfaces: z.array(z.string().trim().min(1)).min(1).optional(), risk: z.enum(["low", "medium", "high"]).optional(), capabilities: z.array(z.string().trim().min(1)).optional(), diff --git a/qa/scenarios/index.md b/qa/scenarios/index.md index 03d9869d3564..8f0ee210377f 100644 --- a/qa/scenarios/index.md +++ b/qa/scenarios/index.md @@ -5,8 +5,8 @@ Single source of truth for repo-backed QA suite bootstrap data. - `index.md` defines pack-level bootstrap data - each nested `*.md` scenario defines one runnable test via `qa-scenario` + `qa-flow` -- scenario markdown may also define coverage IDs, evidence links, category metadata, - required plugins, lane filters, runtime parity tiers, and gateway config patching +- scenario markdown may also define coverage IDs, category metadata, required plugins, + lane filters, runtime parity tiers, and gateway config patching - kickoff mission - QA operator identity @@ -20,9 +20,6 @@ Coverage tracking: - prefer reusing an existing feature ID over minting a scenario-shaped ID - avoid copying the scenario title into coverage IDs - use `pnpm openclaw qa coverage` to render the current inventory -- use `evidence.github` for full `https://github.com/openclaw/openclaw/issues/` or - `https://github.com/openclaw/openclaw/pull/` links when a scenario directly protects - a reported regression, RFC, or accepted PR behavior - use `runtimeParityTier` for runtime-pair gate membership: `standard`, `optional`, `live-only`, or `soak` - treat the old `coverage: ["id"]` / `coverage: - id` list shape as invalid diff --git a/qa/scenarios/runtime/codex-pi-shaped-read-vocabulary.md b/qa/scenarios/runtime/codex-pi-shaped-read-vocabulary.md index cd6d486ef7b9..c44a227bafcd 100644 --- a/qa/scenarios/runtime/codex-pi-shaped-read-vocabulary.md +++ b/qa/scenarios/runtime/codex-pi-shaped-read-vocabulary.md @@ -11,10 +11,6 @@ coverage: secondary: - runtime.prompt-compatibility - tools.fs.read -evidence: - github: - - https://github.com/openclaw/openclaw/pull/80323 - - https://github.com/openclaw/openclaw/issues/81734 objective: Verify Codex-mode agents can satisfy legacy Pi-shaped "Read tool" wording through the native Codex workspace-read capability instead of stopping because duplicate OpenClaw dynamic read is intentionally filtered. successCriteria: - Agent reads the seeded workspace file and replies with the exact marker line. diff --git a/qa/scenarios/runtime/first-hour-20-turn.md b/qa/scenarios/runtime/first-hour-20-turn.md index 0e59e20774c0..d9041c45eecb 100644 --- a/qa/scenarios/runtime/first-hour-20-turn.md +++ b/qa/scenarios/runtime/first-hour-20-turn.md @@ -10,11 +10,6 @@ coverage: - runtime.first-hour-20 secondary: - runtime.long-context -evidence: - github: - - https://github.com/openclaw/openclaw/issues/80171 - - https://github.com/openclaw/openclaw/issues/80337 - - https://github.com/openclaw/openclaw/issues/80364 objective: Verify both runtimes preserve a same-session conversation across the required 20-turn maintainer gate. successCriteria: - The same QA session accepts 20 sequential user turns. diff --git a/qa/scenarios/runtime/soak-100-turn.md b/qa/scenarios/runtime/soak-100-turn.md index d0600da5ae9f..d4f4caa8b806 100644 --- a/qa/scenarios/runtime/soak-100-turn.md +++ b/qa/scenarios/runtime/soak-100-turn.md @@ -10,11 +10,6 @@ coverage: - runtime.soak-100 secondary: - runtime.long-context -evidence: - github: - - https://github.com/openclaw/openclaw/issues/80171 - - https://github.com/openclaw/openclaw/issues/80338 - - https://github.com/openclaw/openclaw/issues/80395 objective: Provide an optional long-run soak that can be scheduled or run in Testbox without entering the maintainer default gate. successCriteria: - The same QA session accepts 100 sequential user turns. diff --git a/qa/scenarios/runtime/tools/apply-patch.md b/qa/scenarios/runtime/tools/apply-patch.md index e526aa5df6e5..3ce8c3fd5f3d 100644 --- a/qa/scenarios/runtime/tools/apply-patch.md +++ b/qa/scenarios/runtime/tools/apply-patch.md @@ -8,9 +8,6 @@ runtimeParityTier: standard coverage: primary: - tools.apply-patch -evidence: - github: - - https://github.com/openclaw/openclaw/issues/80320 objective: Verify apply_patch behavior is tracked across Pi and Codex while Codex owns patching natively. successCriteria: - Pi may expose OpenClaw apply_patch while Codex app-server mode may omit duplicate OpenClaw dynamic apply_patch. diff --git a/qa/scenarios/runtime/tools/fs-read.md b/qa/scenarios/runtime/tools/fs-read.md index e1344292fc9f..24d2ca03c1a9 100644 --- a/qa/scenarios/runtime/tools/fs-read.md +++ b/qa/scenarios/runtime/tools/fs-read.md @@ -8,9 +8,6 @@ runtimeParityTier: standard coverage: primary: - tools.fs.read -evidence: - github: - - https://github.com/openclaw/openclaw/issues/80312 objective: Verify file read behavior is tracked across Pi and Codex while Codex owns read natively. successCriteria: - Pi may expose OpenClaw read while Codex app-server mode may omit duplicate OpenClaw dynamic read.