From efb7e4742fa67da81959cd54a224c324577fd128 Mon Sep 17 00:00:00 2001
From: Vincent Koc <vincentkoc@ieee.org>
Date: Fri, 22 May 2026 00:51:32 +0800
Subject: [PATCH] test(qa-lab): trace scenario issue evidence

---
 CHANGELOG.md                                  |  1 +
 .../qa-lab/src/scenario-catalog.test.ts       | 25 +++++++++++++
 extensions/qa-lab/src/scenario-catalog.ts     | 36 +++++++++++++++++++
 qa/scenarios/index.md                         |  7 ++--
 .../codex-pi-shaped-read-vocabulary.md        |  4 +++
 qa/scenarios/runtime/first-hour-20-turn.md    |  5 +++
 qa/scenarios/runtime/soak-100-turn.md         |  5 +++
 qa/scenarios/runtime/tools/apply-patch.md     |  3 ++
 qa/scenarios/runtime/tools/fs-read.md         |  3 ++
 9 files changed, 87 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7b02bfe2e2c8..118c6eb0872e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@ Docs: https://docs.openclaw.ai
 - QA-Lab: add curated mock JSONL replay fixtures and first-drift reporting for runtime-parity audits. (#80323, refs #80176) Thanks @100yenadmin.
 - QA-Lab: include the optional 100-turn runtime parity soak in release-soak artifacts so long-run Codex/Pi transcript drift stays visible outside the default gate. (#80395) Thanks @100yenadmin.
 - QA-Lab: add a personal-agent failure recovery scenario that checks honest partial status, retry boundaries, and local recovery artifacts. (#83872) Thanks @iFiras-Max1.
+- QA-Lab: add GitHub issue evidence metadata to audited runtime scenarios so parity and tool-fixture coverage links back to the source threads.
 - Tests/perf: isolate doctor core health check unit coverage from real skills/workspace discovery so `doctor-core-checks` no longer dominates unit perf while keeping one real skills-readiness smoke. (#84493) Thanks @frankekn.
 
 ### Fixes
diff --git a/extensions/qa-lab/src/scenario-catalog.test.ts b/extensions/qa-lab/src/scenario-catalog.test.ts
index 9d9e49e4b00d..ecbe4a10da57 100644
--- a/extensions/qa-lab/src/scenario-catalog.test.ts
+++ b/extensions/qa-lab/src/scenario-catalog.test.ts
@@ -108,14 +108,39 @@ describe("qa scenario catalog", () => {
     const soak = readQaScenarioById("runtime-soak-100-turn");
 
     expect(firstHour.runtimeParityTier).toBe("standard");
+    expect(firstHour.evidence?.github).toContain(
+      "https://github.com/openclaw/openclaw/issues/80364",
+    );
     expect(readQaScenarioExecutionConfig(firstHour.id)).toMatchObject({
       runtimeParityComparison: "outcome-only",
       turnCount: 20,
     });
     expect(soak.runtimeParityTier).toBe("soak");
+    expect(soak.evidence?.github).toContain(
+      "https://github.com/openclaw/openclaw/issues/80395",
+    );
     expect(readQaScenarioExecutionConfig(soak.id)).toMatchObject({ turnCount: 100 });
   });
 
+  // Checks that audited evidence links load and are canonical openclaw issue/PR URLs.
+  it("loads audited GitHub evidence metadata from scenario markdown", () => {
+    const canonicalLink = /^https:\/\/github\.com\/openclaw\/openclaw\/(?:issues|pull)\/\d+$/;
+    const idsWithEvidence: string[] = [];
+    const links: string[] = [];
+    for (const { id, evidence } of readQaScenarioPack().scenarios) {
+      const github = evidence?.github ?? [];
+      if (github.length > 0) {
+        idsWithEvidence.push(id);
+        links.push(...github);
+      }
+    }
+
+    expect(idsWithEvidence).toContain("codex-pi-shaped-read-vocabulary");
+    expect(links).toContain("https://github.com/openclaw/openclaw/pull/80323");
+    expect(links).toContain("https://github.com/openclaw/openclaw/issues/80312");
+    links.forEach((link) => expect(link).toMatch(canonicalLink));
+  });
+
   it("loads runtime tool fixture metadata for standard and optional lanes", () => {
     const applyPatch = readQaScenarioById("runtime-tool-apply-patch");
     const messageTool = readQaScenarioById("runtime-tool-message-tool");
diff --git a/extensions/qa-lab/src/scenario-catalog.ts b/extensions/qa-lab/src/scenario-catalog.ts
index ab0a82c05934..f3eed7243826 100644
--- a/extensions/qa-lab/src/scenario-catalog.ts
+++ b/extensions/qa-lab/src/scenario-catalog.ts
@@ -93,6 +93,41 @@ const qaScenarioGatewayRuntimeSchema = z.object({
   forwardHostHome: z.boolean().optional(),
 });
 
+/**
+ * Returns true only for a canonical `https://github.com/openclaw/openclaw/issues/<n>`
+ * or `https://github.com/openclaw/openclaw/pull/<n>` link with a positive number.
+ *
+ * The whole string is matched, so `http:`, explicit ports, userinfo, query
+ * strings, fragments, and any other host or repository are all rejected.
+ */
+function isOpenClawGitHubIssueOrPullUrl(value: string): boolean {
+  const canonical = /^https:\/\/github\.com\/openclaw\/openclaw\/(?:issues|pull)\/[1-9]\d*$/;
+  return canonical.test(value);
+}
+
+// One `evidence.github` entry. Input is trimmed first, then must be URL-shaped and
+// pass the openclaw/openclaw issue-or-PR check; a failure of that check reports the
+// message below.
+const qaScenarioEvidenceGithubUrlSchema = z.string().trim().url().refine(
+  (entry) => isOpenClawGitHubIssueOrPullUrl(entry),
+  { message: "evidence.github entries must be openclaw/openclaw issue or PR URLs" },
+);
+
+// `evidence` block: `github` is optional in the type, but must be non-empty once parsed.
+const qaScenarioEvidenceSchema = z
+  .object({
+    github: z.array(qaScenarioEvidenceGithubUrlSchema).min(1).optional(),
+  })
+  .superRefine(({ github }, ctx) => {
+    if (!github?.length) {
+      ctx.addIssue({
+        code: z.ZodIssueCode.custom,
+        path: ["github"],
+        message: "evidence.github must include at least one URL",
+      });
+    }
+  });
+
 export const QA_RUNTIME_PARITY_TIERS = ["standard", "optional", "live-only", "soak"] as const;
 const qaRuntimeParityTierSchema = z.enum(QA_RUNTIME_PARITY_TIERS);
 
@@ -181,6 +216,7 @@ const qaSeedScenarioSchema = z.object({
   category: z.string().trim().min(1).optional(),
   runtimeParityTier: qaRuntimeParityTierSchema.optional(),
   coverage: qaScenarioCoverageSchema.optional(),
+  evidence: qaScenarioEvidenceSchema.optional(),
   surfaces: z.array(z.string().trim().min(1)).min(1).optional(),
   risk: z.enum(["low", "medium", "high"]).optional(),
   capabilities: z.array(z.string().trim().min(1)).optional(),
diff --git a/qa/scenarios/index.md b/qa/scenarios/index.md
index 8f0ee210377f..03d9869d3564 100644
--- a/qa/scenarios/index.md
+++ b/qa/scenarios/index.md
@@ -5,8 +5,8 @@ Single source of truth for repo-backed QA suite bootstrap data.
 
 - `index.md` defines pack-level bootstrap data
 - each nested `*.md` scenario defines one runnable test via `qa-scenario` + `qa-flow`
-- scenario markdown may also define coverage IDs, category metadata, required plugins,
-  lane filters, runtime parity tiers, and gateway config patching
+- scenario markdown may also define coverage IDs, evidence links, category metadata,
+  required plugins, lane filters, runtime parity tiers, and gateway config patching
 
 - kickoff mission
 - QA operator identity
@@ -20,6 +20,9 @@ Coverage tracking:
 - prefer reusing an existing feature ID over minting a scenario-shaped ID
 - avoid copying the scenario title into coverage IDs
 - use `pnpm openclaw qa coverage` to render the current inventory
+- use `evidence.github` for full `https://github.com/openclaw/openclaw/issues/<n>` or
+  `https://github.com/openclaw/openclaw/pull/<n>` links when a scenario directly protects
+  a reported regression, RFC, or accepted PR behavior
 - use `runtimeParityTier` for runtime-pair gate membership: `standard`,
   `optional`, `live-only`, or `soak`
 - treat the old `coverage: ["id"]` / `coverage: - id` list shape as invalid
diff --git a/qa/scenarios/runtime/codex-pi-shaped-read-vocabulary.md b/qa/scenarios/runtime/codex-pi-shaped-read-vocabulary.md
index c44a227bafcd..cd6d486ef7b9 100644
--- a/qa/scenarios/runtime/codex-pi-shaped-read-vocabulary.md
+++ b/qa/scenarios/runtime/codex-pi-shaped-read-vocabulary.md
@@ -11,6 +11,10 @@ coverage:
   secondary:
     - runtime.prompt-compatibility
     - tools.fs.read
+evidence:
+  github:
+    - https://github.com/openclaw/openclaw/pull/80323
+    - https://github.com/openclaw/openclaw/issues/81734
 objective: Verify Codex-mode agents can satisfy legacy Pi-shaped "Read tool" wording through the native Codex workspace-read capability instead of stopping because duplicate OpenClaw dynamic read is intentionally filtered.
 successCriteria:
   - Agent reads the seeded workspace file and replies with the exact marker line.
diff --git a/qa/scenarios/runtime/first-hour-20-turn.md b/qa/scenarios/runtime/first-hour-20-turn.md
index d9041c45eecb..0e59e20774c0 100644
--- a/qa/scenarios/runtime/first-hour-20-turn.md
+++ b/qa/scenarios/runtime/first-hour-20-turn.md
@@ -10,6 +10,11 @@ coverage:
     - runtime.first-hour-20
   secondary:
     - runtime.long-context
+evidence:
+  github:
+    - https://github.com/openclaw/openclaw/issues/80171
+    - https://github.com/openclaw/openclaw/issues/80337
+    - https://github.com/openclaw/openclaw/issues/80364
 objective: Verify both runtimes preserve a same-session conversation across the required 20-turn maintainer gate.
 successCriteria:
   - The same QA session accepts 20 sequential user turns.
diff --git a/qa/scenarios/runtime/soak-100-turn.md b/qa/scenarios/runtime/soak-100-turn.md
index d4f4caa8b806..d0600da5ae9f 100644
--- a/qa/scenarios/runtime/soak-100-turn.md
+++ b/qa/scenarios/runtime/soak-100-turn.md
@@ -10,6 +10,11 @@ coverage:
     - runtime.soak-100
   secondary:
     - runtime.long-context
+evidence:
+  github:
+    - https://github.com/openclaw/openclaw/issues/80171
+    - https://github.com/openclaw/openclaw/issues/80338
+    - https://github.com/openclaw/openclaw/issues/80395
 objective: Provide an optional long-run soak that can be scheduled or run in Testbox without entering the maintainer default gate.
 successCriteria:
   - The same QA session accepts 100 sequential user turns.
diff --git a/qa/scenarios/runtime/tools/apply-patch.md b/qa/scenarios/runtime/tools/apply-patch.md
index 3ce8c3fd5f3d..e526aa5df6e5 100644
--- a/qa/scenarios/runtime/tools/apply-patch.md
+++ b/qa/scenarios/runtime/tools/apply-patch.md
@@ -8,6 +8,9 @@ runtimeParityTier: standard
 coverage:
   primary:
     - tools.apply-patch
+evidence:
+  github:
+    - https://github.com/openclaw/openclaw/issues/80320
 objective: Verify apply_patch behavior is tracked across Pi and Codex while Codex owns patching natively.
 successCriteria:
   - Pi may expose OpenClaw apply_patch while Codex app-server mode may omit duplicate OpenClaw dynamic apply_patch.
diff --git a/qa/scenarios/runtime/tools/fs-read.md b/qa/scenarios/runtime/tools/fs-read.md
index 24d2ca03c1a9..e1344292fc9f 100644
--- a/qa/scenarios/runtime/tools/fs-read.md
+++ b/qa/scenarios/runtime/tools/fs-read.md
@@ -8,6 +8,9 @@ runtimeParityTier: standard
 coverage:
   primary:
     - tools.fs.read
+evidence:
+  github:
+    - https://github.com/openclaw/openclaw/issues/80312
 objective: Verify file read behavior is tracked across Pi and Codex while Codex owns read natively.
 successCriteria:
   - Pi may expose OpenClaw read while Codex app-server mode may omit duplicate OpenClaw dynamic read.