From 1926982c4c39e23eba82722df84b8215e1850243 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sun, 17 May 2026 23:12:20 +0800 Subject: [PATCH] fix(qa-lab): refresh parity model targets --- .../workflows/control-ui-locale-refresh.yml | 2 +- .../mantis-discord-status-reactions.yml | 4 +- .../workflows/mantis-slack-desktop-smoke.yml | 4 +- .github/workflows/mantis-telegram-live.yml | 2 +- ...nclaw-cross-os-release-checks-reusable.yml | 2 +- .../openclaw-live-and-e2e-checks-reusable.yml | 2 +- .github/workflows/openclaw-performance.yml | 18 ++--- .github/workflows/openclaw-release-checks.yml | 12 +-- .../workflows/qa-live-transports-convex.yml | 14 ++-- CHANGELOG.md | 1 + docs/ci.md | 8 +- docs/concepts/qa-e2e-automation.md | 14 ++-- .../gpt55-codex-agentic-parity-maintainers.md | 10 +-- docs/help/gpt55-codex-agentic-parity.md | 18 ++--- docs/help/testing.md | 4 +- .../qa-lab/src/agentic-parity-report.test.ts | 80 +++++++++---------- .../qa-lab/src/agentic-parity-report.ts | 2 +- extensions/qa-lab/src/character-eval.test.ts | 10 +-- extensions/qa-lab/src/cli.runtime.test.ts | 14 ++-- extensions/qa-lab/src/cli.test.ts | 8 +- .../qa-lab/src/gateway-log-sentinel.test.ts | 2 +- extensions/qa-lab/src/live-timeout.test.ts | 8 +- .../providers/live-frontier/character-eval.ts | 6 +- .../src/providers/live-frontier/parity.ts | 2 +- .../src/providers/mock-openai/server.test.ts | 52 ++++++------ .../src/providers/mock-openai/server.ts | 12 +-- .../src/providers/shared/mock-model-config.ts | 4 +- .../qa-lab/src/qa-gateway-config.test.ts | 4 +- extensions/qa-lab/src/suite-planning.test.ts | 4 +- .../qa-lab/src/suite.summary-json.test.ts | 6 +- .../models/anthropic-opus-api-key-smoke.md | 6 +- .../anthropic-opus-setup-token-smoke.md | 6 +- scripts/openclaw-cross-os-release-checks.ts | 2 +- src/infra/run-node.test.ts | 8 +- .../trigger-handling-test-harness.ts | 12 +-- .../openclaw-cross-os-release-checks.test.ts | 8 +- .../package-acceptance-workflow.test.ts | 2 +- 37 files changed, 187 insertions(+), 186 deletions(-) diff --git a/.github/workflows/control-ui-locale-refresh.yml b/.github/workflows/control-ui-locale-refresh.yml index f7ac7004adb0..4529e86df0f3 100644 --- a/.github/workflows/control-ui-locale-refresh.yml +++ b/.github/workflows/control-ui-locale-refresh.yml @@ -138,7 +138,7 @@ jobs: OPENAI_API_KEY: ${{ secrets.OPENCLAW_DOCS_I18N_OPENAI_API_KEY || secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} OPENCLAW_CONTROL_UI_I18N_PROVIDER: ${{ secrets.ANTHROPIC_API_KEY != '' && 'anthropic' || 'openai' }} - OPENCLAW_CONTROL_UI_I18N_MODEL: ${{ secrets.ANTHROPIC_API_KEY != '' && 'claude-opus-4-6' || vars.OPENCLAW_CI_OPENAI_MODEL_BARE }} + OPENCLAW_CONTROL_UI_I18N_MODEL: ${{ secrets.ANTHROPIC_API_KEY != '' && 'claude-opus-4-7' || vars.OPENCLAW_CI_OPENAI_MODEL_BARE }} OPENCLAW_CONTROL_UI_I18N_THINKING: low OPENCLAW_CONTROL_UI_I18N_AUTH_OPTIONAL: "1" LOCALE: ${{ matrix.locale }} diff --git a/.github/workflows/mantis-discord-status-reactions.yml b/.github/workflows/mantis-discord-status-reactions.yml index d159a6cb02de..0bd813cb9e1b 100644 --- a/.github/workflows/mantis-discord-status-reactions.yml +++ b/.github/workflows/mantis-discord-status-reactions.yml @@ -349,8 +349,8 @@ jobs: --repo-root "$repo_root" \ --output-dir "$output_dir" \ --provider-mode live-frontier \ - --model openai/gpt-5.4 \ - --alt-model openai/gpt-5.4 \ + --model openai/gpt-5.5 \ + --alt-model openai/gpt-5.5 \ --fast \ --credential-source convex \ --credential-role ci \ diff --git a/.github/workflows/mantis-slack-desktop-smoke.yml b/.github/workflows/mantis-slack-desktop-smoke.yml index 6b71b9fac6c5..a608b0bc9800 100644 --- a/.github/workflows/mantis-slack-desktop-smoke.yml +++ b/.github/workflows/mantis-slack-desktop-smoke.yml @@ -281,8 +281,8 @@ jobs: --credential-role ci \ --provider-mode live-frontier \ --hydrate-mode "$HYDRATE_MODE" \ - --model openai/gpt-5.4 \ - --alt-model openai/gpt-5.4 \ + --model openai/gpt-5.5 \ + --alt-model openai/gpt-5.5 \ --fast \ --scenario "$SCENARIO_ID" \ "${keep_args[@]}" \ diff --git a/.github/workflows/mantis-telegram-live.yml b/.github/workflows/mantis-telegram-live.yml index 2287921c02be..400b5b8333db 100644 --- a/.github/workflows/mantis-telegram-live.yml +++ b/.github/workflows/mantis-telegram-live.yml @@ -386,7 +386,7 @@ jobs: output_rel=".artifacts/qa-e2e/mantis/telegram-live" root="$candidate_repo/$output_rel" echo "output_dir=${root}" >> "$GITHUB_OUTPUT" - model="${OPENCLAW_CI_OPENAI_MODEL:-openai/gpt-5.4}" + model="${OPENCLAW_CI_OPENAI_MODEL:-openai/gpt-5.5}" scenario_args=() if [[ -n "${SCENARIO_INPUT// }" ]]; then diff --git a/.github/workflows/openclaw-cross-os-release-checks-reusable.yml b/.github/workflows/openclaw-cross-os-release-checks-reusable.yml index c90260b22da6..b94220dc22f8 100644 --- a/.github/workflows/openclaw-cross-os-release-checks-reusable.yml +++ b/.github/workflows/openclaw-cross-os-release-checks-reusable.yml @@ -186,7 +186,7 @@ env: PNPM_VERSION: "11.0.8" OPENCLAW_REPOSITORY: openclaw/openclaw TSX_VERSION: "4.21.0" - OPENCLAW_CROSS_OS_OPENAI_MODEL: ${{ inputs.openai_model || vars.OPENCLAW_CROSS_OS_OPENAI_MODEL || 'openai/gpt-5.4' }} + OPENCLAW_CROSS_OS_OPENAI_MODEL: ${{ inputs.openai_model || vars.OPENCLAW_CROSS_OS_OPENAI_MODEL || 'openai/gpt-5.5' }} jobs: prepare: diff --git a/.github/workflows/openclaw-live-and-e2e-checks-reusable.yml b/.github/workflows/openclaw-live-and-e2e-checks-reusable.yml index dbceabb72ab1..326b671ce4db 100644 --- a/.github/workflows/openclaw-live-and-e2e-checks-reusable.yml +++ b/.github/workflows/openclaw-live-and-e2e-checks-reusable.yml @@ -1911,7 +1911,7 @@ jobs: - suite_id: native-live-src-gateway-profiles-anthropic-opus suite_group: native-live-src-gateway-profiles-anthropic label: Native live gateway profiles Anthropic Opus - command: OPENCLAW_LIVE_GATEWAY_PROVIDERS=anthropic OPENCLAW_LIVE_GATEWAY_MODELS=anthropic/claude-opus-4-7,anthropic/claude-opus-4-6 node .release-harness/scripts/test-live-shard.mjs native-live-src-gateway-profiles + command: OPENCLAW_LIVE_GATEWAY_PROVIDERS=anthropic OPENCLAW_LIVE_GATEWAY_MODELS=anthropic/claude-opus-4-7 node .release-harness/scripts/test-live-shard.mjs native-live-src-gateway-profiles timeout_minutes: 30 profile_env_only: false advisory: true diff --git a/.github/workflows/openclaw-performance.yml b/.github/workflows/openclaw-performance.yml index 44c61b7c5b33..25fbbef79165 100644 --- a/.github/workflows/openclaw-performance.yml +++ b/.github/workflows/openclaw-performance.yml @@ -30,8 +30,8 @@ on: required: false default: false type: boolean - live_gpt54: - description: Run the live OpenAI GPT 5.4 agent-turn lane + live_openai_candidate: + description: Run the live OpenAI GPT 5.5 agent-turn lane required: false default: false type: boolean @@ -57,7 +57,7 @@ env: FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true" OCM_VERSION: v0.2.15 KOVA_REPOSITORY: openclaw/Kova - PERFORMANCE_MODEL_ID: gpt-5.4 + PERFORMANCE_MODEL_ID: gpt-5.5 jobs: kova: @@ -82,8 +82,8 @@ jobs: deep_profile: "true" live: "false" include_filters: "scenario:fresh-install scenario:gateway-performance scenario:agent-cold-warm-message" - - lane: live-gpt54 - title: Kova live OpenAI GPT 5.4 agent turn + - lane: live-openai-candidate + title: Kova live OpenAI GPT 5.5 agent turn auth: live repeat: "1" deep_profile: "false" @@ -119,9 +119,9 @@ jobs: run_lane=false reason="deep_profile input is false" fi - if [[ "$LANE_ID" == "live-gpt54" && "${{ github.event_name }}" != "schedule" && "${{ inputs.live_gpt54 || 'false' }}" != "true" ]]; then + if [[ "$LANE_ID" == "live-openai-candidate" && "${{ github.event_name }}" != "schedule" && "${{ inputs.live_openai_candidate || 'false' }}" != "true" ]]; then run_lane=false - reason="live_gpt54 input is false" + reason="live_openai_candidate input is false" fi echo "run=$run_lane" >> "$GITHUB_OUTPUT" if [[ "$run_lane" != "true" ]]; then @@ -200,7 +200,7 @@ jobs: chmod 0755 "$HOME/.local/bin/kova" echo "$HOME/.local/bin" >> "$GITHUB_PATH" - - name: Pin Kova OpenAI model to GPT 5.4 + - name: Pin Kova OpenAI model to GPT 5.5 if: steps.lane.outputs.run == 'true' shell: bash run: | @@ -244,7 +244,7 @@ jobs: run: | set -euo pipefail if [[ -z "${OPENAI_API_KEY:-}" ]]; then - echo "OPENAI_API_KEY is not configured; live GPT 5.4 lane will be skipped." >> "$GITHUB_STEP_SUMMARY" + echo "OPENAI_API_KEY is not configured; live GPT 5.5 lane will be skipped." >> "$GITHUB_STEP_SUMMARY" exit 0 fi kova setup --ci --json diff --git a/.github/workflows/openclaw-release-checks.yml b/.github/workflows/openclaw-release-checks.yml index 8fc41f4d1cab..b69376d60655 100644 --- a/.github/workflows/openclaw-release-checks.yml +++ b/.github/workflows/openclaw-release-checks.yml @@ -542,7 +542,7 @@ jobs: candidate_file_name: openclaw-current.tgz candidate_version: ${{ needs.prepare_release_package.outputs.package_version }} candidate_source_sha: ${{ needs.prepare_release_package.outputs.source_sha }} - openai_model: openai/gpt-5.4 + openai_model: openai/gpt-5.5 ubuntu_runner: ubuntu-24.04 windows_runner: windows-2025 macos_runner: macos-26 @@ -724,9 +724,9 @@ jobs: matrix: include: - lane: candidate - output_dir: gpt54 + output_dir: openai-candidate - lane: baseline - output_dir: opus46 + output_dir: anthropic-baseline env: QA_PARITY_CONCURRENCY: "1" OPENCLAW_QA_TRANSPORT_READY_TIMEOUT_MS: "180000" @@ -772,7 +772,7 @@ jobs: ;; baseline) model="anthropic/claude-opus-4-7" - alt_model="anthropic/claude-sonnet-4-7" + alt_model="anthropic/claude-sonnet-4-6" ;; *) echo "Unknown QA parity lane: ${QA_PARITY_LANE}" >&2 @@ -841,8 +841,8 @@ jobs: run: | pnpm openclaw qa parity-report \ --repo-root . \ - --candidate-summary .artifacts/qa-e2e/gpt54/qa-suite-summary.json \ - --baseline-summary .artifacts/qa-e2e/opus46/qa-suite-summary.json \ + --candidate-summary .artifacts/qa-e2e/openai-candidate/qa-suite-summary.json \ + --baseline-summary .artifacts/qa-e2e/anthropic-baseline/qa-suite-summary.json \ --candidate-label "${OPENCLAW_CI_OPENAI_MODEL}" \ --baseline-label anthropic/claude-opus-4-7 \ --output-dir .artifacts/qa-e2e/parity diff --git a/.github/workflows/qa-live-transports-convex.yml b/.github/workflows/qa-live-transports-convex.yml index e02e9e1e92ac..96167052ba53 100644 --- a/.github/workflows/qa-live-transports-convex.yml +++ b/.github/workflows/qa-live-transports-convex.yml @@ -198,7 +198,7 @@ jobs: --concurrency "${QA_PARITY_CONCURRENCY}" \ --model "${OPENCLAW_CI_OPENAI_MODEL}" \ --alt-model openai/gpt-5.5-alt \ - --output-dir .artifacts/qa-e2e/gpt54 + --output-dir .artifacts/qa-e2e/openai-candidate - name: Run Opus 4.7 lane run: | @@ -207,15 +207,15 @@ jobs: --parity-pack agentic \ --concurrency "${QA_PARITY_CONCURRENCY}" \ --model anthropic/claude-opus-4-7 \ - --alt-model anthropic/claude-sonnet-4-7 \ - --output-dir .artifacts/qa-e2e/opus46 + --alt-model anthropic/claude-sonnet-4-6 \ + --output-dir .artifacts/qa-e2e/anthropic-baseline - name: Generate parity report run: | pnpm openclaw qa parity-report \ --repo-root . \ - --candidate-summary .artifacts/qa-e2e/gpt54/qa-suite-summary.json \ - --baseline-summary .artifacts/qa-e2e/opus46/qa-suite-summary.json \ + --candidate-summary .artifacts/qa-e2e/openai-candidate/qa-suite-summary.json \ + --baseline-summary .artifacts/qa-e2e/anthropic-baseline/qa-suite-summary.json \ --candidate-label "${OPENCLAW_CI_OPENAI_MODEL}" \ --baseline-label anthropic/claude-opus-4-7 \ --output-dir .artifacts/qa-e2e/parity @@ -565,8 +565,8 @@ jobs: --repo-root . \ --output-dir "${output_dir}" \ --provider-mode live-frontier \ - --model openai/gpt-5.4 \ - --alt-model openai/gpt-5.4 \ + --model openai/gpt-5.5 \ + --alt-model openai/gpt-5.5 \ --fast \ --credential-source convex \ --credential-role ci \ diff --git a/CHANGELOG.md b/CHANGELOG.md index 558a40b62592..1077bb02595b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ Docs: https://docs.openclaw.ai - QA-Lab/qa-channel: attach redacted agent tool-start traces to outbound `QaBusMessage` records so scenarios can assert actual tool use instead of relying only on reply text. Fixes #67637. Thanks @100yenadmin. - QA-Lab: fail live runtime parity reports when assistant-message usage is missing, preventing `0 vs 0` live token rows from being reported as passing proof. Fixes #80411. Thanks @100yenadmin. - QA-Lab: fail Codex-backed OpenAI live runtime-pair runs before launching isolated workers when no portable Codex auth is available, while staging API-key fallbacks and configured Codex keys for isolated QA agents. Fixes #80412. Thanks @100yenadmin. +- QA-Lab: refresh parity gates, mock frontier fixtures, model scenarios, and workflow artifact lanes to compare GPT-5.5 against Claude Opus 4.7. Fixes #74262. Thanks @100yenadmin. - QA-Lab: stop returning Control UI bearer tokens from unauthenticated bootstrap payloads and bind Docker harness ports to loopback-only host addresses. (#66355) Thanks @pgondhi987. - Mac app: avoid a SwiftUI metadata crash when rendering the Cron Jobs settings pane. - Agents/OpenAI streams: yield via `setTimeout(0)` instead of `setImmediate` between bursty Responses chunks so abort timers can fire during the yield, keeping cancel-on-timeout responsive on hot streams. Refs #82462. diff --git a/docs/ci.md b/docs/ci.md index 5274d757600d..f6fd19238220 100644 --- a/docs/ci.md +++ b/docs/ci.md @@ -35,7 +35,7 @@ OpenClaw CI runs on every push to `main` and every pull request. The `preflight` | `macos-swift` | Swift lint, build, and tests for the macOS app | macOS-relevant changes | | `android` | Android unit tests for both flavors plus one debug APK build | Android-relevant changes | | `test-performance-agent` | Daily Codex slow-test optimization after trusted activity | Main CI success or manual dispatch | -| `openclaw-performance` | Daily/on-demand Kova runtime performance reports with mock-provider, deep-profile, and GPT 5.4 live lanes | Scheduled and manual dispatch | +| `openclaw-performance` | Daily/on-demand Kova runtime performance reports with mock-provider, deep-profile, and GPT 5.5 live lanes | Scheduled and manual dispatch | ## Fail-fast order @@ -138,7 +138,7 @@ pnpm perf:kova:summary --report .artifacts/kova/reports/mock-provider/report.jso ```bash gh workflow run openclaw-performance.yml --ref main -f profile=diagnostic -f repeat=3 -gh workflow run openclaw-performance.yml --ref main -f profile=smoke -f repeat=1 -f deep_profile=true -f live_gpt54=true +gh workflow run openclaw-performance.yml --ref main -f profile=smoke -f repeat=1 -f deep_profile=true -f live_openai_candidate=true gh workflow run openclaw-performance.yml --ref main -f target_ref=v2026.5.2 -f profile=diagnostic -f repeat=3 ``` @@ -148,7 +148,7 @@ The workflow installs OCM from a pinned release and Kova from `openclaw/Kova` at - `mock-provider`: Kova diagnostic scenarios against a local-build runtime with deterministic fake OpenAI-compatible auth. - `mock-deep-profile`: CPU/heap/trace profiling for startup, gateway, and agent-turn hotspots. -- `live-gpt54`: a real OpenAI `openai/gpt-5.4` agent turn, skipped when `OPENAI_API_KEY` is unavailable. +- `live-openai-candidate`: a real OpenAI `openai/gpt-5.5` agent turn, skipped when `OPENAI_API_KEY` is unavailable. The mock-provider lane also runs OpenClaw-native source probes after the Kova pass: gateway boot timing and memory across default, hook, and 50-plugin startup cases; repeated mock-OpenAI `channel-chat-baseline` hello loops; and CLI startup commands against the booted gateway. The source probe Markdown summary lives at `source/index.md` in the report bundle, with raw JSON beside it. @@ -269,7 +269,7 @@ For the dedicated update and plugin testing policy, including local commands, Docker lanes, Package Acceptance inputs, release defaults, and failure triage, see [Testing updates and plugins](/help/testing-updates-plugins). -Release checks call Package Acceptance with `source=artifact`, the prepared release package artifact, `suite_profile=custom`, `docker_lanes='doctor-switch update-channel-switch skill-install update-corrupt-plugin upgrade-survivor published-upgrade-survivor update-restart-auth plugins-offline plugin-update'`, and `telegram_mode=mock-openai`. This keeps package migration, update, live ClawHub skill install, stale-plugin-dependency cleanup, configured-plugin install repair, offline plugin, plugin-update, and Telegram proof on the same resolved package tarball. Set `release_package_spec` on Full Release Validation or OpenClaw Release Checks after publishing a beta to run the same matrix against the shipped npm package without rebuilding; set `package_acceptance_package_spec` only when Package Acceptance needs a different package from the rest of release validation. Cross-OS release checks still cover OS-specific onboarding, installer, and platform behavior; package/update product validation should start with Package Acceptance. The `published-upgrade-survivor` Docker lane validates one published package baseline per run in the blocking release path. In Package Acceptance, the resolved `package-under-test` tarball is always the candidate and `published_upgrade_survivor_baseline` selects the fallback published baseline, defaulting to `openclaw@latest`; failed-lane rerun commands preserve that baseline. Full Release Validation with `run_release_soak=true` or `release_profile=full` sets `published_upgrade_survivor_baselines='last-stable-4 2026.4.23 2026.5.2 2026.4.15'` and `published_upgrade_survivor_scenarios=reported-issues` to expand across the four latest stable npm releases plus pinned plugin-compatibility boundary releases and issue-shaped fixtures for Feishu config, preserved bootstrap/persona files, configured OpenClaw plugin installs, tilde log paths, and stale legacy plugin dependency roots. Multi-baseline published-upgrade survivor selections are sharded by baseline into separate targeted Docker runner jobs. The separate `Update Migration` workflow uses the `update-migration` Docker lane with `all-since-2026.4.23` and `plugin-deps-cleanup` when the question is exhaustive published update cleanup, not normal Full Release CI breadth. Local aggregate runs can pass exact package specs with `OPENCLAW_UPGRADE_SURVIVOR_BASELINE_SPECS`, keep a single lane with `OPENCLAW_UPGRADE_SURVIVOR_BASELINE_SPEC` such as `openclaw@2026.4.15`, or set `OPENCLAW_UPGRADE_SURVIVOR_SCENARIOS` for the scenario matrix. The published lane configures the baseline with a baked `openclaw config set` command recipe, records recipe steps in `summary.json`, and probes `/healthz`, `/readyz`, plus RPC status after Gateway start. The Windows packaged and installer fresh lanes also verify that an installed package can import a browser-control override from a raw absolute Windows path. The OpenAI cross-OS agent-turn smoke defaults to `OPENCLAW_CROSS_OS_OPENAI_MODEL` when set, otherwise `openai/gpt-5.4`, so the install and gateway proof stays on a GPT-5 test model while avoiding GPT-4.x defaults. +Release checks call Package Acceptance with `source=artifact`, the prepared release package artifact, `suite_profile=custom`, `docker_lanes='doctor-switch update-channel-switch skill-install update-corrupt-plugin upgrade-survivor published-upgrade-survivor update-restart-auth plugins-offline plugin-update'`, and `telegram_mode=mock-openai`. This keeps package migration, update, live ClawHub skill install, stale-plugin-dependency cleanup, configured-plugin install repair, offline plugin, plugin-update, and Telegram proof on the same resolved package tarball. Set `release_package_spec` on Full Release Validation or OpenClaw Release Checks after publishing a beta to run the same matrix against the shipped npm package without rebuilding; set `package_acceptance_package_spec` only when Package Acceptance needs a different package from the rest of release validation. Cross-OS release checks still cover OS-specific onboarding, installer, and platform behavior; package/update product validation should start with Package Acceptance. The `published-upgrade-survivor` Docker lane validates one published package baseline per run in the blocking release path. In Package Acceptance, the resolved `package-under-test` tarball is always the candidate and `published_upgrade_survivor_baseline` selects the fallback published baseline, defaulting to `openclaw@latest`; failed-lane rerun commands preserve that baseline. Full Release Validation with `run_release_soak=true` or `release_profile=full` sets `published_upgrade_survivor_baselines='last-stable-4 2026.4.23 2026.5.2 2026.4.15'` and `published_upgrade_survivor_scenarios=reported-issues` to expand across the four latest stable npm releases plus pinned plugin-compatibility boundary releases and issue-shaped fixtures for Feishu config, preserved bootstrap/persona files, configured OpenClaw plugin installs, tilde log paths, and stale legacy plugin dependency roots. Multi-baseline published-upgrade survivor selections are sharded by baseline into separate targeted Docker runner jobs. The separate `Update Migration` workflow uses the `update-migration` Docker lane with `all-since-2026.4.23` and `plugin-deps-cleanup` when the question is exhaustive published update cleanup, not normal Full Release CI breadth. Local aggregate runs can pass exact package specs with `OPENCLAW_UPGRADE_SURVIVOR_BASELINE_SPECS`, keep a single lane with `OPENCLAW_UPGRADE_SURVIVOR_BASELINE_SPEC` such as `openclaw@2026.4.15`, or set `OPENCLAW_UPGRADE_SURVIVOR_SCENARIOS` for the scenario matrix. The published lane configures the baseline with a baked `openclaw config set` command recipe, records recipe steps in `summary.json`, and probes `/healthz`, `/readyz`, plus RPC status after Gateway start. The Windows packaged and installer fresh lanes also verify that an installed package can import a browser-control override from a raw absolute Windows path. The OpenAI cross-OS agent-turn smoke defaults to `OPENCLAW_CROSS_OS_OPENAI_MODEL` when set, otherwise `openai/gpt-5.5`, so the install and gateway proof stays on a GPT-5 test model while avoiding GPT-4.x defaults. ### Legacy compatibility windows diff --git a/docs/concepts/qa-e2e-automation.md b/docs/concepts/qa-e2e-automation.md index 187d1f6cb636..06ba341d853f 100644 --- a/docs/concepts/qa-e2e-automation.md +++ b/docs/concepts/qa-e2e-automation.md @@ -175,7 +175,7 @@ For an agent/CV style desktop task, run: pnpm openclaw qa mantis visual-task \ --browser-url https://example.net \ --expect-text "Example Domain" \ - --vision-model openai/gpt-5.4 + --vision-model openai/gpt-5.5 ``` `visual-task` leases or reuses a Crabbox desktop/browser machine, starts @@ -370,8 +370,8 @@ Run the Mantis status-reaction scenario explicitly: pnpm openclaw qa discord \ --scenario discord-status-reactions-tool-only \ --provider-mode live-frontier \ - --model openai/gpt-5.4 \ - --alt-model openai/gpt-5.4 \ + --model openai/gpt-5.5 \ + --alt-model openai/gpt-5.5 \ --fast ``` @@ -780,13 +780,13 @@ pnpm openclaw qa character-eval \ --model openai/gpt-5.5,thinking=medium,fast \ --model openai/gpt-5.2,thinking=xhigh \ --model openai/gpt-5,thinking=xhigh \ - --model anthropic/claude-opus-4-6,thinking=high \ + --model anthropic/claude-opus-4-7,thinking=high \ --model anthropic/claude-sonnet-4-6,thinking=high \ --model zai/glm-5.1,thinking=high \ --model moonshot/kimi-k2.5,thinking=high \ --model google/gemini-3.1-pro-preview,thinking=high \ --judge-model openai/gpt-5.5,thinking=xhigh,fast \ - --judge-model anthropic/claude-opus-4-6,thinking=high \ + --judge-model anthropic/claude-opus-4-7,thinking=high \ --blind-judge-models \ --concurrency 16 \ --judge-concurrency 16 @@ -817,13 +817,13 @@ Candidate and judge model runs both default to concurrency 16. Lower `--concurrency` or `--judge-concurrency` when provider limits or local gateway pressure make a run too noisy. When no candidate `--model` is passed, the character eval defaults to -`openai/gpt-5.5`, `openai/gpt-5.2`, `openai/gpt-5`, `anthropic/claude-opus-4-6`, +`openai/gpt-5.5`, `openai/gpt-5.2`, `openai/gpt-5`, `anthropic/claude-opus-4-7`, `anthropic/claude-sonnet-4-6`, `zai/glm-5.1`, `moonshot/kimi-k2.5`, and `google/gemini-3.1-pro-preview` when no `--model` is passed. When no `--judge-model` is passed, the judges default to `openai/gpt-5.5,thinking=xhigh,fast` and -`anthropic/claude-opus-4-6,thinking=high`. +`anthropic/claude-opus-4-7,thinking=high`. ## Related docs diff --git a/docs/help/gpt55-codex-agentic-parity-maintainers.md b/docs/help/gpt55-codex-agentic-parity-maintainers.md index e683a259d057..2cf69baadc9d 100644 --- a/docs/help/gpt55-codex-agentic-parity-maintainers.md +++ b/docs/help/gpt55-codex-agentic-parity-maintainers.md @@ -59,7 +59,7 @@ Does not own: Owns: -- first-wave GPT-5.5 vs Opus 4.6 scenario pack +- first-wave GPT-5.5 vs Opus 4.7 scenario pack - parity documentation - parity report and release-gate mechanics @@ -123,7 +123,7 @@ Expected artifacts from PR D: ## Release gate -Do not claim GPT-5.5 parity or superiority over Opus 4.6 until: +Do not claim GPT-5.5 parity or superiority over Opus 4.7 until: - PR A, PR B, and PR C are merged - PR D runs the first-wave parity pack cleanly @@ -133,7 +133,7 @@ Do not claim GPT-5.5 parity or superiority over Opus 4.6 until: ```mermaid flowchart LR A["PR A-C merged"] --> B["Run GPT-5.5 parity pack"] - A --> C["Run Opus 4.6 parity pack"] + A --> C["Run Opus 4.7 parity pack"] B --> D["qa-suite-summary.json"] C --> E["qa-suite-summary.json"] D --> F["qa parity-report"] @@ -146,7 +146,7 @@ flowchart LR The parity harness is not the only evidence source. Keep this split explicit in review: -- PR D owns the scenario-based GPT-5.5 vs Opus 4.6 comparison +- PR D owns the scenario-based GPT-5.5 vs Opus 4.7 comparison - PR B deterministic suites still own auth/proxy/DNS and full-access truthfulness evidence ## Quick maintainer merge workflow @@ -179,7 +179,7 @@ If any one of the evidence bar items is missing, request changes instead of merg | No fake progress or fake tool completion | PR A + PR D | parity fake-success count plus scenario-level report details | | No false `/elevated full` guidance | PR B | deterministic runtime-truthfulness suites | | Replay/liveness failures remain explicit | PR C + PR D | lifecycle/replay suites plus `compaction-retry-mutating-tool` | -| GPT-5.5 matches or beats Opus 4.6 | PR D | `qa-agentic-parity-report.md` and `qa-agentic-parity-summary.json` | +| GPT-5.5 matches or beats Opus 4.7 | PR D | `qa-agentic-parity-report.md` and `qa-agentic-parity-summary.json` | ## Reviewer shorthand: before vs after diff --git a/docs/help/gpt55-codex-agentic-parity.md b/docs/help/gpt55-codex-agentic-parity.md index d833a556a239..dd9ed85dfa0e 100644 --- a/docs/help/gpt55-codex-agentic-parity.md +++ b/docs/help/gpt55-codex-agentic-parity.md @@ -13,7 +13,7 @@ OpenClaw already worked well with tool-using frontier models, but GPT-5.5 and Co - they could use strict OpenAI/Codex tool schemas incorrectly - they could ask for `/elevated full` even when full access was impossible - they could lose long-running task state during replay or compaction -- parity claims against Claude Opus 4.6 were based on anecdotes instead of repeatable scenarios +- parity claims against Claude Opus 4.7 were based on anecdotes instead of repeatable scenarios This parity program fixes those gaps in four reviewable slices. @@ -51,7 +51,7 @@ The tool-compat work reduces schema friction for strict OpenAI/Codex tool regist ### PR D: parity harness -This slice adds the first-wave QA-lab parity pack so GPT-5.5 and Opus 4.6 can be exercised through the same scenarios and compared using shared evidence. +This slice adds the first-wave QA-lab parity pack so GPT-5.5 and Opus 4.7 can be exercised through the same scenarios and compared using shared evidence. The parity pack is the proof layer. It does not change runtime behavior by itself. @@ -60,8 +60,8 @@ After you have two `qa-suite-summary.json` artifacts, generate the release-gate ```bash pnpm openclaw qa parity-report \ --repo-root . \ - --candidate-summary .artifacts/qa-e2e/gpt55/qa-suite-summary.json \ - --baseline-summary .artifacts/qa-e2e/opus46/qa-suite-summary.json \ + --candidate-summary .artifacts/qa-e2e/openai-candidate/qa-suite-summary.json \ + --baseline-summary .artifacts/qa-e2e/anthropic-baseline/qa-suite-summary.json \ --output-dir .artifacts/qa-e2e/parity ``` @@ -122,7 +122,7 @@ flowchart TD ```mermaid flowchart LR A["Merged runtime slices (PR A-C)"] --> B["Run GPT-5.5 parity pack"] - A --> C["Run Opus 4.6 parity pack"] + A --> C["Run Opus 4.7 parity pack"] B --> D["qa-suite-summary.json"] C --> E["qa-suite-summary.json"] D --> F["openclaw qa parity-report"] @@ -178,7 +178,7 @@ Required outcomes: - no fake completion without real execution - no incorrect `/elevated full` guidance - no silent replay or compaction abandonment -- parity-pack metrics that are at least as strong as the agreed Opus 4.6 baseline +- parity-pack metrics that are at least as strong as the agreed Opus 4.7 baseline For the first-wave harness, the gate compares: @@ -189,7 +189,7 @@ For the first-wave harness, the gate compares: Parity evidence is intentionally split across two layers: -- PR D proves same-scenario GPT-5.5 vs Opus 4.6 behavior with QA-lab +- PR D proves same-scenario GPT-5.5 vs Opus 4.7 behavior with QA-lab - PR B deterministic suites prove auth, proxy, DNS, and `/elevated full` truthfulness outside the harness ## Goal-to-evidence matrix @@ -200,13 +200,13 @@ Parity evidence is intentionally split across two layers: | GPT-5.5 no longer fakes progress or fake tool completion | PR A + PR D | parity report scenario outcomes and fake-success count | no suspicious pass results and no commentary-only completion | | GPT-5.5 no longer gives false `/elevated full` guidance | PR B | deterministic truthfulness suites | blocked reasons and full-access hints stay runtime-accurate | | Replay/liveness failures stay explicit | PR C + PR D | PR C lifecycle/replay suites plus `compaction-retry-mutating-tool` | mutating work keeps replay-unsafety explicit instead of silently disappearing | -| GPT-5.5 matches or beats Opus 4.6 on the agreed metrics | PR D | `qa-agentic-parity-report.md` and `qa-agentic-parity-summary.json` | same scenario coverage and no regression on completion, stop behavior, or valid tool use | +| GPT-5.5 matches or beats Opus 4.7 on the agreed metrics | PR D | `qa-agentic-parity-report.md` and `qa-agentic-parity-summary.json` | same scenario coverage and no regression on completion, stop behavior, or valid tool use | ## How to read the parity verdict Use the verdict in `qa-agentic-parity-summary.json` as the final machine-readable decision for the first-wave parity pack. -- `pass` means GPT-5.5 covered the same scenarios as Opus 4.6 and did not regress on the agreed aggregate metrics. +- `pass` means GPT-5.5 covered the same scenarios as Opus 4.7 and did not regress on the agreed aggregate metrics. - `fail` means at least one hard gate tripped: weaker completion, worse unintended stops, weaker valid tool use, any fake-success case, or mismatched scenario coverage. - "shared/base CI issue" is not itself a parity result. If CI noise outside PR D blocks a run, the verdict should wait for a clean merged-runtime execution instead of being inferred from branch-era logs. - Auth, proxy, DNS, and `/elevated full` truthfulness still come from PR B's deterministic suites, so the final release claim needs both: a passing PR D parity verdict and green PR B truthfulness coverage. diff --git a/docs/help/testing.md b/docs/help/testing.md index 5d8124ee4d27..9928d9161b2d 100644 --- a/docs/help/testing.md +++ b/docs/help/testing.md @@ -47,9 +47,9 @@ When debugging real providers/models (requires real creds): - Live suite (models + gateway tool/image probes): `pnpm test:live` - Target one live file quietly: `pnpm test:live -- src/agents/models.profiles.live.test.ts` - Runtime performance reports: dispatch `OpenClaw Performance` with - `live_gpt54=true` for a real `openai/gpt-5.4` agent turn or + `live_openai_candidate=true` for a real `openai/gpt-5.5` agent turn or `deep_profile=true` for Kova CPU/heap/trace artifacts. Daily scheduled runs - publish mock-provider, deep-profile, and GPT 5.4 lane artifacts to + publish mock-provider, deep-profile, and GPT 5.5 lane artifacts to `openclaw/clawgrit-reports` when `CLAWGRIT_REPORTS_TOKEN` is configured. The mock-provider report also includes source-level gateway boot, memory, plugin-pressure, repeated fake-model hello-loop, and CLI startup numbers. diff --git a/extensions/qa-lab/src/agentic-parity-report.test.ts b/extensions/qa-lab/src/agentic-parity-report.test.ts index 2472ee2ee10d..9c9c3d87f1f8 100644 --- a/extensions/qa-lab/src/agentic-parity-report.test.ts +++ b/extensions/qa-lab/src/agentic-parity-report.test.ts @@ -153,7 +153,7 @@ describe("qa agentic parity report", () => { it("fails the parity gate when the candidate regresses against baseline", () => { const comparison = buildQaAgenticParityComparison({ candidateLabel: "openai/gpt-5.5", - baselineLabel: "anthropic/claude-opus-4-6", + baselineLabel: "anthropic/claude-opus-4-7", candidateSummary: { scenarios: [ { name: "Approval turn tool followthrough", status: "pass" }, @@ -181,10 +181,10 @@ describe("qa agentic parity report", () => { expect(comparison.pass).toBe(false); expect(comparison.failures).toContain( - "openai/gpt-5.5 completion rate 80.0% is below anthropic/claude-opus-4-6 100.0%.", + "openai/gpt-5.5 completion rate 80.0% is below anthropic/claude-opus-4-7 100.0%.", ); expect(comparison.failures).toContain( - "openai/gpt-5.5 unintended-stop rate 20.0% exceeds anthropic/claude-opus-4-6 0.0%.", + "openai/gpt-5.5 unintended-stop rate 20.0% exceeds anthropic/claude-opus-4-7 0.0%.", ); }); @@ -199,7 +199,7 @@ describe("qa agentic parity report", () => { ]; const comparison = buildQaAgenticParityComparison({ candidateLabel: "openai/gpt-5.5", - baselineLabel: "anthropic/claude-opus-4-6", + baselineLabel: "anthropic/claude-opus-4-7", candidateSummary: { scenarios: baselineScenarios.filter( (scenario) => scenario.name !== "Extra non-parity lane", @@ -211,14 +211,14 @@ describe("qa agentic parity report", () => { expect(comparison.pass).toBe(false); expect(comparison.failures).toContain( - "Scenario coverage mismatch for Extra non-parity lane: openai/gpt-5.5=missing, anthropic/claude-opus-4-6=pass.", + "Scenario coverage mismatch for Extra non-parity lane: openai/gpt-5.5=missing, anthropic/claude-opus-4-7=pass.", ); }); it("reports each missing required parity scenario exactly once (no double-counting)", () => { const comparison = buildQaAgenticParityComparison({ candidateLabel: "openai/gpt-5.5", - baselineLabel: "anthropic/claude-opus-4-6", + baselineLabel: "anthropic/claude-opus-4-7", candidateSummary: { scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }], }, @@ -260,7 +260,7 @@ describe("qa agentic parity report", () => { const comparison = buildQaAgenticParityComparison({ candidateLabel: "openai/gpt-5.5", - baselineLabel: "anthropic/claude-opus-4-6", + baselineLabel: "anthropic/claude-opus-4-7", candidateSummary: summaryWithExtras, baselineSummary: scopedSummary, comparedAt: "2026-04-11T00:00:00.000Z", @@ -282,7 +282,7 @@ describe("qa agentic parity report", () => { it("fails the parity gate when required parity scenarios are missing on both sides", () => { const comparison = buildQaAgenticParityComparison({ candidateLabel: "openai/gpt-5.5", - baselineLabel: "anthropic/claude-opus-4-6", + baselineLabel: "anthropic/claude-opus-4-7", candidateSummary: { scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }], }, @@ -294,14 +294,14 @@ describe("qa agentic parity report", () => { expect(comparison.pass).toBe(false); expect(comparison.failures).toContain( - "Missing required parity scenario coverage for Image understanding from attachment: openai/gpt-5.5=missing, anthropic/claude-opus-4-6=missing.", + "Missing required parity scenario coverage for Image understanding from attachment: openai/gpt-5.5=missing, anthropic/claude-opus-4-7=missing.", ); }); it("fails the parity gate when required parity scenarios are skipped", () => { const comparison = buildQaAgenticParityComparison({ candidateLabel: "openai/gpt-5.5", - baselineLabel: "anthropic/claude-opus-4-6", + baselineLabel: "anthropic/claude-opus-4-7", candidateSummary: { scenarios: [ { name: "Approval turn tool followthrough", status: "pass" }, @@ -325,7 +325,7 @@ describe("qa agentic parity report", () => { expect(comparison.pass).toBe(false); expect(comparison.failures).toContain( - "Missing required parity scenario coverage for Compaction retry after mutating tool: openai/gpt-5.5=skip, anthropic/claude-opus-4-6=skip.", + "Missing required parity scenario coverage for Compaction retry after mutating tool: openai/gpt-5.5=skip, anthropic/claude-opus-4-7=skip.", ); }); @@ -342,7 +342,7 @@ describe("qa agentic parity report", () => { }); const comparison = buildQaAgenticParityComparison({ candidateLabel: "openai/gpt-5.5", - baselineLabel: "anthropic/claude-opus-4-6", + baselineLabel: "anthropic/claude-opus-4-7", candidateSummary: { scenarios: scenariosWithBothFail }, baselineSummary: { scenarios: scenariosWithBothFail }, comparedAt: "2026-04-11T00:00:00.000Z", @@ -350,7 +350,7 @@ describe("qa agentic parity report", () => { expect(comparison.pass).toBe(false); expect(comparison.failures).toContain( - "Required parity scenario Approval turn tool followthrough failed: openai/gpt-5.5=fail, anthropic/claude-opus-4-6=fail.", + "Required parity scenario Approval turn tool followthrough failed: openai/gpt-5.5=fail, anthropic/claude-opus-4-7=fail.", ); // Metric comparisons are relative, so a same-on-both-sides failure // must not appear as a relative metric failure. The required-scenario @@ -370,7 +370,7 @@ describe("qa agentic parity report", () => { }); const comparison = buildQaAgenticParityComparison({ candidateLabel: "openai/gpt-5.5", - baselineLabel: "anthropic/claude-opus-4-6", + baselineLabel: "anthropic/claude-opus-4-7", candidateSummary: { scenarios: candidateWithOneFail }, baselineSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS }, comparedAt: "2026-04-11T00:00:00.000Z", @@ -378,7 +378,7 @@ describe("qa agentic parity report", () => { expect(comparison.pass).toBe(false); expect(comparison.failures).toContain( - "Required parity scenario Approval turn tool followthrough failed: openai/gpt-5.5=fail, anthropic/claude-opus-4-6=pass.", + "Required parity scenario Approval turn tool followthrough failed: openai/gpt-5.5=fail, anthropic/claude-opus-4-7=pass.", ); }); @@ -387,7 +387,7 @@ describe("qa agentic parity report", () => { // below is the isolated gate failure under test (no coverage-gap noise). const comparison = buildQaAgenticParityComparison({ candidateLabel: "openai/gpt-5.5", - baselineLabel: "anthropic/claude-opus-4-6", + baselineLabel: "anthropic/claude-opus-4-7", candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS, }, @@ -401,7 +401,7 @@ describe("qa agentic parity report", () => { expect(comparison.pass).toBe(false); expect(comparison.failures).toEqual([ - "anthropic/claude-opus-4-6 produced 1 suspicious pass result(s); baseline fake-success count must also be 0.", + "anthropic/claude-opus-4-7 produced 1 suspicious pass result(s); baseline fake-success count must also be 0.", ]); }); @@ -571,14 +571,14 @@ status=done`, expect(() => buildQaAgenticParityComparison({ candidateLabel: "openai/gpt-5.5", - baselineLabel: "anthropic/claude-opus-4-6", + baselineLabel: "anthropic/claude-opus-4-7", candidateSummary: { scenarios: parityPassScenarios, - run: { primaryProvider: "anthropic", primaryModel: "claude-opus-4-6" }, + run: { primaryProvider: "anthropic", primaryModel: "claude-opus-4-7" }, }, baselineSummary: { scenarios: parityPassScenarios, - run: { primaryProvider: "anthropic", primaryModel: "claude-opus-4-6" }, + run: { primaryProvider: "anthropic", primaryModel: "claude-opus-4-7" }, }, comparedAt: "2026-04-11T00:00:00.000Z", }), @@ -593,7 +593,7 @@ status=done`, expect(() => buildQaAgenticParityComparison({ candidateLabel: "openai/gpt-5.5", - baselineLabel: "anthropic/claude-opus-4-6", + baselineLabel: "anthropic/claude-opus-4-7", candidateSummary: { scenarios: parityPassScenarios, run: { primaryProvider: "openai" }, @@ -612,7 +612,7 @@ status=done`, it("accepts matching run.primaryProvider labels without throwing", () => { const comparison = buildQaAgenticParityComparison({ candidateLabel: "openai/gpt-5.5", - baselineLabel: "anthropic/claude-opus-4-6", + baselineLabel: "anthropic/claude-opus-4-7", candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS, run: { @@ -625,8 +625,8 @@ status=done`, scenarios: FULL_PARITY_PASS_SCENARIOS, run: { primaryProvider: "anthropic", - primaryModel: "anthropic/claude-opus-4-6", - primaryModelName: "claude-opus-4-6", + primaryModel: "anthropic/claude-opus-4-7", + primaryModelName: "claude-opus-4-7", }, }, comparedAt: "2026-04-11T00:00:00.000Z", @@ -639,7 +639,7 @@ status=done`, // work against those, trusting the caller-supplied label. const comparison = buildQaAgenticParityComparison({ candidateLabel: "openai/gpt-5.5", - baselineLabel: "anthropic/claude-opus-4-6", + baselineLabel: "anthropic/claude-opus-4-7", candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS }, baselineSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS }, comparedAt: "2026-04-11T00:00:00.000Z", @@ -650,7 +650,7 @@ status=done`, it("skips provider verification for arbitrary display labels when run metadata is present", () => { const comparison = buildQaAgenticParityComparison({ candidateLabel: "GPT-5.5 candidate", - baselineLabel: "Opus 4.6 baseline", + baselineLabel: "Opus 4.7 baseline", candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS, run: { @@ -663,8 +663,8 @@ status=done`, scenarios: FULL_PARITY_PASS_SCENARIOS, run: { primaryProvider: "anthropic", - primaryModel: "anthropic/claude-opus-4-6", - primaryModelName: "claude-opus-4-6", + primaryModel: "anthropic/claude-opus-4-7", + primaryModelName: "claude-opus-4-7", }, }, comparedAt: "2026-04-11T00:00:00.000Z", @@ -676,7 +676,7 @@ status=done`, it("skips provider verification for mixed-case or decorated display labels", () => { const comparison = buildQaAgenticParityComparison({ candidateLabel: "Candidate: GPT-5.5", - baselineLabel: "Opus 4.6 / baseline", + baselineLabel: "Opus 4.7 / baseline", candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS, run: { @@ -689,8 +689,8 @@ status=done`, scenarios: FULL_PARITY_PASS_SCENARIOS, run: { primaryProvider: "anthropic", - primaryModel: "anthropic/claude-opus-4-6", - primaryModelName: "claude-opus-4-6", + primaryModel: "anthropic/claude-opus-4-7", + primaryModelName: "claude-opus-4-7", }, }, comparedAt: "2026-04-11T00:00:00.000Z", @@ -703,7 +703,7 @@ status=done`, expect(() => buildQaAgenticParityComparison({ candidateLabel: "openai/gpt-5.5", - baselineLabel: "anthropic/claude-opus-4-6", + baselineLabel: "anthropic/claude-opus-4-7", candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS, run: { @@ -716,8 +716,8 @@ status=done`, scenarios: FULL_PARITY_PASS_SCENARIOS, run: { primaryProvider: "anthropic", - primaryModel: "anthropic/claude-opus-4-6", - primaryModelName: "claude-opus-4-6", + primaryModel: "anthropic/claude-opus-4-7", + primaryModelName: "claude-opus-4-7", }, }, comparedAt: "2026-04-11T00:00:00.000Z", @@ -730,7 +730,7 @@ status=done`, it("accepts colon-delimited structured labels when provider and model both match", () => { const comparison = buildQaAgenticParityComparison({ candidateLabel: "openai:gpt-5.5", - baselineLabel: "anthropic:claude-opus-4-6", + baselineLabel: "anthropic:claude-opus-4-7", candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS, run: { @@ -743,8 +743,8 @@ status=done`, scenarios: FULL_PARITY_PASS_SCENARIOS, run: { primaryProvider: "anthropic", - primaryModel: "anthropic/claude-opus-4-6", - primaryModelName: "claude-opus-4-6", + primaryModel: "anthropic/claude-opus-4-7", + primaryModelName: "claude-opus-4-7", }, }, comparedAt: "2026-04-11T00:00:00.000Z", @@ -759,7 +759,7 @@ status=done`, // added by the second-wave expansion. const comparison = buildQaAgenticParityComparison({ candidateLabel: "openai/gpt-5.5", - baselineLabel: "anthropic/claude-opus-4-6", + baselineLabel: "anthropic/claude-opus-4-7", candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS }, baselineSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS }, comparedAt: "2026-04-11T00:00:00.000Z", @@ -768,7 +768,7 @@ status=done`, const report = renderQaAgenticParityMarkdownReport(comparison); expect(report).toContain( - "# OpenClaw Agentic Parity Report — openai/gpt-5.5 vs anthropic/claude-opus-4-6", + "# OpenClaw Agentic Parity Report — openai/gpt-5.5 vs anthropic/claude-opus-4-7", ); expect(report).toContain("| Completion rate | 100.0% | 100.0% |"); expect(report).toContain("### Approval turn tool followthrough"); @@ -779,7 +779,7 @@ status=done`, // Regression for the loop-7 Copilot finding: callers that configure // non-gpt-5.5 / non-opus labels (for example an internal candidate vs // another candidate) must see the labels in the rendered H1 instead of - // the hardcoded "GPT-5.5 / Opus 4.6" title that would otherwise confuse + // the hardcoded "GPT-5.5 / Opus 4.7" title that would otherwise confuse // readers of saved reports. const comparison = buildQaAgenticParityComparison({ candidateLabel: "openai/gpt-5.5-alt", diff --git a/extensions/qa-lab/src/agentic-parity-report.ts b/extensions/qa-lab/src/agentic-parity-report.ts index 1d1aba152c59..d73251f65afd 100644 --- a/extensions/qa-lab/src/agentic-parity-report.ts +++ b/extensions/qa-lab/src/agentic-parity-report.ts @@ -564,7 +564,7 @@ export function renderQaAgenticParityMarkdownReport(comparison: QaAgenticParityC // Title is parametrized from the candidate / baseline labels so reports // for any candidate/baseline pair (not only gpt-5.5 vs opus 4.6) render // with an accurate header. The default CLI labels are still - // openai/gpt-5.5 vs anthropic/claude-opus-4-6, but the helper works for + // openai/gpt-5.5 vs anthropic/claude-opus-4-7, but the helper works for // any parity comparison a caller configures. const lines = [ `# OpenClaw Agentic Parity Report — ${comparison.candidateLabel} vs ${comparison.baselineLabel}`, diff --git a/extensions/qa-lab/src/character-eval.test.ts b/extensions/qa-lab/src/character-eval.test.ts index 5b5046151edc..22d97466369b 100644 --- a/extensions/qa-lab/src/character-eval.test.ts +++ b/extensions/qa-lab/src/character-eval.test.ts @@ -274,7 +274,7 @@ describe("runQaCharacterEval", () => { { model: "openai/gpt-5.5", rank: 1, score: 8, summary: "ok" }, { model: "openai/gpt-5.2", rank: 2, score: 7.5, summary: "ok" }, { model: "openai/gpt-5", rank: 3, score: 7.2, summary: "ok" }, - { model: "anthropic/claude-opus-4-6", rank: 4, score: 7, summary: "ok" }, + { model: "anthropic/claude-opus-4-7", rank: 4, score: 7, summary: "ok" }, { model: "anthropic/claude-sonnet-4-6", rank: 5, score: 6.8, summary: "ok" }, { model: "zai/glm-5.1", rank: 6, score: 6.3, summary: "ok" }, { model: "moonshot/kimi-k2.5", rank: 7, score: 6.2, summary: "ok" }, @@ -294,7 +294,7 @@ describe("runQaCharacterEval", () => { "openai/gpt-5.5", "openai/gpt-5.2", "openai/gpt-5", - "anthropic/claude-opus-4-6", + "anthropic/claude-opus-4-7", "anthropic/claude-sonnet-4-6", "zai/glm-5.1", "moonshot/kimi-k2.5", @@ -323,7 +323,7 @@ describe("runQaCharacterEval", () => { expect(runJudge).toHaveBeenCalledTimes(2); expect(runJudge.mock.calls.map(([params]) => params.judgeModel)).toEqual([ "openai/gpt-5.5", - "anthropic/claude-opus-4-6", + "anthropic/claude-opus-4-7", ]); expect(runJudge.mock.calls.map(([params]) => params.judgeThinkingDefault)).toEqual([ "xhigh", @@ -577,11 +577,11 @@ describe("runQaCharacterEval", () => { candidateModelOptions: { "openai/gpt-5.5": { thinkingDefault: "xhigh", fastMode: false }, }, - judgeModels: ["openai/gpt-5.5", "anthropic/claude-opus-4-6"], + judgeModels: ["openai/gpt-5.5", "anthropic/claude-opus-4-7"], judgeThinkingDefault: "medium", judgeModelOptions: { "openai/gpt-5.5": { thinkingDefault: "xhigh", fastMode: true }, - "anthropic/claude-opus-4-6": { thinkingDefault: "high" }, + "anthropic/claude-opus-4-7": { thinkingDefault: "high" }, }, runSuite, runJudge, diff --git a/extensions/qa-lab/src/cli.runtime.test.ts b/extensions/qa-lab/src/cli.runtime.test.ts index 96ab12effe44..73311918fca7 100644 --- a/extensions/qa-lab/src/cli.runtime.test.ts +++ b/extensions/qa-lab/src/cli.runtime.test.ts @@ -622,7 +622,7 @@ describe("qa cli runtime", () => { repoRoot: "/tmp/openclaw-repo", providerMode: "mock-openai", primaryModel: "openai/gpt-5.5", - alternateModel: "anthropic/claude-opus-4-6", + alternateModel: "anthropic/claude-opus-4-7", preflight: true, }); @@ -632,7 +632,7 @@ describe("qa cli runtime", () => { transportId: "qa-channel", providerMode: "mock-openai", primaryModel: "openai/gpt-5.5", - alternateModel: "anthropic/claude-opus-4-6", + alternateModel: "anthropic/claude-opus-4-7", scenarioIds: ["approval-turn-tool-followthrough"], concurrency: 1, }); @@ -930,7 +930,7 @@ describe("qa cli runtime", () => { fast: true, thinking: "medium", modelThinking: ["codex-cli/test-model=medium"], - judgeModel: ["openai/gpt-5.5,thinking=xhigh,fast", "anthropic/claude-opus-4-6,thinking=high"], + judgeModel: ["openai/gpt-5.5,thinking=xhigh,fast", "anthropic/claude-opus-4-7,thinking=high"], judgeTimeoutMs: 180_000, blindJudgeModels: true, concurrency: 4, @@ -951,10 +951,10 @@ describe("qa cli runtime", () => { "openai/gpt-5.5": { thinkingDefault: "xhigh", fastMode: false }, "codex-cli/test-model": { thinkingDefault: "high", fastMode: true }, }, - judgeModels: ["openai/gpt-5.5", "anthropic/claude-opus-4-6"], + judgeModels: ["openai/gpt-5.5", "anthropic/claude-opus-4-7"], judgeModelOptions: { "openai/gpt-5.5": { thinkingDefault: "xhigh", fastMode: true }, - "anthropic/claude-opus-4-6": { thinkingDefault: "high" }, + "anthropic/claude-opus-4-7": { thinkingDefault: "high" }, }, judgeTimeoutMs: 180_000, judgeBlindModels: true, @@ -1285,7 +1285,7 @@ describe("qa cli runtime", () => { providerMode: "mock-openai", parityPack: "agentic", primaryModel: "openai/gpt-5.5", - alternateModel: "anthropic/claude-opus-4-6", + alternateModel: "anthropic/claude-opus-4-7", }); expect(runQaSuiteFromRuntime).toHaveBeenCalledWith({ @@ -1294,7 +1294,7 @@ describe("qa cli runtime", () => { transportId: "qa-channel", providerMode: "mock-openai", primaryModel: "openai/gpt-5.5", - alternateModel: "anthropic/claude-opus-4-6", + alternateModel: "anthropic/claude-opus-4-7", fastMode: undefined, scenarioIds: [ "approval-turn-tool-followthrough", diff --git a/extensions/qa-lab/src/cli.test.ts b/extensions/qa-lab/src/cli.test.ts index 6da79240107f..97bc91e7d274 100644 --- a/extensions/qa-lab/src/cli.test.ts +++ b/extensions/qa-lab/src/cli.test.ts @@ -346,9 +346,9 @@ describe("qa cli registration", () => { "--provider-mode", "live-frontier", "--model", - "openai/gpt-5.4", + "openai/gpt-5.5", "--alt-model", - "openai/gpt-5.4", + "openai/gpt-5.5", "--scenario", "slack-canary", "--credential-source", @@ -360,7 +360,7 @@ describe("qa cli registration", () => { ]); expect(runMantisSlackDesktopSmokeCommand).toHaveBeenCalledWith({ - alternateModel: "openai/gpt-5.4", + alternateModel: "openai/gpt-5.5", crabboxBin: "/tmp/crabbox", credentialRole: "maintainer", credentialSource: "env", @@ -371,7 +371,7 @@ describe("qa cli registration", () => { leaseId: "cbx_123abc", machineClass: "beast", outputDir: ".artifacts/qa-e2e/mantis/slack-desktop", - primaryModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", provider: "hetzner", providerMode: "live-frontier", repoRoot: "/tmp/openclaw-repo", diff --git a/extensions/qa-lab/src/gateway-log-sentinel.test.ts b/extensions/qa-lab/src/gateway-log-sentinel.test.ts index 8cde3a317efe..8f7d77ae9a98 100644 --- a/extensions/qa-lab/src/gateway-log-sentinel.test.ts +++ b/extensions/qa-lab/src/gateway-log-sentinel.test.ts @@ -16,7 +16,7 @@ describe("gateway log sentinels", () => { "[plugins] plugin must declare contracts.tools for: runtime_tool", "2026-05-13T00:00:04Z codex app-server attempt timed out after 180000ms", "2026-05-13T00:00:05Z codex_app_server progress stalled for run abc123", - "2026-05-13T00:00:06Z cron payload model openai/gpt-5.4 is not in model allowlist", + "2026-05-13T00:00:06Z cron payload model openai/gpt-5.5 is not in model allowlist", "2026-05-13T00:00:07Z OpenAI quota exceeded for live-frontier request", ].join("\n"), ); diff --git a/extensions/qa-lab/src/live-timeout.test.ts b/extensions/qa-lab/src/live-timeout.test.ts index c696a9c9e540..fa216cbc2980 100644 --- a/extensions/qa-lab/src/live-timeout.test.ts +++ b/extensions/qa-lab/src/live-timeout.test.ts @@ -8,7 +8,7 @@ describe("qa live timeout policy", () => { { providerMode: "mock-openai", primaryModel: "anthropic/claude-sonnet-4-6", - alternateModel: "anthropic/claude-opus-4-6", + alternateModel: "anthropic/claude-opus-4-7", }, 30_000, ), @@ -47,7 +47,7 @@ describe("qa live timeout policy", () => { { providerMode: "live-frontier", primaryModel: "anthropic/claude-sonnet-4-6", - alternateModel: "anthropic/claude-opus-4-6", + alternateModel: "anthropic/claude-opus-4-7", }, 30_000, ), @@ -60,10 +60,10 @@ describe("qa live timeout policy", () => { { providerMode: "live-frontier", primaryModel: "anthropic/claude-sonnet-4-6", - alternateModel: "anthropic/claude-opus-4-6", + alternateModel: "anthropic/claude-opus-4-7", }, 30_000, - "anthropic/claude-opus-4-6", + "anthropic/claude-opus-4-7", ), ).toBe(240_000); }); diff --git a/extensions/qa-lab/src/providers/live-frontier/character-eval.ts b/extensions/qa-lab/src/providers/live-frontier/character-eval.ts index 5019ff5b8dc2..89f5fa275496 100644 --- a/extensions/qa-lab/src/providers/live-frontier/character-eval.ts +++ b/extensions/qa-lab/src/providers/live-frontier/character-eval.ts @@ -9,7 +9,7 @@ export const QA_FRONTIER_CHARACTER_EVAL_MODELS = Object.freeze([ "openai/gpt-5.5", "openai/gpt-5.2", "openai/gpt-5", - "anthropic/claude-opus-4-6", + "anthropic/claude-opus-4-7", "anthropic/claude-sonnet-4-6", "zai/glm-5.1", "moonshot/kimi-k2.5", @@ -25,12 +25,12 @@ export const QA_FRONTIER_CHARACTER_THINKING_BY_MODEL: Readonly > = Object.freeze({ "openai/gpt-5.5": { thinkingDefault: "xhigh", fastMode: true }, - "anthropic/claude-opus-4-6": { thinkingDefault: "high" }, + "anthropic/claude-opus-4-7": { thinkingDefault: "high" }, }); diff --git a/extensions/qa-lab/src/providers/live-frontier/parity.ts b/extensions/qa-lab/src/providers/live-frontier/parity.ts index 62bcd5556ce1..a0874010b6d0 100644 --- a/extensions/qa-lab/src/providers/live-frontier/parity.ts +++ b/extensions/qa-lab/src/providers/live-frontier/parity.ts @@ -1,2 +1,2 @@ export const QA_FRONTIER_PARITY_CANDIDATE_LABEL = "openai/gpt-5.5"; -export const QA_FRONTIER_PARITY_BASELINE_LABEL = "anthropic/claude-opus-4-6"; +export const QA_FRONTIER_PARITY_BASELINE_LABEL = "anthropic/claude-opus-4-7"; diff --git a/extensions/qa-lab/src/providers/mock-openai/server.test.ts b/extensions/qa-lab/src/providers/mock-openai/server.test.ts index ed6cd9dddb2e..db2f56cebbe2 100644 --- a/extensions/qa-lab/src/providers/mock-openai/server.test.ts +++ b/extensions/qa-lab/src/providers/mock-openai/server.test.ts @@ -2727,7 +2727,7 @@ describe("qa mock openai server", () => { headers: { "content-type": "application/json" }, body: JSON.stringify({ stream: false, - model: "mock-openai/gpt-5.4", + model: "mock-openai/gpt-5.5", input: [ { role: "user", @@ -2783,7 +2783,7 @@ describe("qa mock openai server", () => { headers: { "content-type": "application/json" }, body: JSON.stringify({ stream: false, - model: "mock-openai/gpt-5.4", + model: "mock-openai/gpt-5.5", input: [ { role: "user", @@ -2956,7 +2956,7 @@ describe("qa mock openai server", () => { expect(outputText(await response.json())).toBe("NO_REPLY"); }); - it("advertises Anthropic claude-opus-4-6 baseline model on /v1/models", async () => { + it("advertises Anthropic claude-opus-4-7 baseline model on /v1/models", async () => { const server = await startQaMockOpenAiServer({ host: "127.0.0.1", port: 0, @@ -2969,7 +2969,7 @@ describe("qa mock openai server", () => { expect(response.status).toBe(200); const body = (await response.json()) as { data: Array<{ id: string }> }; const ids = body.data.map((entry) => entry.id); - expect(ids).toContain("claude-opus-4-6"); + expect(ids).toContain("claude-opus-4-7"); expect(ids).toContain("gpt-5.5"); }); @@ -2986,7 +2986,7 @@ describe("qa mock openai server", () => { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ - model: "claude-opus-4-6", + model: "claude-opus-4-7", max_tokens: 256, messages: [ { @@ -3011,7 +3011,7 @@ describe("qa mock openai server", () => { }; expect(body.type).toBe("message"); expect(body.role).toBe("assistant"); - expect(body.model).toBe("claude-opus-4-6"); + expect(body.model).toBe("claude-opus-4-7"); expect(body.stop_reason).toBe("tool_use"); const toolUseBlock = body.content.find((block) => block.type === "tool_use") as | { name: string; input: Record } @@ -3022,7 +3022,7 @@ describe("qa mock openai server", () => { const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`); expect(debugResponse.status).toBe(200); const debugPayload = requireRecord(await debugResponse.json(), "debug request"); - expect(debugPayload.model).toBe("claude-opus-4-6"); + expect(debugPayload.model).toBe("claude-opus-4-7"); expect(debugPayload.plannedToolName).toBe("read"); }); @@ -3033,7 +3033,7 @@ describe("qa mock openai server", () => { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ - model: "claude-opus-4-6", + model: "claude-opus-4-7", max_tokens: 256, tools: [ { @@ -3073,7 +3073,7 @@ describe("qa mock openai server", () => { const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`); expect(debugResponse.status).toBe(200); const debugPayload = requireRecord(await debugResponse.json(), "debug request"); - expect(debugPayload.model).toBe("claude-opus-4-6"); + expect(debugPayload.model).toBe("claude-opus-4-7"); expect(debugPayload.plannedToolName).toBe("sessions_spawn"); }); @@ -3097,7 +3097,7 @@ describe("qa mock openai server", () => { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ - model: "claude-opus-4-6", + model: "claude-opus-4-7", max_tokens: 256, messages: [ { @@ -3171,7 +3171,7 @@ describe("qa mock openai server", () => { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ - model: "claude-opus-4-6", + model: "claude-opus-4-7", max_tokens: 256, messages: [ { @@ -3252,7 +3252,7 @@ describe("qa mock openai server", () => { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ - model: "claude-opus-4-6", + model: "claude-opus-4-7", max_tokens: 256, stream: true, messages: [ @@ -3293,7 +3293,7 @@ describe("qa mock openai server", () => { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ - model: "claude-opus-4-6", + model: "claude-opus-4-7", max_tokens: 256, stream: true, messages: [ @@ -3352,7 +3352,7 @@ describe("qa mock openai server", () => { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ - model: "claude-opus-4-6", + model: "claude-opus-4-7", max_tokens: 256, stream: true, system: [ @@ -3395,7 +3395,7 @@ describe("qa mock openai server", () => { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ - model: "claude-opus-4-6", + model: "claude-opus-4-7", max_tokens: 256, stream: true, system: [ @@ -3440,7 +3440,7 @@ describe("qa mock openai server", () => { const response = await fetch(`${server.baseUrl}/v1/messages`, { method: "POST", headers: { "content-type": "application/json" }, - body: '{"model":"claude-opus-4-6","messages":[', + body: '{"model":"claude-opus-4-7","messages":[', }); expect(response.status).toBe(400); @@ -3453,12 +3453,12 @@ describe("qa mock openai server", () => { expect(body.error.message).toContain("Malformed JSON body"); }); - it("defaults empty-string Anthropic /v1/messages model to claude-opus-4-6", async () => { + it("defaults empty-string Anthropic /v1/messages model to claude-opus-4-7", async () => { // Regression for the loop-7 Copilot finding: a bare `typeof // body.model === "string"` check lets an empty-string model leak // through to `lastRequest.model` and `responseBody.model`. Empty // strings must be treated the same as absent and default to - // `"claude-opus-4-6"` so parity consumers can trust the echoed label. + // `"claude-opus-4-7"` so parity consumers can trust the echoed label. const server = await startQaMockOpenAiServer({ host: "127.0.0.1", port: 0, @@ -3483,12 +3483,12 @@ describe("qa mock openai server", () => { }); expect(response.status).toBe(200); const body = (await response.json()) as { model: string }; - expect(body.model).toBe("claude-opus-4-6"); + expect(body.model).toBe("claude-opus-4-7"); const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`); expect(debugResponse.status).toBe(200); const debug = (await debugResponse.json()) as { model: string }; - expect(debug.model).toBe("claude-opus-4-6"); + expect(debug.model).toBe("claude-opus-4-7"); }); it("scripts a reasoning-only recovery sequence after a replay-safe read", async () => { @@ -3711,9 +3711,9 @@ describe("resolveProviderVariant", () => { }); it("tags prefix-qualified anthropic models", () => { - expect(resolveProviderVariant("anthropic/claude-opus-4-6")).toBe("anthropic"); - expect(resolveProviderVariant("anthropic:claude-opus-4-6")).toBe("anthropic"); - expect(resolveProviderVariant("claude-cli/claude-opus-4-6")).toBe("anthropic"); + expect(resolveProviderVariant("anthropic/claude-opus-4-7")).toBe("anthropic"); + expect(resolveProviderVariant("anthropic:claude-opus-4-7")).toBe("anthropic"); + expect(resolveProviderVariant("claude-cli/claude-opus-4-7")).toBe("anthropic"); }); it("tags bare model names by prefix", () => { @@ -3721,7 +3721,7 @@ describe("resolveProviderVariant", () => { expect(resolveProviderVariant("gpt-5.5-alt")).toBe("openai"); expect(resolveProviderVariant("gpt-4.5")).toBe("openai"); expect(resolveProviderVariant("o1-preview")).toBe("openai"); - expect(resolveProviderVariant("claude-opus-4-6")).toBe("anthropic"); + expect(resolveProviderVariant("claude-opus-4-7")).toBe("anthropic"); expect(resolveProviderVariant("claude-sonnet-4-6")).toBe("anthropic"); }); @@ -3779,7 +3779,7 @@ describe("qa mock openai server provider variant tagging", () => { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ - model: "claude-opus-4-6", + model: "claude-opus-4-7", max_tokens: 256, messages: [{ role: "user", content: "Heartbeat check" }], }), @@ -3789,7 +3789,7 @@ describe("qa mock openai server provider variant tagging", () => { model: string; providerVariant: string; }; - expect(debug.model).toBe("claude-opus-4-6"); + expect(debug.model).toBe("claude-opus-4-7"); expect(debug.providerVariant).toBe("anthropic"); }); diff --git a/extensions/qa-lab/src/providers/mock-openai/server.ts b/extensions/qa-lab/src/providers/mock-openai/server.ts index 5eecb8edb270..be831461d038 100644 --- a/extensions/qa-lab/src/providers/mock-openai/server.ts +++ b/extensions/qa-lab/src/providers/mock-openai/server.ts @@ -81,7 +81,7 @@ export function resolveProviderVariant(model: string | undefined): MockOpenAiPro return "anthropic"; } // Fall back to model-name prefix matching for bare model strings like - // `gpt-5.5` or `claude-opus-4-6`. + // `gpt-5.5` or `claude-opus-4-7`. if (/^(?:gpt-|o1-|openai-)/.test(trimmed)) { return "openai"; } @@ -2161,7 +2161,7 @@ async function buildResponsesPayload( // // The QA parity gate needs two comparable scenario runs: one against the // "candidate" (openai/gpt-5.5) and one against the "baseline" -// (anthropic/claude-opus-4-6). The OpenAI mock above already dispatches all +// (anthropic/claude-opus-4-7). The OpenAI mock above already dispatches all // the scenario prompt branches we care about. Rather than duplicating that // machinery, the /v1/messages route below translates Anthropic request // shapes into the shared ResponsesInputItem[] format, calls the same @@ -2384,7 +2384,7 @@ function buildAnthropicMessageResponse(params: { id: `msg_mock_${Math.floor(Math.random() * 1_000_000).toString(16)}`, type: "message", role: "assistant", - model: params.model || "claude-opus-4-6", + model: params.model || "claude-opus-4-7", content, stop_reason: stopReason, stop_sequence: null, @@ -2412,7 +2412,7 @@ function buildAnthropicMessageStreamEvents(params: { id: messageId, type: "message", role: "assistant", - model: params.model || "claude-opus-4-6", + model: params.model || "claude-opus-4-7", content: [], stop_reason: null, stop_sequence: null, @@ -2511,7 +2511,7 @@ async function buildMessagesPayload( // which then confuses parity consumers that assume the mock always // echoes the real provider label. Normalize once and reuse everywhere. const normalizedModel = - typeof body.model === "string" && body.model.trim() !== "" ? body.model : "claude-opus-4-6"; + typeof body.model === "string" && body.model.trim() !== "" ? body.model : "claude-opus-4-7"; // Dispatch through the same scenario logic the /v1/responses route uses. // Preserve declared tools so route-specific adapters mirror what the // real provider request made available to the model. @@ -2556,7 +2556,7 @@ export async function startQaMockOpenAiServer(params?: { host?: string; port?: n { id: "gpt-5.5-alt", object: "model" }, { id: "gpt-image-1", object: "model" }, { id: "text-embedding-3-small", object: "model" }, - { id: "claude-opus-4-6", object: "model" }, + { id: "claude-opus-4-7", object: "model" }, { id: "claude-sonnet-4-6", object: "model" }, ], }); diff --git a/extensions/qa-lab/src/providers/shared/mock-model-config.ts b/extensions/qa-lab/src/providers/shared/mock-model-config.ts index 58e2eab15d50..308b7f23507f 100644 --- a/extensions/qa-lab/src/providers/shared/mock-model-config.ts +++ b/extensions/qa-lab/src/providers/shared/mock-model-config.ts @@ -71,8 +71,8 @@ function createMockAnthropicMessagesProvider(baseUrl: string): ModelProviderConf }, models: [ { - id: "claude-opus-4-6", - name: "claude-opus-4-6", + id: "claude-opus-4-7", + name: "claude-opus-4-7", api: "anthropic-messages", reasoning: false, input: ["text", "image"], diff --git a/extensions/qa-lab/src/qa-gateway-config.test.ts b/extensions/qa-lab/src/qa-gateway-config.test.ts index 1429ddd88699..42cbc0b6ba40 100644 --- a/extensions/qa-lab/src/qa-gateway-config.test.ts +++ b/extensions/qa-lab/src/qa-gateway-config.test.ts @@ -90,7 +90,7 @@ describe("buildQaGatewayConfig", () => { workspaceDir: "/tmp/qa-workspace", providerMode: "mock-openai", primaryModel: "openai/gpt-5.5", - alternateModel: "anthropic/claude-opus-4-6", + alternateModel: "anthropic/claude-opus-4-7", }); expect(getPrimaryModel(cfg.agents?.defaults?.model)).toBe("openai/gpt-5.5"); @@ -101,7 +101,7 @@ describe("buildQaGatewayConfig", () => { expect(cfg.models?.providers?.anthropic?.baseUrl).toBe("http://127.0.0.1:44080"); expect(cfg.models?.providers?.anthropic?.request).toEqual({ allowPrivateNetwork: true }); expect(cfg.models?.providers?.anthropic?.models.map((model) => model.id)).toContain( - "claude-opus-4-6", + "claude-opus-4-7", ); expect(cfg.plugins?.allow).toEqual(["acpx", "memory-core"]); }); diff --git a/extensions/qa-lab/src/suite-planning.test.ts b/extensions/qa-lab/src/suite-planning.test.ts index 6a3ffa6f8af8..32b0604e8294 100644 --- a/extensions/qa-lab/src/suite-planning.test.ts +++ b/extensions/qa-lab/src/suite-planning.test.ts @@ -174,7 +174,7 @@ describe("qa suite planning helpers", () => { makeQaSuiteTestScenario("anthropic-only", { config: { requiredProvider: "anthropic", - requiredModel: "claude-opus-4-6", + requiredModel: "claude-opus-4-7", }, }), ]; @@ -320,7 +320,7 @@ describe("qa suite planning helpers", () => { config: { requiredProvider: "openai", requiredModel: "gpt-5.5" }, }), makeQaSuiteTestScenario("anthropic-only", { - config: { requiredProvider: "anthropic", requiredModel: "claude-opus-4-6" }, + config: { requiredProvider: "anthropic", requiredModel: "claude-opus-4-7" }, }), makeQaSuiteTestScenario("claude-subscription", { config: { requiredProvider: "claude-cli", authMode: "subscription" }, diff --git a/extensions/qa-lab/src/suite.summary-json.test.ts b/extensions/qa-lab/src/suite.summary-json.test.ts index 632e448933a9..dead9540c7d8 100644 --- a/extensions/qa-lab/src/suite.summary-json.test.ts +++ b/extensions/qa-lab/src/suite.summary-json.test.ts @@ -67,12 +67,12 @@ describe("buildQaSuiteSummaryJson", () => { it("records an Anthropic baseline lane cleanly for parity runs", () => { const json = buildQaSuiteSummaryJson({ ...baseParams, - primaryModel: "anthropic/claude-opus-4-6", + primaryModel: "anthropic/claude-opus-4-7", alternateModel: "anthropic/claude-sonnet-4-6", }); - expect(json.run.primaryModel).toBe("anthropic/claude-opus-4-6"); + expect(json.run.primaryModel).toBe("anthropic/claude-opus-4-7"); expect(json.run.primaryProvider).toBe("anthropic"); - expect(json.run.primaryModelName).toBe("claude-opus-4-6"); + expect(json.run.primaryModelName).toBe("claude-opus-4-7"); expect(json.run.alternateModel).toBe("anthropic/claude-sonnet-4-6"); expect(json.run.alternateProvider).toBe("anthropic"); expect(json.run.alternateModelName).toBe("claude-sonnet-4-6"); diff --git a/qa/scenarios/models/anthropic-opus-api-key-smoke.md b/qa/scenarios/models/anthropic-opus-api-key-smoke.md index 21b1f993171c..d13b97c921fe 100644 --- a/qa/scenarios/models/anthropic-opus-api-key-smoke.md +++ b/qa/scenarios/models/anthropic-opus-api-key-smoke.md @@ -12,7 +12,7 @@ coverage: objective: Verify the regular Anthropic Opus lane can complete a quick chat turn using API-key auth. successCriteria: - A live-frontier run fails fast unless the selected primary provider is anthropic. - - The selected primary model is Anthropic Opus 4.6. + - The selected primary model is Anthropic Opus 4.7. - The QA gateway worker has an Anthropic API key available through environment auth. - The agent replies through the regular Anthropic provider. docsRefs: @@ -24,10 +24,10 @@ codeRefs: - extensions/qa-lab/src/suite.ts execution: kind: flow - summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model anthropic/claude-opus-4-6 --alt-model anthropic/claude-opus-4-6 --scenario anthropic-opus-api-key-smoke`. + summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model anthropic/claude-opus-4-7 --alt-model anthropic/claude-opus-4-7 --scenario anthropic-opus-api-key-smoke`. config: requiredProvider: anthropic - requiredModel: claude-opus-4-6 + requiredModel: claude-opus-4-7 chatPrompt: "Anthropic Opus API key smoke. Reply exactly: ANTHROPIC-OPUS-API-KEY-OK" chatExpected: ANTHROPIC-OPUS-API-KEY-OK ``` diff --git a/qa/scenarios/models/anthropic-opus-setup-token-smoke.md b/qa/scenarios/models/anthropic-opus-setup-token-smoke.md index 231403d1e7c4..a67997ef4e6f 100644 --- a/qa/scenarios/models/anthropic-opus-setup-token-smoke.md +++ b/qa/scenarios/models/anthropic-opus-setup-token-smoke.md @@ -12,7 +12,7 @@ coverage: objective: Verify the regular Anthropic Opus lane can complete a quick chat turn using setup-token auth. successCriteria: - A live-frontier run fails fast unless the selected primary provider is anthropic. - - The selected primary model is Anthropic Opus 4.6. + - The selected primary model is Anthropic Opus 4.7. - The QA gateway worker stages a token auth profile in the isolated agent store. - The agent replies through the regular Anthropic provider. docsRefs: @@ -24,10 +24,10 @@ codeRefs: - extensions/qa-lab/src/suite.ts execution: kind: flow - summary: Run with `OPENCLAW_LIVE_SETUP_TOKEN_VALUE= pnpm openclaw qa suite --provider-mode live-frontier --model anthropic/claude-opus-4-6 --alt-model anthropic/claude-opus-4-6 --scenario anthropic-opus-setup-token-smoke`. + summary: Run with `OPENCLAW_LIVE_SETUP_TOKEN_VALUE= pnpm openclaw qa suite --provider-mode live-frontier --model anthropic/claude-opus-4-7 --alt-model anthropic/claude-opus-4-7 --scenario anthropic-opus-setup-token-smoke`. config: requiredProvider: anthropic - requiredModel: claude-opus-4-6 + requiredModel: claude-opus-4-7 profileId: "anthropic:qa-setup-token" chatPrompt: "Anthropic Opus setup-token smoke. Reply exactly: ANTHROPIC-OPUS-SETUP-TOKEN-OK" chatExpected: ANTHROPIC-OPUS-SETUP-TOKEN-OK diff --git a/scripts/openclaw-cross-os-release-checks.ts b/scripts/openclaw-cross-os-release-checks.ts index e1f3c7056d54..086afd3f14a2 100644 --- a/scripts/openclaw-cross-os-release-checks.ts +++ b/scripts/openclaw-cross-os-release-checks.ts @@ -46,7 +46,7 @@ const providerConfig = { extensionId: "openai", secretEnv: "OPENAI_API_KEY", authChoice: "openai-api-key", - model: "openai/gpt-5.4", + model: "openai/gpt-5.5", baseUrl: "https://api.openai.com/v1", timeoutSeconds: CROSS_OS_AGENT_TURN_TIMEOUT_SECONDS, }, diff --git a/src/infra/run-node.test.ts b/src/infra/run-node.test.ts index 30211b0e0cf1..8c8e553b0523 100644 --- a/src/infra/run-node.test.ts +++ b/src/infra/run-node.test.ts @@ -1047,9 +1047,9 @@ describe("run-node script", () => { "qa", "parity-report", "--candidate-summary", - ".artifacts/qa-e2e/gpt54/qa-suite-summary.json", + ".artifacts/qa-e2e/openai-candidate/qa-suite-summary.json", "--baseline-summary", - ".artifacts/qa-e2e/opus46/qa-suite-summary.json", + ".artifacts/qa-e2e/anthropic-baseline/qa-suite-summary.json", ], env: { ...process.env, @@ -1068,9 +1068,9 @@ describe("run-node script", () => { "tsx", path.join(tmp, "scripts", "qa-parity-report.ts"), "--candidate-summary", - ".artifacts/qa-e2e/gpt54/qa-suite-summary.json", + ".artifacts/qa-e2e/openai-candidate/qa-suite-summary.json", "--baseline-summary", - ".artifacts/qa-e2e/opus46/qa-suite-summary.json", + ".artifacts/qa-e2e/anthropic-baseline/qa-suite-summary.json", ], ]); }); diff --git a/test/helpers/auto-reply/trigger-handling-test-harness.ts b/test/helpers/auto-reply/trigger-handling-test-harness.ts index 52112ff96a45..3b1bf9bf7b0f 100644 --- a/test/helpers/auto-reply/trigger-handling-test-harness.ts +++ b/test/helpers/auto-reply/trigger-handling-test-harness.ts @@ -100,17 +100,17 @@ const modelCatalogMocks = getSharedMocks("openclaw.trigger-handling.model-catalo loadModelCatalog: vi.fn().mockResolvedValue([ { provider: "anthropic", - id: "claude-opus-4-6", - name: "Claude Opus 4.5", + id: "claude-opus-4-7", + name: "Claude Opus 4.7", contextWindow: 200000, }, { provider: "openrouter", - id: "anthropic/claude-opus-4-6", - name: "Claude Opus 4.5 (OpenRouter)", + id: "anthropic/claude-opus-4-7", + name: "Claude Opus 4.7 (OpenRouter)", contextWindow: 200000, }, - { provider: "openai", id: "gpt-5.4-mini", name: "GPT-5.4 mini" }, + { provider: "openai", id: "gpt-5.5-mini", name: "GPT-5.5 mini" }, { provider: "openai", id: "gpt-5.5", name: "GPT-5.5" }, { provider: "openai-codex", id: "gpt-5.5", name: "GPT-5.5 (Codex)" }, { provider: "minimax", id: "MiniMax-M2.7", name: "MiniMax M2.7" }, @@ -284,7 +284,7 @@ export function makeCfg(home: string): OpenClawConfig { return withFastReplyConfig({ agents: { defaults: { - model: { primary: "anthropic/claude-opus-4-6" }, + model: { primary: "anthropic/claude-opus-4-7" }, workspace: join(home, "openclaw"), // Test harness: avoid 1s coalescer idle sleeps that dominate trigger suites. blockStreamingCoalesce: { idleMs: 1 }, diff --git a/test/scripts/openclaw-cross-os-release-checks.test.ts b/test/scripts/openclaw-cross-os-release-checks.test.ts index 3294afb54d44..2a1f7d949d2d 100644 --- a/test/scripts/openclaw-cross-os-release-checks.test.ts +++ b/test/scripts/openclaw-cross-os-release-checks.test.ts @@ -205,10 +205,10 @@ describe("scripts/openclaw-cross-os-release-checks", () => { OPENCLAW_CROSS_OS_MODEL: "openai/gpt-5.4-nano", })?.model, ).toBe("openai/gpt-5.4-nano"); - expect(resolveProviderConfig("openai", {})?.model).toBe("openai/gpt-5.4"); + expect(resolveProviderConfig("openai", {})?.model).toBe("openai/gpt-5.5"); }); - it("keeps release cross-OS OpenAI smoke on GPT-5.4", () => { + it("keeps release cross-OS OpenAI smoke on GPT-5.5", () => { const workflow = readFileSync( ".github/workflows/openclaw-cross-os-release-checks-reusable.yml", "utf8", @@ -216,9 +216,9 @@ describe("scripts/openclaw-cross-os-release-checks", () => { const releaseChecks = readFileSync(".github/workflows/openclaw-release-checks.yml", "utf8"); expect(workflow).toContain( - "OPENCLAW_CROSS_OS_OPENAI_MODEL: ${{ inputs.openai_model || vars.OPENCLAW_CROSS_OS_OPENAI_MODEL || 'openai/gpt-5.4' }}", + "OPENCLAW_CROSS_OS_OPENAI_MODEL: ${{ inputs.openai_model || vars.OPENCLAW_CROSS_OS_OPENAI_MODEL || 'openai/gpt-5.5' }}", ); - expect(releaseChecks).toContain("openai_model: openai/gpt-5.4"); + expect(releaseChecks).toContain("openai_model: openai/gpt-5.5"); }); it("keeps release smoke plugin allowlists focused on agent-turn essentials", () => { diff --git a/test/scripts/package-acceptance-workflow.test.ts b/test/scripts/package-acceptance-workflow.test.ts index fe5f3b017cec..d14a0132935e 100644 --- a/test/scripts/package-acceptance-workflow.test.ts +++ b/test/scripts/package-acceptance-workflow.test.ts @@ -413,7 +413,7 @@ describe("package artifact reuse", () => { expect(workflow).toContain("suite_id: native-live-src-gateway-profiles-anthropic-opus"); expect(workflow).toContain("suite_id: native-live-src-gateway-profiles-anthropic-sonnet-haiku"); expect(workflow).toContain("suite_group: native-live-src-gateway-profiles-anthropic"); - expect(workflow).toContain("anthropic/claude-opus-4-7,anthropic/claude-opus-4-6"); + expect(workflow).toContain("OPENCLAW_LIVE_GATEWAY_MODELS=anthropic/claude-opus-4-7"); expect(workflow).toContain("anthropic/claude-sonnet-4-6,anthropic/claude-haiku-4-5"); expect(workflow).toMatch( /suite_id: native-live-src-gateway-profiles-fireworks[\s\S]*?advisory: true/u,