mirror of
https://github.com/openclaw/openclaw.git
synced 2026-06-06 05:51:15 +08:00
fix(qa-lab): refresh parity model targets
This commit is contained in:
@@ -138,7 +138,7 @@ jobs:
|
||||
OPENAI_API_KEY: ${{ secrets.OPENCLAW_DOCS_I18N_OPENAI_API_KEY || secrets.OPENAI_API_KEY }}
|
||||
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
||||
OPENCLAW_CONTROL_UI_I18N_PROVIDER: ${{ secrets.ANTHROPIC_API_KEY != '' && 'anthropic' || 'openai' }}
|
||||
OPENCLAW_CONTROL_UI_I18N_MODEL: ${{ secrets.ANTHROPIC_API_KEY != '' && 'claude-opus-4-6' || vars.OPENCLAW_CI_OPENAI_MODEL_BARE }}
|
||||
OPENCLAW_CONTROL_UI_I18N_MODEL: ${{ secrets.ANTHROPIC_API_KEY != '' && 'claude-opus-4-7' || vars.OPENCLAW_CI_OPENAI_MODEL_BARE }}
|
||||
OPENCLAW_CONTROL_UI_I18N_THINKING: low
|
||||
OPENCLAW_CONTROL_UI_I18N_AUTH_OPTIONAL: "1"
|
||||
LOCALE: ${{ matrix.locale }}
|
||||
|
||||
@@ -349,8 +349,8 @@ jobs:
|
||||
--repo-root "$repo_root" \
|
||||
--output-dir "$output_dir" \
|
||||
--provider-mode live-frontier \
|
||||
--model openai/gpt-5.4 \
|
||||
--alt-model openai/gpt-5.4 \
|
||||
--model openai/gpt-5.5 \
|
||||
--alt-model openai/gpt-5.5 \
|
||||
--fast \
|
||||
--credential-source convex \
|
||||
--credential-role ci \
|
||||
|
||||
@@ -281,8 +281,8 @@ jobs:
|
||||
--credential-role ci \
|
||||
--provider-mode live-frontier \
|
||||
--hydrate-mode "$HYDRATE_MODE" \
|
||||
--model openai/gpt-5.4 \
|
||||
--alt-model openai/gpt-5.4 \
|
||||
--model openai/gpt-5.5 \
|
||||
--alt-model openai/gpt-5.5 \
|
||||
--fast \
|
||||
--scenario "$SCENARIO_ID" \
|
||||
"${keep_args[@]}" \
|
||||
|
||||
2
.github/workflows/mantis-telegram-live.yml
vendored
2
.github/workflows/mantis-telegram-live.yml
vendored
@@ -386,7 +386,7 @@ jobs:
|
||||
output_rel=".artifacts/qa-e2e/mantis/telegram-live"
|
||||
root="$candidate_repo/$output_rel"
|
||||
echo "output_dir=${root}" >> "$GITHUB_OUTPUT"
|
||||
model="${OPENCLAW_CI_OPENAI_MODEL:-openai/gpt-5.4}"
|
||||
model="${OPENCLAW_CI_OPENAI_MODEL:-openai/gpt-5.5}"
|
||||
|
||||
scenario_args=()
|
||||
if [[ -n "${SCENARIO_INPUT// }" ]]; then
|
||||
|
||||
@@ -186,7 +186,7 @@ env:
|
||||
PNPM_VERSION: "11.0.8"
|
||||
OPENCLAW_REPOSITORY: openclaw/openclaw
|
||||
TSX_VERSION: "4.21.0"
|
||||
OPENCLAW_CROSS_OS_OPENAI_MODEL: ${{ inputs.openai_model || vars.OPENCLAW_CROSS_OS_OPENAI_MODEL || 'openai/gpt-5.4' }}
|
||||
OPENCLAW_CROSS_OS_OPENAI_MODEL: ${{ inputs.openai_model || vars.OPENCLAW_CROSS_OS_OPENAI_MODEL || 'openai/gpt-5.5' }}
|
||||
|
||||
jobs:
|
||||
prepare:
|
||||
|
||||
@@ -1911,7 +1911,7 @@ jobs:
|
||||
- suite_id: native-live-src-gateway-profiles-anthropic-opus
|
||||
suite_group: native-live-src-gateway-profiles-anthropic
|
||||
label: Native live gateway profiles Anthropic Opus
|
||||
command: OPENCLAW_LIVE_GATEWAY_PROVIDERS=anthropic OPENCLAW_LIVE_GATEWAY_MODELS=anthropic/claude-opus-4-7,anthropic/claude-opus-4-6 node .release-harness/scripts/test-live-shard.mjs native-live-src-gateway-profiles
|
||||
command: OPENCLAW_LIVE_GATEWAY_PROVIDERS=anthropic OPENCLAW_LIVE_GATEWAY_MODELS=anthropic/claude-opus-4-7 node .release-harness/scripts/test-live-shard.mjs native-live-src-gateway-profiles
|
||||
timeout_minutes: 30
|
||||
profile_env_only: false
|
||||
advisory: true
|
||||
|
||||
18
.github/workflows/openclaw-performance.yml
vendored
18
.github/workflows/openclaw-performance.yml
vendored
@@ -30,8 +30,8 @@ on:
|
||||
required: false
|
||||
default: false
|
||||
type: boolean
|
||||
live_gpt54:
|
||||
description: Run the live OpenAI GPT 5.4 agent-turn lane
|
||||
live_openai_candidate:
|
||||
description: Run the live OpenAI GPT 5.5 agent-turn lane
|
||||
required: false
|
||||
default: false
|
||||
type: boolean
|
||||
@@ -57,7 +57,7 @@ env:
|
||||
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"
|
||||
OCM_VERSION: v0.2.15
|
||||
KOVA_REPOSITORY: openclaw/Kova
|
||||
PERFORMANCE_MODEL_ID: gpt-5.4
|
||||
PERFORMANCE_MODEL_ID: gpt-5.5
|
||||
|
||||
jobs:
|
||||
kova:
|
||||
@@ -82,8 +82,8 @@ jobs:
|
||||
deep_profile: "true"
|
||||
live: "false"
|
||||
include_filters: "scenario:fresh-install scenario:gateway-performance scenario:agent-cold-warm-message"
|
||||
- lane: live-gpt54
|
||||
title: Kova live OpenAI GPT 5.4 agent turn
|
||||
- lane: live-openai-candidate
|
||||
title: Kova live OpenAI GPT 5.5 agent turn
|
||||
auth: live
|
||||
repeat: "1"
|
||||
deep_profile: "false"
|
||||
@@ -119,9 +119,9 @@ jobs:
|
||||
run_lane=false
|
||||
reason="deep_profile input is false"
|
||||
fi
|
||||
if [[ "$LANE_ID" == "live-gpt54" && "${{ github.event_name }}" != "schedule" && "${{ inputs.live_gpt54 || 'false' }}" != "true" ]]; then
|
||||
if [[ "$LANE_ID" == "live-openai-candidate" && "${{ github.event_name }}" != "schedule" && "${{ inputs.live_openai_candidate || 'false' }}" != "true" ]]; then
|
||||
run_lane=false
|
||||
reason="live_gpt54 input is false"
|
||||
reason="live_openai_candidate input is false"
|
||||
fi
|
||||
echo "run=$run_lane" >> "$GITHUB_OUTPUT"
|
||||
if [[ "$run_lane" != "true" ]]; then
|
||||
@@ -200,7 +200,7 @@ jobs:
|
||||
chmod 0755 "$HOME/.local/bin/kova"
|
||||
echo "$HOME/.local/bin" >> "$GITHUB_PATH"
|
||||
|
||||
- name: Pin Kova OpenAI model to GPT 5.4
|
||||
- name: Pin Kova OpenAI model to GPT 5.5
|
||||
if: steps.lane.outputs.run == 'true'
|
||||
shell: bash
|
||||
run: |
|
||||
@@ -244,7 +244,7 @@ jobs:
|
||||
run: |
|
||||
set -euo pipefail
|
||||
if [[ -z "${OPENAI_API_KEY:-}" ]]; then
|
||||
echo "OPENAI_API_KEY is not configured; live GPT 5.4 lane will be skipped." >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "OPENAI_API_KEY is not configured; live GPT 5.5 lane will be skipped." >> "$GITHUB_STEP_SUMMARY"
|
||||
exit 0
|
||||
fi
|
||||
kova setup --ci --json
|
||||
|
||||
12
.github/workflows/openclaw-release-checks.yml
vendored
12
.github/workflows/openclaw-release-checks.yml
vendored
@@ -542,7 +542,7 @@ jobs:
|
||||
candidate_file_name: openclaw-current.tgz
|
||||
candidate_version: ${{ needs.prepare_release_package.outputs.package_version }}
|
||||
candidate_source_sha: ${{ needs.prepare_release_package.outputs.source_sha }}
|
||||
openai_model: openai/gpt-5.4
|
||||
openai_model: openai/gpt-5.5
|
||||
ubuntu_runner: ubuntu-24.04
|
||||
windows_runner: windows-2025
|
||||
macos_runner: macos-26
|
||||
@@ -724,9 +724,9 @@ jobs:
|
||||
matrix:
|
||||
include:
|
||||
- lane: candidate
|
||||
output_dir: gpt54
|
||||
output_dir: openai-candidate
|
||||
- lane: baseline
|
||||
output_dir: opus46
|
||||
output_dir: anthropic-baseline
|
||||
env:
|
||||
QA_PARITY_CONCURRENCY: "1"
|
||||
OPENCLAW_QA_TRANSPORT_READY_TIMEOUT_MS: "180000"
|
||||
@@ -772,7 +772,7 @@ jobs:
|
||||
;;
|
||||
baseline)
|
||||
model="anthropic/claude-opus-4-7"
|
||||
alt_model="anthropic/claude-sonnet-4-7"
|
||||
alt_model="anthropic/claude-sonnet-4-6"
|
||||
;;
|
||||
*)
|
||||
echo "Unknown QA parity lane: ${QA_PARITY_LANE}" >&2
|
||||
@@ -841,8 +841,8 @@ jobs:
|
||||
run: |
|
||||
pnpm openclaw qa parity-report \
|
||||
--repo-root . \
|
||||
--candidate-summary .artifacts/qa-e2e/gpt54/qa-suite-summary.json \
|
||||
--baseline-summary .artifacts/qa-e2e/opus46/qa-suite-summary.json \
|
||||
--candidate-summary .artifacts/qa-e2e/openai-candidate/qa-suite-summary.json \
|
||||
--baseline-summary .artifacts/qa-e2e/anthropic-baseline/qa-suite-summary.json \
|
||||
--candidate-label "${OPENCLAW_CI_OPENAI_MODEL}" \
|
||||
--baseline-label anthropic/claude-opus-4-7 \
|
||||
--output-dir .artifacts/qa-e2e/parity
|
||||
|
||||
14
.github/workflows/qa-live-transports-convex.yml
vendored
14
.github/workflows/qa-live-transports-convex.yml
vendored
@@ -198,7 +198,7 @@ jobs:
|
||||
--concurrency "${QA_PARITY_CONCURRENCY}" \
|
||||
--model "${OPENCLAW_CI_OPENAI_MODEL}" \
|
||||
--alt-model openai/gpt-5.5-alt \
|
||||
--output-dir .artifacts/qa-e2e/gpt54
|
||||
--output-dir .artifacts/qa-e2e/openai-candidate
|
||||
|
||||
- name: Run Opus 4.7 lane
|
||||
run: |
|
||||
@@ -207,15 +207,15 @@ jobs:
|
||||
--parity-pack agentic \
|
||||
--concurrency "${QA_PARITY_CONCURRENCY}" \
|
||||
--model anthropic/claude-opus-4-7 \
|
||||
--alt-model anthropic/claude-sonnet-4-7 \
|
||||
--output-dir .artifacts/qa-e2e/opus46
|
||||
--alt-model anthropic/claude-sonnet-4-6 \
|
||||
--output-dir .artifacts/qa-e2e/anthropic-baseline
|
||||
|
||||
- name: Generate parity report
|
||||
run: |
|
||||
pnpm openclaw qa parity-report \
|
||||
--repo-root . \
|
||||
--candidate-summary .artifacts/qa-e2e/gpt54/qa-suite-summary.json \
|
||||
--baseline-summary .artifacts/qa-e2e/opus46/qa-suite-summary.json \
|
||||
--candidate-summary .artifacts/qa-e2e/openai-candidate/qa-suite-summary.json \
|
||||
--baseline-summary .artifacts/qa-e2e/anthropic-baseline/qa-suite-summary.json \
|
||||
--candidate-label "${OPENCLAW_CI_OPENAI_MODEL}" \
|
||||
--baseline-label anthropic/claude-opus-4-7 \
|
||||
--output-dir .artifacts/qa-e2e/parity
|
||||
@@ -565,8 +565,8 @@ jobs:
|
||||
--repo-root . \
|
||||
--output-dir "${output_dir}" \
|
||||
--provider-mode live-frontier \
|
||||
--model openai/gpt-5.4 \
|
||||
--alt-model openai/gpt-5.4 \
|
||||
--model openai/gpt-5.5 \
|
||||
--alt-model openai/gpt-5.5 \
|
||||
--fast \
|
||||
--credential-source convex \
|
||||
--credential-role ci \
|
||||
|
||||
@@ -29,6 +29,7 @@ Docs: https://docs.openclaw.ai
|
||||
- QA-Lab/qa-channel: attach redacted agent tool-start traces to outbound `QaBusMessage` records so scenarios can assert actual tool use instead of relying only on reply text. Fixes #67637. Thanks @100yenadmin.
|
||||
- QA-Lab: fail live runtime parity reports when assistant-message usage is missing, preventing `0 vs 0` live token rows from being reported as passing proof. Fixes #80411. Thanks @100yenadmin.
|
||||
- QA-Lab: fail Codex-backed OpenAI live runtime-pair runs before launching isolated workers when no portable Codex auth is available, while staging API-key fallbacks and configured Codex keys for isolated QA agents. Fixes #80412. Thanks @100yenadmin.
|
||||
- QA-Lab: refresh parity gates, mock frontier fixtures, model scenarios, and workflow artifact lanes to compare GPT-5.5 against Claude Opus 4.7. Fixes #74262. Thanks @100yenadmin.
|
||||
- QA-Lab: stop returning Control UI bearer tokens from unauthenticated bootstrap payloads and bind Docker harness ports to loopback-only host addresses. (#66355) Thanks @pgondhi987.
|
||||
- Mac app: avoid a SwiftUI metadata crash when rendering the Cron Jobs settings pane.
|
||||
- Agents/OpenAI streams: yield via `setTimeout(0)` instead of `setImmediate` between bursty Responses chunks so abort timers can fire during the yield, keeping cancel-on-timeout responsive on hot streams. Refs #82462.
|
||||
|
||||
@@ -35,7 +35,7 @@ OpenClaw CI runs on every push to `main` and every pull request. The `preflight`
|
||||
| `macos-swift` | Swift lint, build, and tests for the macOS app | macOS-relevant changes |
|
||||
| `android` | Android unit tests for both flavors plus one debug APK build | Android-relevant changes |
|
||||
| `test-performance-agent` | Daily Codex slow-test optimization after trusted activity | Main CI success or manual dispatch |
|
||||
| `openclaw-performance` | Daily/on-demand Kova runtime performance reports with mock-provider, deep-profile, and GPT 5.4 live lanes | Scheduled and manual dispatch |
|
||||
| `openclaw-performance` | Daily/on-demand Kova runtime performance reports with mock-provider, deep-profile, and GPT 5.5 live lanes | Scheduled and manual dispatch |
|
||||
|
||||
## Fail-fast order
|
||||
|
||||
@@ -138,7 +138,7 @@ pnpm perf:kova:summary --report .artifacts/kova/reports/mock-provider/report.jso
|
||||
|
||||
```bash
|
||||
gh workflow run openclaw-performance.yml --ref main -f profile=diagnostic -f repeat=3
|
||||
gh workflow run openclaw-performance.yml --ref main -f profile=smoke -f repeat=1 -f deep_profile=true -f live_gpt54=true
|
||||
gh workflow run openclaw-performance.yml --ref main -f profile=smoke -f repeat=1 -f deep_profile=true -f live_openai_candidate=true
|
||||
gh workflow run openclaw-performance.yml --ref main -f target_ref=v2026.5.2 -f profile=diagnostic -f repeat=3
|
||||
```
|
||||
|
||||
@@ -148,7 +148,7 @@ The workflow installs OCM from a pinned release and Kova from `openclaw/Kova` at
|
||||
|
||||
- `mock-provider`: Kova diagnostic scenarios against a local-build runtime with deterministic fake OpenAI-compatible auth.
|
||||
- `mock-deep-profile`: CPU/heap/trace profiling for startup, gateway, and agent-turn hotspots.
|
||||
- `live-gpt54`: a real OpenAI `openai/gpt-5.4` agent turn, skipped when `OPENAI_API_KEY` is unavailable.
|
||||
- `live-openai-candidate`: a real OpenAI `openai/gpt-5.5` agent turn, skipped when `OPENAI_API_KEY` is unavailable.
|
||||
|
||||
The mock-provider lane also runs OpenClaw-native source probes after the Kova pass: gateway boot timing and memory across default, hook, and 50-plugin startup cases; repeated mock-OpenAI `channel-chat-baseline` hello loops; and CLI startup commands against the booted gateway. The source probe Markdown summary lives at `source/index.md` in the report bundle, with raw JSON beside it.
|
||||
|
||||
@@ -269,7 +269,7 @@ For the dedicated update and plugin testing policy, including local commands,
|
||||
Docker lanes, Package Acceptance inputs, release defaults, and failure triage,
|
||||
see [Testing updates and plugins](/help/testing-updates-plugins).
|
||||
|
||||
Release checks call Package Acceptance with `source=artifact`, the prepared release package artifact, `suite_profile=custom`, `docker_lanes='doctor-switch update-channel-switch skill-install update-corrupt-plugin upgrade-survivor published-upgrade-survivor update-restart-auth plugins-offline plugin-update'`, and `telegram_mode=mock-openai`. This keeps package migration, update, live ClawHub skill install, stale-plugin-dependency cleanup, configured-plugin install repair, offline plugin, plugin-update, and Telegram proof on the same resolved package tarball. Set `release_package_spec` on Full Release Validation or OpenClaw Release Checks after publishing a beta to run the same matrix against the shipped npm package without rebuilding; set `package_acceptance_package_spec` only when Package Acceptance needs a different package from the rest of release validation. Cross-OS release checks still cover OS-specific onboarding, installer, and platform behavior; package/update product validation should start with Package Acceptance. The `published-upgrade-survivor` Docker lane validates one published package baseline per run in the blocking release path. In Package Acceptance, the resolved `package-under-test` tarball is always the candidate and `published_upgrade_survivor_baseline` selects the fallback published baseline, defaulting to `openclaw@latest`; failed-lane rerun commands preserve that baseline. Full Release Validation with `run_release_soak=true` or `release_profile=full` sets `published_upgrade_survivor_baselines='last-stable-4 2026.4.23 2026.5.2 2026.4.15'` and `published_upgrade_survivor_scenarios=reported-issues` to expand across the four latest stable npm releases plus pinned plugin-compatibility boundary releases and issue-shaped fixtures for Feishu config, preserved bootstrap/persona files, configured OpenClaw plugin installs, tilde log paths, and stale legacy plugin dependency roots. 
Multi-baseline published-upgrade survivor selections are sharded by baseline into separate targeted Docker runner jobs. The separate `Update Migration` workflow uses the `update-migration` Docker lane with `all-since-2026.4.23` and `plugin-deps-cleanup` when the question is exhaustive published update cleanup, not normal Full Release CI breadth. Local aggregate runs can pass exact package specs with `OPENCLAW_UPGRADE_SURVIVOR_BASELINE_SPECS`, keep a single lane with `OPENCLAW_UPGRADE_SURVIVOR_BASELINE_SPEC` such as `openclaw@2026.4.15`, or set `OPENCLAW_UPGRADE_SURVIVOR_SCENARIOS` for the scenario matrix. The published lane configures the baseline with a baked `openclaw config set` command recipe, records recipe steps in `summary.json`, and probes `/healthz`, `/readyz`, plus RPC status after Gateway start. The Windows packaged and installer fresh lanes also verify that an installed package can import a browser-control override from a raw absolute Windows path. The OpenAI cross-OS agent-turn smoke defaults to `OPENCLAW_CROSS_OS_OPENAI_MODEL` when set, otherwise `openai/gpt-5.4`, so the install and gateway proof stays on a GPT-5 test model while avoiding GPT-4.x defaults.
|
||||
Release checks call Package Acceptance with `source=artifact`, the prepared release package artifact, `suite_profile=custom`, `docker_lanes='doctor-switch update-channel-switch skill-install update-corrupt-plugin upgrade-survivor published-upgrade-survivor update-restart-auth plugins-offline plugin-update'`, and `telegram_mode=mock-openai`. This keeps package migration, update, live ClawHub skill install, stale-plugin-dependency cleanup, configured-plugin install repair, offline plugin, plugin-update, and Telegram proof on the same resolved package tarball. Set `release_package_spec` on Full Release Validation or OpenClaw Release Checks after publishing a beta to run the same matrix against the shipped npm package without rebuilding; set `package_acceptance_package_spec` only when Package Acceptance needs a different package from the rest of release validation. Cross-OS release checks still cover OS-specific onboarding, installer, and platform behavior; package/update product validation should start with Package Acceptance. The `published-upgrade-survivor` Docker lane validates one published package baseline per run in the blocking release path. In Package Acceptance, the resolved `package-under-test` tarball is always the candidate and `published_upgrade_survivor_baseline` selects the fallback published baseline, defaulting to `openclaw@latest`; failed-lane rerun commands preserve that baseline. Full Release Validation with `run_release_soak=true` or `release_profile=full` sets `published_upgrade_survivor_baselines='last-stable-4 2026.4.23 2026.5.2 2026.4.15'` and `published_upgrade_survivor_scenarios=reported-issues` to expand across the four latest stable npm releases plus pinned plugin-compatibility boundary releases and issue-shaped fixtures for Feishu config, preserved bootstrap/persona files, configured OpenClaw plugin installs, tilde log paths, and stale legacy plugin dependency roots. 
Multi-baseline published-upgrade survivor selections are sharded by baseline into separate targeted Docker runner jobs. The separate `Update Migration` workflow uses the `update-migration` Docker lane with `all-since-2026.4.23` and `plugin-deps-cleanup` when the question is exhaustive published update cleanup, not normal Full Release CI breadth. Local aggregate runs can pass exact package specs with `OPENCLAW_UPGRADE_SURVIVOR_BASELINE_SPECS`, keep a single lane with `OPENCLAW_UPGRADE_SURVIVOR_BASELINE_SPEC` such as `openclaw@2026.4.15`, or set `OPENCLAW_UPGRADE_SURVIVOR_SCENARIOS` for the scenario matrix. The published lane configures the baseline with a baked `openclaw config set` command recipe, records recipe steps in `summary.json`, and probes `/healthz`, `/readyz`, plus RPC status after Gateway start. The Windows packaged and installer fresh lanes also verify that an installed package can import a browser-control override from a raw absolute Windows path. The OpenAI cross-OS agent-turn smoke defaults to `OPENCLAW_CROSS_OS_OPENAI_MODEL` when set, otherwise `openai/gpt-5.5`, so the install and gateway proof stays on a GPT-5 test model while avoiding GPT-4.x defaults.
|
||||
|
||||
### Legacy compatibility windows
|
||||
|
||||
|
||||
@@ -175,7 +175,7 @@ For an agent/CV style desktop task, run:
|
||||
pnpm openclaw qa mantis visual-task \
|
||||
--browser-url https://example.net \
|
||||
--expect-text "Example Domain" \
|
||||
--vision-model openai/gpt-5.4
|
||||
--vision-model openai/gpt-5.5
|
||||
```
|
||||
|
||||
`visual-task` leases or reuses a Crabbox desktop/browser machine, starts
|
||||
@@ -370,8 +370,8 @@ Run the Mantis status-reaction scenario explicitly:
|
||||
pnpm openclaw qa discord \
|
||||
--scenario discord-status-reactions-tool-only \
|
||||
--provider-mode live-frontier \
|
||||
--model openai/gpt-5.4 \
|
||||
--alt-model openai/gpt-5.4 \
|
||||
--model openai/gpt-5.5 \
|
||||
--alt-model openai/gpt-5.5 \
|
||||
--fast
|
||||
```
|
||||
|
||||
@@ -780,13 +780,13 @@ pnpm openclaw qa character-eval \
|
||||
--model openai/gpt-5.5,thinking=medium,fast \
|
||||
--model openai/gpt-5.2,thinking=xhigh \
|
||||
--model openai/gpt-5,thinking=xhigh \
|
||||
--model anthropic/claude-opus-4-6,thinking=high \
|
||||
--model anthropic/claude-opus-4-7,thinking=high \
|
||||
--model anthropic/claude-sonnet-4-6,thinking=high \
|
||||
--model zai/glm-5.1,thinking=high \
|
||||
--model moonshot/kimi-k2.5,thinking=high \
|
||||
--model google/gemini-3.1-pro-preview,thinking=high \
|
||||
--judge-model openai/gpt-5.5,thinking=xhigh,fast \
|
||||
--judge-model anthropic/claude-opus-4-6,thinking=high \
|
||||
--judge-model anthropic/claude-opus-4-7,thinking=high \
|
||||
--blind-judge-models \
|
||||
--concurrency 16 \
|
||||
--judge-concurrency 16
|
||||
@@ -817,13 +817,13 @@ Candidate and judge model runs both default to concurrency 16. Lower
|
||||
`--concurrency` or `--judge-concurrency` when provider limits or local gateway
|
||||
pressure make a run too noisy.
|
||||
When no candidate `--model` is passed, the character eval defaults to
|
||||
`openai/gpt-5.5`, `openai/gpt-5.2`, `openai/gpt-5`, `anthropic/claude-opus-4-6`,
|
||||
`openai/gpt-5.5`, `openai/gpt-5.2`, `openai/gpt-5`, `anthropic/claude-opus-4-7`,
|
||||
`anthropic/claude-sonnet-4-6`, `zai/glm-5.1`,
|
||||
`moonshot/kimi-k2.5`, and
|
||||
`google/gemini-3.1-pro-preview`.
|
||||
When no `--judge-model` is passed, the judges default to
|
||||
`openai/gpt-5.5,thinking=xhigh,fast` and
|
||||
`anthropic/claude-opus-4-6,thinking=high`.
|
||||
`anthropic/claude-opus-4-7,thinking=high`.
|
||||
|
||||
## Related docs
|
||||
|
||||
|
||||
@@ -59,7 +59,7 @@ Does not own:
|
||||
|
||||
Owns:
|
||||
|
||||
- first-wave GPT-5.5 vs Opus 4.6 scenario pack
|
||||
- first-wave GPT-5.5 vs Opus 4.7 scenario pack
|
||||
- parity documentation
|
||||
- parity report and release-gate mechanics
|
||||
|
||||
@@ -123,7 +123,7 @@ Expected artifacts from PR D:
|
||||
|
||||
## Release gate
|
||||
|
||||
Do not claim GPT-5.5 parity or superiority over Opus 4.6 until:
|
||||
Do not claim GPT-5.5 parity or superiority over Opus 4.7 until:
|
||||
|
||||
- PR A, PR B, and PR C are merged
|
||||
- PR D runs the first-wave parity pack cleanly
|
||||
@@ -133,7 +133,7 @@ Do not claim GPT-5.5 parity or superiority over Opus 4.6 until:
|
||||
```mermaid
|
||||
flowchart LR
|
||||
A["PR A-C merged"] --> B["Run GPT-5.5 parity pack"]
|
||||
A --> C["Run Opus 4.6 parity pack"]
|
||||
A --> C["Run Opus 4.7 parity pack"]
|
||||
B --> D["qa-suite-summary.json"]
|
||||
C --> E["qa-suite-summary.json"]
|
||||
D --> F["qa parity-report"]
|
||||
@@ -146,7 +146,7 @@ flowchart LR
|
||||
|
||||
The parity harness is not the only evidence source. Keep this split explicit in review:
|
||||
|
||||
- PR D owns the scenario-based GPT-5.5 vs Opus 4.6 comparison
|
||||
- PR D owns the scenario-based GPT-5.5 vs Opus 4.7 comparison
|
||||
- PR B deterministic suites still own auth/proxy/DNS and full-access truthfulness evidence
|
||||
|
||||
## Quick maintainer merge workflow
|
||||
@@ -179,7 +179,7 @@ If any one of the evidence bar items is missing, request changes instead of merg
|
||||
| No fake progress or fake tool completion | PR A + PR D | parity fake-success count plus scenario-level report details |
|
||||
| No false `/elevated full` guidance | PR B | deterministic runtime-truthfulness suites |
|
||||
| Replay/liveness failures remain explicit | PR C + PR D | lifecycle/replay suites plus `compaction-retry-mutating-tool` |
|
||||
| GPT-5.5 matches or beats Opus 4.6 | PR D | `qa-agentic-parity-report.md` and `qa-agentic-parity-summary.json` |
|
||||
| GPT-5.5 matches or beats Opus 4.7 | PR D | `qa-agentic-parity-report.md` and `qa-agentic-parity-summary.json` |
|
||||
|
||||
## Reviewer shorthand: before vs after
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ OpenClaw already worked well with tool-using frontier models, but GPT-5.5 and Co
|
||||
- they could use strict OpenAI/Codex tool schemas incorrectly
|
||||
- they could ask for `/elevated full` even when full access was impossible
|
||||
- they could lose long-running task state during replay or compaction
|
||||
- parity claims against Claude Opus 4.6 were based on anecdotes instead of repeatable scenarios
|
||||
- parity claims against Claude Opus 4.7 were based on anecdotes instead of repeatable scenarios
|
||||
|
||||
This parity program fixes those gaps in four reviewable slices.
|
||||
|
||||
@@ -51,7 +51,7 @@ The tool-compat work reduces schema friction for strict OpenAI/Codex tool regist
|
||||
|
||||
### PR D: parity harness
|
||||
|
||||
This slice adds the first-wave QA-lab parity pack so GPT-5.5 and Opus 4.6 can be exercised through the same scenarios and compared using shared evidence.
|
||||
This slice adds the first-wave QA-lab parity pack so GPT-5.5 and Opus 4.7 can be exercised through the same scenarios and compared using shared evidence.
|
||||
|
||||
The parity pack is the proof layer. It does not change runtime behavior by itself.
|
||||
|
||||
@@ -60,8 +60,8 @@ After you have two `qa-suite-summary.json` artifacts, generate the release-gate
|
||||
```bash
|
||||
pnpm openclaw qa parity-report \
|
||||
--repo-root . \
|
||||
--candidate-summary .artifacts/qa-e2e/gpt55/qa-suite-summary.json \
|
||||
--baseline-summary .artifacts/qa-e2e/opus46/qa-suite-summary.json \
|
||||
--candidate-summary .artifacts/qa-e2e/openai-candidate/qa-suite-summary.json \
|
||||
--baseline-summary .artifacts/qa-e2e/anthropic-baseline/qa-suite-summary.json \
|
||||
--output-dir .artifacts/qa-e2e/parity
|
||||
```
|
||||
|
||||
@@ -122,7 +122,7 @@ flowchart TD
|
||||
```mermaid
|
||||
flowchart LR
|
||||
A["Merged runtime slices (PR A-C)"] --> B["Run GPT-5.5 parity pack"]
|
||||
A --> C["Run Opus 4.6 parity pack"]
|
||||
A --> C["Run Opus 4.7 parity pack"]
|
||||
B --> D["qa-suite-summary.json"]
|
||||
C --> E["qa-suite-summary.json"]
|
||||
D --> F["openclaw qa parity-report"]
|
||||
@@ -178,7 +178,7 @@ Required outcomes:
|
||||
- no fake completion without real execution
|
||||
- no incorrect `/elevated full` guidance
|
||||
- no silent replay or compaction abandonment
|
||||
- parity-pack metrics that are at least as strong as the agreed Opus 4.6 baseline
|
||||
- parity-pack metrics that are at least as strong as the agreed Opus 4.7 baseline
|
||||
|
||||
For the first-wave harness, the gate compares:
|
||||
|
||||
@@ -189,7 +189,7 @@ For the first-wave harness, the gate compares:
|
||||
|
||||
Parity evidence is intentionally split across two layers:
|
||||
|
||||
- PR D proves same-scenario GPT-5.5 vs Opus 4.6 behavior with QA-lab
|
||||
- PR D proves same-scenario GPT-5.5 vs Opus 4.7 behavior with QA-lab
|
||||
- PR B deterministic suites prove auth, proxy, DNS, and `/elevated full` truthfulness outside the harness
|
||||
|
||||
## Goal-to-evidence matrix
|
||||
@@ -200,13 +200,13 @@ Parity evidence is intentionally split across two layers:
|
||||
| GPT-5.5 no longer fakes progress or fake tool completion | PR A + PR D | parity report scenario outcomes and fake-success count | no suspicious pass results and no commentary-only completion |
|
||||
| GPT-5.5 no longer gives false `/elevated full` guidance | PR B | deterministic truthfulness suites | blocked reasons and full-access hints stay runtime-accurate |
|
||||
| Replay/liveness failures stay explicit | PR C + PR D | PR C lifecycle/replay suites plus `compaction-retry-mutating-tool` | mutating work keeps replay-unsafety explicit instead of silently disappearing |
|
||||
| GPT-5.5 matches or beats Opus 4.6 on the agreed metrics | PR D | `qa-agentic-parity-report.md` and `qa-agentic-parity-summary.json` | same scenario coverage and no regression on completion, stop behavior, or valid tool use |
|
||||
| GPT-5.5 matches or beats Opus 4.7 on the agreed metrics | PR D | `qa-agentic-parity-report.md` and `qa-agentic-parity-summary.json` | same scenario coverage and no regression on completion, stop behavior, or valid tool use |
|
||||
|
||||
## How to read the parity verdict
|
||||
|
||||
Use the verdict in `qa-agentic-parity-summary.json` as the final machine-readable decision for the first-wave parity pack.
|
||||
|
||||
- `pass` means GPT-5.5 covered the same scenarios as Opus 4.6 and did not regress on the agreed aggregate metrics.
|
||||
- `pass` means GPT-5.5 covered the same scenarios as Opus 4.7 and did not regress on the agreed aggregate metrics.
|
||||
- `fail` means at least one hard gate tripped: weaker completion, worse unintended stops, weaker valid tool use, any fake-success case, or mismatched scenario coverage.
|
||||
- "shared/base CI issue" is not itself a parity result. If CI noise outside PR D blocks a run, the verdict should wait for a clean merged-runtime execution instead of being inferred from branch-era logs.
|
||||
- Auth, proxy, DNS, and `/elevated full` truthfulness still come from PR B's deterministic suites, so the final release claim needs both: a passing PR D parity verdict and green PR B truthfulness coverage.
|
||||
|
||||
@@ -47,9 +47,9 @@ When debugging real providers/models (requires real creds):
|
||||
- Live suite (models + gateway tool/image probes): `pnpm test:live`
|
||||
- Target one live file quietly: `pnpm test:live -- src/agents/models.profiles.live.test.ts`
|
||||
- Runtime performance reports: dispatch `OpenClaw Performance` with
|
||||
`live_gpt54=true` for a real `openai/gpt-5.4` agent turn or
|
||||
`live_openai_candidate=true` for a real `openai/gpt-5.5` agent turn or
|
||||
`deep_profile=true` for Kova CPU/heap/trace artifacts. Daily scheduled runs
|
||||
publish mock-provider, deep-profile, and GPT 5.4 lane artifacts to
|
||||
publish mock-provider, deep-profile, and GPT 5.5 lane artifacts to
|
||||
`openclaw/clawgrit-reports` when `CLAWGRIT_REPORTS_TOKEN` is configured. The
|
||||
mock-provider report also includes source-level gateway boot, memory,
|
||||
plugin-pressure, repeated fake-model hello-loop, and CLI startup numbers.
|
||||
|
||||
@@ -153,7 +153,7 @@ describe("qa agentic parity report", () => {
|
||||
it("fails the parity gate when the candidate regresses against baseline", () => {
|
||||
const comparison = buildQaAgenticParityComparison({
|
||||
candidateLabel: "openai/gpt-5.5",
|
||||
baselineLabel: "anthropic/claude-opus-4-6",
|
||||
baselineLabel: "anthropic/claude-opus-4-7",
|
||||
candidateSummary: {
|
||||
scenarios: [
|
||||
{ name: "Approval turn tool followthrough", status: "pass" },
|
||||
@@ -181,10 +181,10 @@ describe("qa agentic parity report", () => {
|
||||
|
||||
expect(comparison.pass).toBe(false);
|
||||
expect(comparison.failures).toContain(
|
||||
"openai/gpt-5.5 completion rate 80.0% is below anthropic/claude-opus-4-6 100.0%.",
|
||||
"openai/gpt-5.5 completion rate 80.0% is below anthropic/claude-opus-4-7 100.0%.",
|
||||
);
|
||||
expect(comparison.failures).toContain(
|
||||
"openai/gpt-5.5 unintended-stop rate 20.0% exceeds anthropic/claude-opus-4-6 0.0%.",
|
||||
"openai/gpt-5.5 unintended-stop rate 20.0% exceeds anthropic/claude-opus-4-7 0.0%.",
|
||||
);
|
||||
});
|
||||
|
||||
@@ -199,7 +199,7 @@ describe("qa agentic parity report", () => {
|
||||
];
|
||||
const comparison = buildQaAgenticParityComparison({
|
||||
candidateLabel: "openai/gpt-5.5",
|
||||
baselineLabel: "anthropic/claude-opus-4-6",
|
||||
baselineLabel: "anthropic/claude-opus-4-7",
|
||||
candidateSummary: {
|
||||
scenarios: baselineScenarios.filter(
|
||||
(scenario) => scenario.name !== "Extra non-parity lane",
|
||||
@@ -211,14 +211,14 @@ describe("qa agentic parity report", () => {
|
||||
|
||||
expect(comparison.pass).toBe(false);
|
||||
expect(comparison.failures).toContain(
|
||||
"Scenario coverage mismatch for Extra non-parity lane: openai/gpt-5.5=missing, anthropic/claude-opus-4-6=pass.",
|
||||
"Scenario coverage mismatch for Extra non-parity lane: openai/gpt-5.5=missing, anthropic/claude-opus-4-7=pass.",
|
||||
);
|
||||
});
|
||||
|
||||
it("reports each missing required parity scenario exactly once (no double-counting)", () => {
|
||||
const comparison = buildQaAgenticParityComparison({
|
||||
candidateLabel: "openai/gpt-5.5",
|
||||
baselineLabel: "anthropic/claude-opus-4-6",
|
||||
baselineLabel: "anthropic/claude-opus-4-7",
|
||||
candidateSummary: {
|
||||
scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }],
|
||||
},
|
||||
@@ -260,7 +260,7 @@ describe("qa agentic parity report", () => {
|
||||
|
||||
const comparison = buildQaAgenticParityComparison({
|
||||
candidateLabel: "openai/gpt-5.5",
|
||||
baselineLabel: "anthropic/claude-opus-4-6",
|
||||
baselineLabel: "anthropic/claude-opus-4-7",
|
||||
candidateSummary: summaryWithExtras,
|
||||
baselineSummary: scopedSummary,
|
||||
comparedAt: "2026-04-11T00:00:00.000Z",
|
||||
@@ -282,7 +282,7 @@ describe("qa agentic parity report", () => {
|
||||
it("fails the parity gate when required parity scenarios are missing on both sides", () => {
|
||||
const comparison = buildQaAgenticParityComparison({
|
||||
candidateLabel: "openai/gpt-5.5",
|
||||
baselineLabel: "anthropic/claude-opus-4-6",
|
||||
baselineLabel: "anthropic/claude-opus-4-7",
|
||||
candidateSummary: {
|
||||
scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }],
|
||||
},
|
||||
@@ -294,14 +294,14 @@ describe("qa agentic parity report", () => {
|
||||
|
||||
expect(comparison.pass).toBe(false);
|
||||
expect(comparison.failures).toContain(
|
||||
"Missing required parity scenario coverage for Image understanding from attachment: openai/gpt-5.5=missing, anthropic/claude-opus-4-6=missing.",
|
||||
"Missing required parity scenario coverage for Image understanding from attachment: openai/gpt-5.5=missing, anthropic/claude-opus-4-7=missing.",
|
||||
);
|
||||
});
|
||||
|
||||
it("fails the parity gate when required parity scenarios are skipped", () => {
|
||||
const comparison = buildQaAgenticParityComparison({
|
||||
candidateLabel: "openai/gpt-5.5",
|
||||
baselineLabel: "anthropic/claude-opus-4-6",
|
||||
baselineLabel: "anthropic/claude-opus-4-7",
|
||||
candidateSummary: {
|
||||
scenarios: [
|
||||
{ name: "Approval turn tool followthrough", status: "pass" },
|
||||
@@ -325,7 +325,7 @@ describe("qa agentic parity report", () => {
|
||||
|
||||
expect(comparison.pass).toBe(false);
|
||||
expect(comparison.failures).toContain(
|
||||
"Missing required parity scenario coverage for Compaction retry after mutating tool: openai/gpt-5.5=skip, anthropic/claude-opus-4-6=skip.",
|
||||
"Missing required parity scenario coverage for Compaction retry after mutating tool: openai/gpt-5.5=skip, anthropic/claude-opus-4-7=skip.",
|
||||
);
|
||||
});
|
||||
|
||||
@@ -342,7 +342,7 @@ describe("qa agentic parity report", () => {
|
||||
});
|
||||
const comparison = buildQaAgenticParityComparison({
|
||||
candidateLabel: "openai/gpt-5.5",
|
||||
baselineLabel: "anthropic/claude-opus-4-6",
|
||||
baselineLabel: "anthropic/claude-opus-4-7",
|
||||
candidateSummary: { scenarios: scenariosWithBothFail },
|
||||
baselineSummary: { scenarios: scenariosWithBothFail },
|
||||
comparedAt: "2026-04-11T00:00:00.000Z",
|
||||
@@ -350,7 +350,7 @@ describe("qa agentic parity report", () => {
|
||||
|
||||
expect(comparison.pass).toBe(false);
|
||||
expect(comparison.failures).toContain(
|
||||
"Required parity scenario Approval turn tool followthrough failed: openai/gpt-5.5=fail, anthropic/claude-opus-4-6=fail.",
|
||||
"Required parity scenario Approval turn tool followthrough failed: openai/gpt-5.5=fail, anthropic/claude-opus-4-7=fail.",
|
||||
);
|
||||
// Metric comparisons are relative, so a same-on-both-sides failure
|
||||
// must not appear as a relative metric failure. The required-scenario
|
||||
@@ -370,7 +370,7 @@ describe("qa agentic parity report", () => {
|
||||
});
|
||||
const comparison = buildQaAgenticParityComparison({
|
||||
candidateLabel: "openai/gpt-5.5",
|
||||
baselineLabel: "anthropic/claude-opus-4-6",
|
||||
baselineLabel: "anthropic/claude-opus-4-7",
|
||||
candidateSummary: { scenarios: candidateWithOneFail },
|
||||
baselineSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS },
|
||||
comparedAt: "2026-04-11T00:00:00.000Z",
|
||||
@@ -378,7 +378,7 @@ describe("qa agentic parity report", () => {
|
||||
|
||||
expect(comparison.pass).toBe(false);
|
||||
expect(comparison.failures).toContain(
|
||||
"Required parity scenario Approval turn tool followthrough failed: openai/gpt-5.5=fail, anthropic/claude-opus-4-6=pass.",
|
||||
"Required parity scenario Approval turn tool followthrough failed: openai/gpt-5.5=fail, anthropic/claude-opus-4-7=pass.",
|
||||
);
|
||||
});
|
||||
|
||||
@@ -387,7 +387,7 @@ describe("qa agentic parity report", () => {
|
||||
// below is the isolated gate failure under test (no coverage-gap noise).
|
||||
const comparison = buildQaAgenticParityComparison({
|
||||
candidateLabel: "openai/gpt-5.5",
|
||||
baselineLabel: "anthropic/claude-opus-4-6",
|
||||
baselineLabel: "anthropic/claude-opus-4-7",
|
||||
candidateSummary: {
|
||||
scenarios: FULL_PARITY_PASS_SCENARIOS,
|
||||
},
|
||||
@@ -401,7 +401,7 @@ describe("qa agentic parity report", () => {
|
||||
|
||||
expect(comparison.pass).toBe(false);
|
||||
expect(comparison.failures).toEqual([
|
||||
"anthropic/claude-opus-4-6 produced 1 suspicious pass result(s); baseline fake-success count must also be 0.",
|
||||
"anthropic/claude-opus-4-7 produced 1 suspicious pass result(s); baseline fake-success count must also be 0.",
|
||||
]);
|
||||
});
|
||||
|
||||
@@ -571,14 +571,14 @@ status=done`,
|
||||
expect(() =>
|
||||
buildQaAgenticParityComparison({
|
||||
candidateLabel: "openai/gpt-5.5",
|
||||
baselineLabel: "anthropic/claude-opus-4-6",
|
||||
baselineLabel: "anthropic/claude-opus-4-7",
|
||||
candidateSummary: {
|
||||
scenarios: parityPassScenarios,
|
||||
run: { primaryProvider: "anthropic", primaryModel: "claude-opus-4-6" },
|
||||
run: { primaryProvider: "anthropic", primaryModel: "claude-opus-4-7" },
|
||||
},
|
||||
baselineSummary: {
|
||||
scenarios: parityPassScenarios,
|
||||
run: { primaryProvider: "anthropic", primaryModel: "claude-opus-4-6" },
|
||||
run: { primaryProvider: "anthropic", primaryModel: "claude-opus-4-7" },
|
||||
},
|
||||
comparedAt: "2026-04-11T00:00:00.000Z",
|
||||
}),
|
||||
@@ -593,7 +593,7 @@ status=done`,
|
||||
expect(() =>
|
||||
buildQaAgenticParityComparison({
|
||||
candidateLabel: "openai/gpt-5.5",
|
||||
baselineLabel: "anthropic/claude-opus-4-6",
|
||||
baselineLabel: "anthropic/claude-opus-4-7",
|
||||
candidateSummary: {
|
||||
scenarios: parityPassScenarios,
|
||||
run: { primaryProvider: "openai" },
|
||||
@@ -612,7 +612,7 @@ status=done`,
|
||||
it("accepts matching run.primaryProvider labels without throwing", () => {
|
||||
const comparison = buildQaAgenticParityComparison({
|
||||
candidateLabel: "openai/gpt-5.5",
|
||||
baselineLabel: "anthropic/claude-opus-4-6",
|
||||
baselineLabel: "anthropic/claude-opus-4-7",
|
||||
candidateSummary: {
|
||||
scenarios: FULL_PARITY_PASS_SCENARIOS,
|
||||
run: {
|
||||
@@ -625,8 +625,8 @@ status=done`,
|
||||
scenarios: FULL_PARITY_PASS_SCENARIOS,
|
||||
run: {
|
||||
primaryProvider: "anthropic",
|
||||
primaryModel: "anthropic/claude-opus-4-6",
|
||||
primaryModelName: "claude-opus-4-6",
|
||||
primaryModel: "anthropic/claude-opus-4-7",
|
||||
primaryModelName: "claude-opus-4-7",
|
||||
},
|
||||
},
|
||||
comparedAt: "2026-04-11T00:00:00.000Z",
|
||||
@@ -639,7 +639,7 @@ status=done`,
|
||||
// work against those, trusting the caller-supplied label.
|
||||
const comparison = buildQaAgenticParityComparison({
|
||||
candidateLabel: "openai/gpt-5.5",
|
||||
baselineLabel: "anthropic/claude-opus-4-6",
|
||||
baselineLabel: "anthropic/claude-opus-4-7",
|
||||
candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS },
|
||||
baselineSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS },
|
||||
comparedAt: "2026-04-11T00:00:00.000Z",
|
||||
@@ -650,7 +650,7 @@ status=done`,
|
||||
it("skips provider verification for arbitrary display labels when run metadata is present", () => {
|
||||
const comparison = buildQaAgenticParityComparison({
|
||||
candidateLabel: "GPT-5.5 candidate",
|
||||
baselineLabel: "Opus 4.6 baseline",
|
||||
baselineLabel: "Opus 4.7 baseline",
|
||||
candidateSummary: {
|
||||
scenarios: FULL_PARITY_PASS_SCENARIOS,
|
||||
run: {
|
||||
@@ -663,8 +663,8 @@ status=done`,
|
||||
scenarios: FULL_PARITY_PASS_SCENARIOS,
|
||||
run: {
|
||||
primaryProvider: "anthropic",
|
||||
primaryModel: "anthropic/claude-opus-4-6",
|
||||
primaryModelName: "claude-opus-4-6",
|
||||
primaryModel: "anthropic/claude-opus-4-7",
|
||||
primaryModelName: "claude-opus-4-7",
|
||||
},
|
||||
},
|
||||
comparedAt: "2026-04-11T00:00:00.000Z",
|
||||
@@ -676,7 +676,7 @@ status=done`,
|
||||
it("skips provider verification for mixed-case or decorated display labels", () => {
|
||||
const comparison = buildQaAgenticParityComparison({
|
||||
candidateLabel: "Candidate: GPT-5.5",
|
||||
baselineLabel: "Opus 4.6 / baseline",
|
||||
baselineLabel: "Opus 4.7 / baseline",
|
||||
candidateSummary: {
|
||||
scenarios: FULL_PARITY_PASS_SCENARIOS,
|
||||
run: {
|
||||
@@ -689,8 +689,8 @@ status=done`,
|
||||
scenarios: FULL_PARITY_PASS_SCENARIOS,
|
||||
run: {
|
||||
primaryProvider: "anthropic",
|
||||
primaryModel: "anthropic/claude-opus-4-6",
|
||||
primaryModelName: "claude-opus-4-6",
|
||||
primaryModel: "anthropic/claude-opus-4-7",
|
||||
primaryModelName: "claude-opus-4-7",
|
||||
},
|
||||
},
|
||||
comparedAt: "2026-04-11T00:00:00.000Z",
|
||||
@@ -703,7 +703,7 @@ status=done`,
|
||||
expect(() =>
|
||||
buildQaAgenticParityComparison({
|
||||
candidateLabel: "openai/gpt-5.5",
|
||||
baselineLabel: "anthropic/claude-opus-4-6",
|
||||
baselineLabel: "anthropic/claude-opus-4-7",
|
||||
candidateSummary: {
|
||||
scenarios: FULL_PARITY_PASS_SCENARIOS,
|
||||
run: {
|
||||
@@ -716,8 +716,8 @@ status=done`,
|
||||
scenarios: FULL_PARITY_PASS_SCENARIOS,
|
||||
run: {
|
||||
primaryProvider: "anthropic",
|
||||
primaryModel: "anthropic/claude-opus-4-6",
|
||||
primaryModelName: "claude-opus-4-6",
|
||||
primaryModel: "anthropic/claude-opus-4-7",
|
||||
primaryModelName: "claude-opus-4-7",
|
||||
},
|
||||
},
|
||||
comparedAt: "2026-04-11T00:00:00.000Z",
|
||||
@@ -730,7 +730,7 @@ status=done`,
|
||||
it("accepts colon-delimited structured labels when provider and model both match", () => {
|
||||
const comparison = buildQaAgenticParityComparison({
|
||||
candidateLabel: "openai:gpt-5.5",
|
||||
baselineLabel: "anthropic:claude-opus-4-6",
|
||||
baselineLabel: "anthropic:claude-opus-4-7",
|
||||
candidateSummary: {
|
||||
scenarios: FULL_PARITY_PASS_SCENARIOS,
|
||||
run: {
|
||||
@@ -743,8 +743,8 @@ status=done`,
|
||||
scenarios: FULL_PARITY_PASS_SCENARIOS,
|
||||
run: {
|
||||
primaryProvider: "anthropic",
|
||||
primaryModel: "anthropic/claude-opus-4-6",
|
||||
primaryModelName: "claude-opus-4-6",
|
||||
primaryModel: "anthropic/claude-opus-4-7",
|
||||
primaryModelName: "claude-opus-4-7",
|
||||
},
|
||||
},
|
||||
comparedAt: "2026-04-11T00:00:00.000Z",
|
||||
@@ -759,7 +759,7 @@ status=done`,
|
||||
// added by the second-wave expansion.
|
||||
const comparison = buildQaAgenticParityComparison({
|
||||
candidateLabel: "openai/gpt-5.5",
|
||||
baselineLabel: "anthropic/claude-opus-4-6",
|
||||
baselineLabel: "anthropic/claude-opus-4-7",
|
||||
candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS },
|
||||
baselineSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS },
|
||||
comparedAt: "2026-04-11T00:00:00.000Z",
|
||||
@@ -768,7 +768,7 @@ status=done`,
|
||||
const report = renderQaAgenticParityMarkdownReport(comparison);
|
||||
|
||||
expect(report).toContain(
|
||||
"# OpenClaw Agentic Parity Report — openai/gpt-5.5 vs anthropic/claude-opus-4-6",
|
||||
"# OpenClaw Agentic Parity Report — openai/gpt-5.5 vs anthropic/claude-opus-4-7",
|
||||
);
|
||||
expect(report).toContain("| Completion rate | 100.0% | 100.0% |");
|
||||
expect(report).toContain("### Approval turn tool followthrough");
|
||||
@@ -779,7 +779,7 @@ status=done`,
|
||||
// Regression for the loop-7 Copilot finding: callers that configure
|
||||
// non-gpt-5.5 / non-opus labels (for example an internal candidate vs
|
||||
// another candidate) must see the labels in the rendered H1 instead of
|
||||
// the hardcoded "GPT-5.5 / Opus 4.6" title that would otherwise confuse
|
||||
// the hardcoded "GPT-5.5 / Opus 4.7" title that would otherwise confuse
|
||||
// readers of saved reports.
|
||||
const comparison = buildQaAgenticParityComparison({
|
||||
candidateLabel: "openai/gpt-5.5-alt",
|
||||
|
||||
@@ -564,7 +564,7 @@ export function renderQaAgenticParityMarkdownReport(comparison: QaAgenticParityC
|
||||
// Title is parametrized from the candidate / baseline labels so reports
|
||||
// for any candidate/baseline pair (not only gpt-5.5 vs opus 4.7) render
|
||||
// with an accurate header. The default CLI labels are still
|
||||
// openai/gpt-5.5 vs anthropic/claude-opus-4-6, but the helper works for
|
||||
// openai/gpt-5.5 vs anthropic/claude-opus-4-7, but the helper works for
|
||||
// any parity comparison a caller configures.
|
||||
const lines = [
|
||||
`# OpenClaw Agentic Parity Report — ${comparison.candidateLabel} vs ${comparison.baselineLabel}`,
|
||||
|
||||
@@ -274,7 +274,7 @@ describe("runQaCharacterEval", () => {
|
||||
{ model: "openai/gpt-5.5", rank: 1, score: 8, summary: "ok" },
|
||||
{ model: "openai/gpt-5.2", rank: 2, score: 7.5, summary: "ok" },
|
||||
{ model: "openai/gpt-5", rank: 3, score: 7.2, summary: "ok" },
|
||||
{ model: "anthropic/claude-opus-4-6", rank: 4, score: 7, summary: "ok" },
|
||||
{ model: "anthropic/claude-opus-4-7", rank: 4, score: 7, summary: "ok" },
|
||||
{ model: "anthropic/claude-sonnet-4-6", rank: 5, score: 6.8, summary: "ok" },
|
||||
{ model: "zai/glm-5.1", rank: 6, score: 6.3, summary: "ok" },
|
||||
{ model: "moonshot/kimi-k2.5", rank: 7, score: 6.2, summary: "ok" },
|
||||
@@ -294,7 +294,7 @@ describe("runQaCharacterEval", () => {
|
||||
"openai/gpt-5.5",
|
||||
"openai/gpt-5.2",
|
||||
"openai/gpt-5",
|
||||
"anthropic/claude-opus-4-6",
|
||||
"anthropic/claude-opus-4-7",
|
||||
"anthropic/claude-sonnet-4-6",
|
||||
"zai/glm-5.1",
|
||||
"moonshot/kimi-k2.5",
|
||||
@@ -323,7 +323,7 @@ describe("runQaCharacterEval", () => {
|
||||
expect(runJudge).toHaveBeenCalledTimes(2);
|
||||
expect(runJudge.mock.calls.map(([params]) => params.judgeModel)).toEqual([
|
||||
"openai/gpt-5.5",
|
||||
"anthropic/claude-opus-4-6",
|
||||
"anthropic/claude-opus-4-7",
|
||||
]);
|
||||
expect(runJudge.mock.calls.map(([params]) => params.judgeThinkingDefault)).toEqual([
|
||||
"xhigh",
|
||||
@@ -577,11 +577,11 @@ describe("runQaCharacterEval", () => {
|
||||
candidateModelOptions: {
|
||||
"openai/gpt-5.5": { thinkingDefault: "xhigh", fastMode: false },
|
||||
},
|
||||
judgeModels: ["openai/gpt-5.5", "anthropic/claude-opus-4-6"],
|
||||
judgeModels: ["openai/gpt-5.5", "anthropic/claude-opus-4-7"],
|
||||
judgeThinkingDefault: "medium",
|
||||
judgeModelOptions: {
|
||||
"openai/gpt-5.5": { thinkingDefault: "xhigh", fastMode: true },
|
||||
"anthropic/claude-opus-4-6": { thinkingDefault: "high" },
|
||||
"anthropic/claude-opus-4-7": { thinkingDefault: "high" },
|
||||
},
|
||||
runSuite,
|
||||
runJudge,
|
||||
|
||||
@@ -622,7 +622,7 @@ describe("qa cli runtime", () => {
|
||||
repoRoot: "/tmp/openclaw-repo",
|
||||
providerMode: "mock-openai",
|
||||
primaryModel: "openai/gpt-5.5",
|
||||
alternateModel: "anthropic/claude-opus-4-6",
|
||||
alternateModel: "anthropic/claude-opus-4-7",
|
||||
preflight: true,
|
||||
});
|
||||
|
||||
@@ -632,7 +632,7 @@ describe("qa cli runtime", () => {
|
||||
transportId: "qa-channel",
|
||||
providerMode: "mock-openai",
|
||||
primaryModel: "openai/gpt-5.5",
|
||||
alternateModel: "anthropic/claude-opus-4-6",
|
||||
alternateModel: "anthropic/claude-opus-4-7",
|
||||
scenarioIds: ["approval-turn-tool-followthrough"],
|
||||
concurrency: 1,
|
||||
});
|
||||
@@ -930,7 +930,7 @@ describe("qa cli runtime", () => {
|
||||
fast: true,
|
||||
thinking: "medium",
|
||||
modelThinking: ["codex-cli/test-model=medium"],
|
||||
judgeModel: ["openai/gpt-5.5,thinking=xhigh,fast", "anthropic/claude-opus-4-6,thinking=high"],
|
||||
judgeModel: ["openai/gpt-5.5,thinking=xhigh,fast", "anthropic/claude-opus-4-7,thinking=high"],
|
||||
judgeTimeoutMs: 180_000,
|
||||
blindJudgeModels: true,
|
||||
concurrency: 4,
|
||||
@@ -951,10 +951,10 @@ describe("qa cli runtime", () => {
|
||||
"openai/gpt-5.5": { thinkingDefault: "xhigh", fastMode: false },
|
||||
"codex-cli/test-model": { thinkingDefault: "high", fastMode: true },
|
||||
},
|
||||
judgeModels: ["openai/gpt-5.5", "anthropic/claude-opus-4-6"],
|
||||
judgeModels: ["openai/gpt-5.5", "anthropic/claude-opus-4-7"],
|
||||
judgeModelOptions: {
|
||||
"openai/gpt-5.5": { thinkingDefault: "xhigh", fastMode: true },
|
||||
"anthropic/claude-opus-4-6": { thinkingDefault: "high" },
|
||||
"anthropic/claude-opus-4-7": { thinkingDefault: "high" },
|
||||
},
|
||||
judgeTimeoutMs: 180_000,
|
||||
judgeBlindModels: true,
|
||||
@@ -1285,7 +1285,7 @@ describe("qa cli runtime", () => {
|
||||
providerMode: "mock-openai",
|
||||
parityPack: "agentic",
|
||||
primaryModel: "openai/gpt-5.5",
|
||||
alternateModel: "anthropic/claude-opus-4-6",
|
||||
alternateModel: "anthropic/claude-opus-4-7",
|
||||
});
|
||||
|
||||
expect(runQaSuiteFromRuntime).toHaveBeenCalledWith({
|
||||
@@ -1294,7 +1294,7 @@ describe("qa cli runtime", () => {
|
||||
transportId: "qa-channel",
|
||||
providerMode: "mock-openai",
|
||||
primaryModel: "openai/gpt-5.5",
|
||||
alternateModel: "anthropic/claude-opus-4-6",
|
||||
alternateModel: "anthropic/claude-opus-4-7",
|
||||
fastMode: undefined,
|
||||
scenarioIds: [
|
||||
"approval-turn-tool-followthrough",
|
||||
|
||||
@@ -346,9 +346,9 @@ describe("qa cli registration", () => {
|
||||
"--provider-mode",
|
||||
"live-frontier",
|
||||
"--model",
|
||||
"openai/gpt-5.4",
|
||||
"openai/gpt-5.5",
|
||||
"--alt-model",
|
||||
"openai/gpt-5.4",
|
||||
"openai/gpt-5.5",
|
||||
"--scenario",
|
||||
"slack-canary",
|
||||
"--credential-source",
|
||||
@@ -360,7 +360,7 @@ describe("qa cli registration", () => {
|
||||
]);
|
||||
|
||||
expect(runMantisSlackDesktopSmokeCommand).toHaveBeenCalledWith({
|
||||
alternateModel: "openai/gpt-5.4",
|
||||
alternateModel: "openai/gpt-5.5",
|
||||
crabboxBin: "/tmp/crabbox",
|
||||
credentialRole: "maintainer",
|
||||
credentialSource: "env",
|
||||
@@ -371,7 +371,7 @@ describe("qa cli registration", () => {
|
||||
leaseId: "cbx_123abc",
|
||||
machineClass: "beast",
|
||||
outputDir: ".artifacts/qa-e2e/mantis/slack-desktop",
|
||||
primaryModel: "openai/gpt-5.4",
|
||||
primaryModel: "openai/gpt-5.5",
|
||||
provider: "hetzner",
|
||||
providerMode: "live-frontier",
|
||||
repoRoot: "/tmp/openclaw-repo",
|
||||
|
||||
@@ -16,7 +16,7 @@ describe("gateway log sentinels", () => {
|
||||
"[plugins] plugin must declare contracts.tools for: runtime_tool",
|
||||
"2026-05-13T00:00:04Z codex app-server attempt timed out after 180000ms",
|
||||
"2026-05-13T00:00:05Z codex_app_server progress stalled for run abc123",
|
||||
"2026-05-13T00:00:06Z cron payload model openai/gpt-5.4 is not in model allowlist",
|
||||
"2026-05-13T00:00:06Z cron payload model openai/gpt-5.5 is not in model allowlist",
|
||||
"2026-05-13T00:00:07Z OpenAI quota exceeded for live-frontier request",
|
||||
].join("\n"),
|
||||
);
|
||||
|
||||
@@ -8,7 +8,7 @@ describe("qa live timeout policy", () => {
|
||||
{
|
||||
providerMode: "mock-openai",
|
||||
primaryModel: "anthropic/claude-sonnet-4-6",
|
||||
alternateModel: "anthropic/claude-opus-4-6",
|
||||
alternateModel: "anthropic/claude-opus-4-7",
|
||||
},
|
||||
30_000,
|
||||
),
|
||||
@@ -47,7 +47,7 @@ describe("qa live timeout policy", () => {
|
||||
{
|
||||
providerMode: "live-frontier",
|
||||
primaryModel: "anthropic/claude-sonnet-4-6",
|
||||
alternateModel: "anthropic/claude-opus-4-6",
|
||||
alternateModel: "anthropic/claude-opus-4-7",
|
||||
},
|
||||
30_000,
|
||||
),
|
||||
@@ -60,10 +60,10 @@ describe("qa live timeout policy", () => {
|
||||
{
|
||||
providerMode: "live-frontier",
|
||||
primaryModel: "anthropic/claude-sonnet-4-6",
|
||||
alternateModel: "anthropic/claude-opus-4-6",
|
||||
alternateModel: "anthropic/claude-opus-4-7",
|
||||
},
|
||||
30_000,
|
||||
"anthropic/claude-opus-4-6",
|
||||
"anthropic/claude-opus-4-7",
|
||||
),
|
||||
).toBe(240_000);
|
||||
});
|
||||
|
||||
@@ -9,7 +9,7 @@ export const QA_FRONTIER_CHARACTER_EVAL_MODELS = Object.freeze([
|
||||
"openai/gpt-5.5",
|
||||
"openai/gpt-5.2",
|
||||
"openai/gpt-5",
|
||||
"anthropic/claude-opus-4-6",
|
||||
"anthropic/claude-opus-4-7",
|
||||
"anthropic/claude-sonnet-4-6",
|
||||
"zai/glm-5.1",
|
||||
"moonshot/kimi-k2.5",
|
||||
@@ -25,12 +25,12 @@ export const QA_FRONTIER_CHARACTER_THINKING_BY_MODEL: Readonly<Record<string, Qa
|
||||
|
||||
export const QA_FRONTIER_CHARACTER_JUDGE_MODELS = Object.freeze([
|
||||
"openai/gpt-5.5",
|
||||
"anthropic/claude-opus-4-6",
|
||||
"anthropic/claude-opus-4-7",
|
||||
]);
|
||||
|
||||
export const QA_FRONTIER_CHARACTER_JUDGE_MODEL_OPTIONS: Readonly<
|
||||
Record<string, QaFrontierCharacterModelOptions>
|
||||
> = Object.freeze({
|
||||
"openai/gpt-5.5": { thinkingDefault: "xhigh", fastMode: true },
|
||||
"anthropic/claude-opus-4-6": { thinkingDefault: "high" },
|
||||
"anthropic/claude-opus-4-7": { thinkingDefault: "high" },
|
||||
});
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
export const QA_FRONTIER_PARITY_CANDIDATE_LABEL = "openai/gpt-5.5";
|
||||
export const QA_FRONTIER_PARITY_BASELINE_LABEL = "anthropic/claude-opus-4-6";
|
||||
export const QA_FRONTIER_PARITY_BASELINE_LABEL = "anthropic/claude-opus-4-7";
|
||||
|
||||
@@ -2727,7 +2727,7 @@ describe("qa mock openai server", () => {
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
stream: false,
|
||||
model: "mock-openai/gpt-5.4",
|
||||
model: "mock-openai/gpt-5.5",
|
||||
input: [
|
||||
{
|
||||
role: "user",
|
||||
@@ -2783,7 +2783,7 @@ describe("qa mock openai server", () => {
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
stream: false,
|
||||
model: "mock-openai/gpt-5.4",
|
||||
model: "mock-openai/gpt-5.5",
|
||||
input: [
|
||||
{
|
||||
role: "user",
|
||||
@@ -2956,7 +2956,7 @@ describe("qa mock openai server", () => {
|
||||
expect(outputText(await response.json())).toBe("NO_REPLY");
|
||||
});
|
||||
|
||||
it("advertises Anthropic claude-opus-4-6 baseline model on /v1/models", async () => {
|
||||
it("advertises Anthropic claude-opus-4-7 baseline model on /v1/models", async () => {
|
||||
const server = await startQaMockOpenAiServer({
|
||||
host: "127.0.0.1",
|
||||
port: 0,
|
||||
@@ -2969,7 +2969,7 @@ describe("qa mock openai server", () => {
|
||||
expect(response.status).toBe(200);
|
||||
const body = (await response.json()) as { data: Array<{ id: string }> };
|
||||
const ids = body.data.map((entry) => entry.id);
|
||||
expect(ids).toContain("claude-opus-4-6");
|
||||
expect(ids).toContain("claude-opus-4-7");
|
||||
expect(ids).toContain("gpt-5.5");
|
||||
});
|
||||
|
||||
@@ -2986,7 +2986,7 @@ describe("qa mock openai server", () => {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
model: "claude-opus-4-6",
|
||||
model: "claude-opus-4-7",
|
||||
max_tokens: 256,
|
||||
messages: [
|
||||
{
|
||||
@@ -3011,7 +3011,7 @@ describe("qa mock openai server", () => {
|
||||
};
|
||||
expect(body.type).toBe("message");
|
||||
expect(body.role).toBe("assistant");
|
||||
expect(body.model).toBe("claude-opus-4-6");
|
||||
expect(body.model).toBe("claude-opus-4-7");
|
||||
expect(body.stop_reason).toBe("tool_use");
|
||||
const toolUseBlock = body.content.find((block) => block.type === "tool_use") as
|
||||
| { name: string; input: Record<string, unknown> }
|
||||
@@ -3022,7 +3022,7 @@ describe("qa mock openai server", () => {
|
||||
const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`);
|
||||
expect(debugResponse.status).toBe(200);
|
||||
const debugPayload = requireRecord(await debugResponse.json(), "debug request");
|
||||
expect(debugPayload.model).toBe("claude-opus-4-6");
|
||||
expect(debugPayload.model).toBe("claude-opus-4-7");
|
||||
expect(debugPayload.plannedToolName).toBe("read");
|
||||
});
|
||||
|
||||
@@ -3033,7 +3033,7 @@ describe("qa mock openai server", () => {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
model: "claude-opus-4-6",
|
||||
model: "claude-opus-4-7",
|
||||
max_tokens: 256,
|
||||
tools: [
|
||||
{
|
||||
@@ -3073,7 +3073,7 @@ describe("qa mock openai server", () => {
|
||||
const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`);
|
||||
expect(debugResponse.status).toBe(200);
|
||||
const debugPayload = requireRecord(await debugResponse.json(), "debug request");
|
||||
expect(debugPayload.model).toBe("claude-opus-4-6");
|
||||
expect(debugPayload.model).toBe("claude-opus-4-7");
|
||||
expect(debugPayload.plannedToolName).toBe("sessions_spawn");
|
||||
});
|
||||
|
||||
@@ -3097,7 +3097,7 @@ describe("qa mock openai server", () => {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
model: "claude-opus-4-6",
|
||||
model: "claude-opus-4-7",
|
||||
max_tokens: 256,
|
||||
messages: [
|
||||
{
|
||||
@@ -3171,7 +3171,7 @@ describe("qa mock openai server", () => {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
model: "claude-opus-4-6",
|
||||
model: "claude-opus-4-7",
|
||||
max_tokens: 256,
|
||||
messages: [
|
||||
{
|
||||
@@ -3252,7 +3252,7 @@ describe("qa mock openai server", () => {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
model: "claude-opus-4-6",
|
||||
model: "claude-opus-4-7",
|
||||
max_tokens: 256,
|
||||
stream: true,
|
||||
messages: [
|
||||
@@ -3293,7 +3293,7 @@ describe("qa mock openai server", () => {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
model: "claude-opus-4-6",
|
||||
model: "claude-opus-4-7",
|
||||
max_tokens: 256,
|
||||
stream: true,
|
||||
messages: [
|
||||
@@ -3352,7 +3352,7 @@ describe("qa mock openai server", () => {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
model: "claude-opus-4-6",
|
||||
model: "claude-opus-4-7",
|
||||
max_tokens: 256,
|
||||
stream: true,
|
||||
system: [
|
||||
@@ -3395,7 +3395,7 @@ describe("qa mock openai server", () => {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
model: "claude-opus-4-6",
|
||||
model: "claude-opus-4-7",
|
||||
max_tokens: 256,
|
||||
stream: true,
|
||||
system: [
|
||||
@@ -3440,7 +3440,7 @@ describe("qa mock openai server", () => {
|
||||
const response = await fetch(`${server.baseUrl}/v1/messages`, {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: '{"model":"claude-opus-4-6","messages":[',
|
||||
body: '{"model":"claude-opus-4-7","messages":[',
|
||||
});
|
||||
|
||||
expect(response.status).toBe(400);
|
||||
@@ -3453,12 +3453,12 @@ describe("qa mock openai server", () => {
|
||||
expect(body.error.message).toContain("Malformed JSON body");
|
||||
});
|
||||
|
||||
it("defaults empty-string Anthropic /v1/messages model to claude-opus-4-6", async () => {
|
||||
it("defaults empty-string Anthropic /v1/messages model to claude-opus-4-7", async () => {
|
||||
// Regression for the loop-7 Copilot finding: a bare `typeof
|
||||
// body.model === "string"` check lets an empty-string model leak
|
||||
// through to `lastRequest.model` and `responseBody.model`. Empty
|
||||
// strings must be treated the same as absent and default to
|
||||
// `"claude-opus-4-6"` so parity consumers can trust the echoed label.
|
||||
// `"claude-opus-4-7"` so parity consumers can trust the echoed label.
|
||||
const server = await startQaMockOpenAiServer({
|
||||
host: "127.0.0.1",
|
||||
port: 0,
|
||||
@@ -3483,12 +3483,12 @@ describe("qa mock openai server", () => {
|
||||
});
|
||||
expect(response.status).toBe(200);
|
||||
const body = (await response.json()) as { model: string };
|
||||
expect(body.model).toBe("claude-opus-4-6");
|
||||
expect(body.model).toBe("claude-opus-4-7");
|
||||
|
||||
const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`);
|
||||
expect(debugResponse.status).toBe(200);
|
||||
const debug = (await debugResponse.json()) as { model: string };
|
||||
expect(debug.model).toBe("claude-opus-4-6");
|
||||
expect(debug.model).toBe("claude-opus-4-7");
|
||||
});
|
||||
|
||||
it("scripts a reasoning-only recovery sequence after a replay-safe read", async () => {
|
||||
@@ -3711,9 +3711,9 @@ describe("resolveProviderVariant", () => {
|
||||
});
|
||||
|
||||
it("tags prefix-qualified anthropic models", () => {
|
||||
expect(resolveProviderVariant("anthropic/claude-opus-4-6")).toBe("anthropic");
|
||||
expect(resolveProviderVariant("anthropic:claude-opus-4-6")).toBe("anthropic");
|
||||
expect(resolveProviderVariant("claude-cli/claude-opus-4-6")).toBe("anthropic");
|
||||
expect(resolveProviderVariant("anthropic/claude-opus-4-7")).toBe("anthropic");
|
||||
expect(resolveProviderVariant("anthropic:claude-opus-4-7")).toBe("anthropic");
|
||||
expect(resolveProviderVariant("claude-cli/claude-opus-4-7")).toBe("anthropic");
|
||||
});
|
||||
|
||||
it("tags bare model names by prefix", () => {
|
||||
@@ -3721,7 +3721,7 @@ describe("resolveProviderVariant", () => {
|
||||
expect(resolveProviderVariant("gpt-5.5-alt")).toBe("openai");
|
||||
expect(resolveProviderVariant("gpt-4.5")).toBe("openai");
|
||||
expect(resolveProviderVariant("o1-preview")).toBe("openai");
|
||||
expect(resolveProviderVariant("claude-opus-4-6")).toBe("anthropic");
|
||||
expect(resolveProviderVariant("claude-opus-4-7")).toBe("anthropic");
|
||||
expect(resolveProviderVariant("claude-sonnet-4-6")).toBe("anthropic");
|
||||
});
|
||||
|
||||
@@ -3779,7 +3779,7 @@ describe("qa mock openai server provider variant tagging", () => {
|
||||
method: "POST",
|
||||
headers: { "content-type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
model: "claude-opus-4-6",
|
||||
model: "claude-opus-4-7",
|
||||
max_tokens: 256,
|
||||
messages: [{ role: "user", content: "Heartbeat check" }],
|
||||
}),
|
||||
@@ -3789,7 +3789,7 @@ describe("qa mock openai server provider variant tagging", () => {
|
||||
model: string;
|
||||
providerVariant: string;
|
||||
};
|
||||
expect(debug.model).toBe("claude-opus-4-6");
|
||||
expect(debug.model).toBe("claude-opus-4-7");
|
||||
expect(debug.providerVariant).toBe("anthropic");
|
||||
});
|
||||
|
||||
|
||||
@@ -81,7 +81,7 @@ export function resolveProviderVariant(model: string | undefined): MockOpenAiPro
|
||||
return "anthropic";
|
||||
}
|
||||
// Fall back to model-name prefix matching for bare model strings like
|
||||
// `gpt-5.5` or `claude-opus-4-6`.
|
||||
// `gpt-5.5` or `claude-opus-4-7`.
|
||||
if (/^(?:gpt-|o1-|openai-)/.test(trimmed)) {
|
||||
return "openai";
|
||||
}
|
||||
@@ -2161,7 +2161,7 @@ async function buildResponsesPayload(
|
||||
//
|
||||
// The QA parity gate needs two comparable scenario runs: one against the
|
||||
// "candidate" (openai/gpt-5.5) and one against the "baseline"
|
||||
// (anthropic/claude-opus-4-6). The OpenAI mock above already dispatches all
|
||||
// (anthropic/claude-opus-4-7). The OpenAI mock above already dispatches all
|
||||
// the scenario prompt branches we care about. Rather than duplicating that
|
||||
// machinery, the /v1/messages route below translates Anthropic request
|
||||
// shapes into the shared ResponsesInputItem[] format, calls the same
|
||||
@@ -2384,7 +2384,7 @@ function buildAnthropicMessageResponse(params: {
|
||||
id: `msg_mock_${Math.floor(Math.random() * 1_000_000).toString(16)}`,
|
||||
type: "message",
|
||||
role: "assistant",
|
||||
model: params.model || "claude-opus-4-6",
|
||||
model: params.model || "claude-opus-4-7",
|
||||
content,
|
||||
stop_reason: stopReason,
|
||||
stop_sequence: null,
|
||||
@@ -2412,7 +2412,7 @@ function buildAnthropicMessageStreamEvents(params: {
|
||||
id: messageId,
|
||||
type: "message",
|
||||
role: "assistant",
|
||||
model: params.model || "claude-opus-4-6",
|
||||
model: params.model || "claude-opus-4-7",
|
||||
content: [],
|
||||
stop_reason: null,
|
||||
stop_sequence: null,
|
||||
@@ -2511,7 +2511,7 @@ async function buildMessagesPayload(
|
||||
// which then confuses parity consumers that assume the mock always
|
||||
// echoes the real provider label. Normalize once and reuse everywhere.
|
||||
const normalizedModel =
|
||||
typeof body.model === "string" && body.model.trim() !== "" ? body.model : "claude-opus-4-6";
|
||||
typeof body.model === "string" && body.model.trim() !== "" ? body.model : "claude-opus-4-7";
|
||||
// Dispatch through the same scenario logic the /v1/responses route uses.
|
||||
// Preserve declared tools so route-specific adapters mirror what the
|
||||
// real provider request made available to the model.
|
||||
@@ -2556,7 +2556,7 @@ export async function startQaMockOpenAiServer(params?: { host?: string; port?: n
|
||||
{ id: "gpt-5.5-alt", object: "model" },
|
||||
{ id: "gpt-image-1", object: "model" },
|
||||
{ id: "text-embedding-3-small", object: "model" },
|
||||
{ id: "claude-opus-4-6", object: "model" },
|
||||
{ id: "claude-opus-4-7", object: "model" },
|
||||
{ id: "claude-sonnet-4-6", object: "model" },
|
||||
],
|
||||
});
|
||||
|
||||
@@ -71,8 +71,8 @@ function createMockAnthropicMessagesProvider(baseUrl: string): ModelProviderConf
|
||||
},
|
||||
models: [
|
||||
{
|
||||
id: "claude-opus-4-6",
|
||||
name: "claude-opus-4-6",
|
||||
id: "claude-opus-4-7",
|
||||
name: "claude-opus-4-7",
|
||||
api: "anthropic-messages",
|
||||
reasoning: false,
|
||||
input: ["text", "image"],
|
||||
|
||||
@@ -90,7 +90,7 @@ describe("buildQaGatewayConfig", () => {
|
||||
workspaceDir: "/tmp/qa-workspace",
|
||||
providerMode: "mock-openai",
|
||||
primaryModel: "openai/gpt-5.5",
|
||||
alternateModel: "anthropic/claude-opus-4-6",
|
||||
alternateModel: "anthropic/claude-opus-4-7",
|
||||
});
|
||||
|
||||
expect(getPrimaryModel(cfg.agents?.defaults?.model)).toBe("openai/gpt-5.5");
|
||||
@@ -101,7 +101,7 @@ describe("buildQaGatewayConfig", () => {
|
||||
expect(cfg.models?.providers?.anthropic?.baseUrl).toBe("http://127.0.0.1:44080");
|
||||
expect(cfg.models?.providers?.anthropic?.request).toEqual({ allowPrivateNetwork: true });
|
||||
expect(cfg.models?.providers?.anthropic?.models.map((model) => model.id)).toContain(
|
||||
"claude-opus-4-6",
|
||||
"claude-opus-4-7",
|
||||
);
|
||||
expect(cfg.plugins?.allow).toEqual(["acpx", "memory-core"]);
|
||||
});
|
||||
|
||||
@@ -174,7 +174,7 @@ describe("qa suite planning helpers", () => {
|
||||
makeQaSuiteTestScenario("anthropic-only", {
|
||||
config: {
|
||||
requiredProvider: "anthropic",
|
||||
requiredModel: "claude-opus-4-6",
|
||||
requiredModel: "claude-opus-4-7",
|
||||
},
|
||||
}),
|
||||
];
|
||||
@@ -320,7 +320,7 @@ describe("qa suite planning helpers", () => {
|
||||
config: { requiredProvider: "openai", requiredModel: "gpt-5.5" },
|
||||
}),
|
||||
makeQaSuiteTestScenario("anthropic-only", {
|
||||
config: { requiredProvider: "anthropic", requiredModel: "claude-opus-4-6" },
|
||||
config: { requiredProvider: "anthropic", requiredModel: "claude-opus-4-7" },
|
||||
}),
|
||||
makeQaSuiteTestScenario("claude-subscription", {
|
||||
config: { requiredProvider: "claude-cli", authMode: "subscription" },
|
||||
|
||||
@@ -67,12 +67,12 @@ describe("buildQaSuiteSummaryJson", () => {
|
||||
it("records an Anthropic baseline lane cleanly for parity runs", () => {
|
||||
const json = buildQaSuiteSummaryJson({
|
||||
...baseParams,
|
||||
primaryModel: "anthropic/claude-opus-4-6",
|
||||
primaryModel: "anthropic/claude-opus-4-7",
|
||||
alternateModel: "anthropic/claude-sonnet-4-6",
|
||||
});
|
||||
expect(json.run.primaryModel).toBe("anthropic/claude-opus-4-6");
|
||||
expect(json.run.primaryModel).toBe("anthropic/claude-opus-4-7");
|
||||
expect(json.run.primaryProvider).toBe("anthropic");
|
||||
expect(json.run.primaryModelName).toBe("claude-opus-4-6");
|
||||
expect(json.run.primaryModelName).toBe("claude-opus-4-7");
|
||||
expect(json.run.alternateModel).toBe("anthropic/claude-sonnet-4-6");
|
||||
expect(json.run.alternateProvider).toBe("anthropic");
|
||||
expect(json.run.alternateModelName).toBe("claude-sonnet-4-6");
|
||||
|
||||
@@ -12,7 +12,7 @@ coverage:
|
||||
objective: Verify the regular Anthropic Opus lane can complete a quick chat turn using API-key auth.
|
||||
successCriteria:
|
||||
- A live-frontier run fails fast unless the selected primary provider is anthropic.
|
||||
- The selected primary model is Anthropic Opus 4.6.
|
||||
- The selected primary model is Anthropic Opus 4.7.
|
||||
- The QA gateway worker has an Anthropic API key available through environment auth.
|
||||
- The agent replies through the regular Anthropic provider.
|
||||
docsRefs:
|
||||
@@ -24,10 +24,10 @@ codeRefs:
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: flow
|
||||
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model anthropic/claude-opus-4-6 --alt-model anthropic/claude-opus-4-6 --scenario anthropic-opus-api-key-smoke`.
|
||||
summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model anthropic/claude-opus-4-7 --alt-model anthropic/claude-opus-4-7 --scenario anthropic-opus-api-key-smoke`.
|
||||
config:
|
||||
requiredProvider: anthropic
|
||||
requiredModel: claude-opus-4-6
|
||||
requiredModel: claude-opus-4-7
|
||||
chatPrompt: "Anthropic Opus API key smoke. Reply exactly: ANTHROPIC-OPUS-API-KEY-OK"
|
||||
chatExpected: ANTHROPIC-OPUS-API-KEY-OK
|
||||
```
|
||||
|
||||
@@ -12,7 +12,7 @@ coverage:
|
||||
objective: Verify the regular Anthropic Opus lane can complete a quick chat turn using setup-token auth.
|
||||
successCriteria:
|
||||
- A live-frontier run fails fast unless the selected primary provider is anthropic.
|
||||
- The selected primary model is Anthropic Opus 4.6.
|
||||
- The selected primary model is Anthropic Opus 4.7.
|
||||
- The QA gateway worker stages a token auth profile in the isolated agent store.
|
||||
- The agent replies through the regular Anthropic provider.
|
||||
docsRefs:
|
||||
@@ -24,10 +24,10 @@ codeRefs:
|
||||
- extensions/qa-lab/src/suite.ts
|
||||
execution:
|
||||
kind: flow
|
||||
summary: Run with `OPENCLAW_LIVE_SETUP_TOKEN_VALUE=<setup-token> pnpm openclaw qa suite --provider-mode live-frontier --model anthropic/claude-opus-4-6 --alt-model anthropic/claude-opus-4-6 --scenario anthropic-opus-setup-token-smoke`.
|
||||
summary: Run with `OPENCLAW_LIVE_SETUP_TOKEN_VALUE=<setup-token> pnpm openclaw qa suite --provider-mode live-frontier --model anthropic/claude-opus-4-7 --alt-model anthropic/claude-opus-4-7 --scenario anthropic-opus-setup-token-smoke`.
|
||||
config:
|
||||
requiredProvider: anthropic
|
||||
requiredModel: claude-opus-4-6
|
||||
requiredModel: claude-opus-4-7
|
||||
profileId: "anthropic:qa-setup-token"
|
||||
chatPrompt: "Anthropic Opus setup-token smoke. Reply exactly: ANTHROPIC-OPUS-SETUP-TOKEN-OK"
|
||||
chatExpected: ANTHROPIC-OPUS-SETUP-TOKEN-OK
|
||||
|
||||
@@ -46,7 +46,7 @@ const providerConfig = {
|
||||
extensionId: "openai",
|
||||
secretEnv: "OPENAI_API_KEY",
|
||||
authChoice: "openai-api-key",
|
||||
model: "openai/gpt-5.4",
|
||||
model: "openai/gpt-5.5",
|
||||
baseUrl: "https://api.openai.com/v1",
|
||||
timeoutSeconds: CROSS_OS_AGENT_TURN_TIMEOUT_SECONDS,
|
||||
},
|
||||
|
||||
@@ -1047,9 +1047,9 @@ describe("run-node script", () => {
|
||||
"qa",
|
||||
"parity-report",
|
||||
"--candidate-summary",
|
||||
".artifacts/qa-e2e/gpt54/qa-suite-summary.json",
|
||||
".artifacts/qa-e2e/openai-candidate/qa-suite-summary.json",
|
||||
"--baseline-summary",
|
||||
".artifacts/qa-e2e/opus46/qa-suite-summary.json",
|
||||
".artifacts/qa-e2e/anthropic-baseline/qa-suite-summary.json",
|
||||
],
|
||||
env: {
|
||||
...process.env,
|
||||
@@ -1068,9 +1068,9 @@ describe("run-node script", () => {
|
||||
"tsx",
|
||||
path.join(tmp, "scripts", "qa-parity-report.ts"),
|
||||
"--candidate-summary",
|
||||
".artifacts/qa-e2e/gpt54/qa-suite-summary.json",
|
||||
".artifacts/qa-e2e/openai-candidate/qa-suite-summary.json",
|
||||
"--baseline-summary",
|
||||
".artifacts/qa-e2e/opus46/qa-suite-summary.json",
|
||||
".artifacts/qa-e2e/anthropic-baseline/qa-suite-summary.json",
|
||||
],
|
||||
]);
|
||||
});
|
||||
|
||||
@@ -100,17 +100,17 @@ const modelCatalogMocks = getSharedMocks("openclaw.trigger-handling.model-catalo
|
||||
loadModelCatalog: vi.fn().mockResolvedValue([
|
||||
{
|
||||
provider: "anthropic",
|
||||
id: "claude-opus-4-6",
|
||||
name: "Claude Opus 4.5",
|
||||
id: "claude-opus-4-7",
|
||||
name: "Claude Opus 4.7",
|
||||
contextWindow: 200000,
|
||||
},
|
||||
{
|
||||
provider: "openrouter",
|
||||
id: "anthropic/claude-opus-4-6",
|
||||
name: "Claude Opus 4.5 (OpenRouter)",
|
||||
id: "anthropic/claude-opus-4-7",
|
||||
name: "Claude Opus 4.7 (OpenRouter)",
|
||||
contextWindow: 200000,
|
||||
},
|
||||
{ provider: "openai", id: "gpt-5.4-mini", name: "GPT-5.4 mini" },
|
||||
{ provider: "openai", id: "gpt-5.5-mini", name: "GPT-5.5 mini" },
|
||||
{ provider: "openai", id: "gpt-5.5", name: "GPT-5.5" },
|
||||
{ provider: "openai-codex", id: "gpt-5.5", name: "GPT-5.5 (Codex)" },
|
||||
{ provider: "minimax", id: "MiniMax-M2.7", name: "MiniMax M2.7" },
|
||||
@@ -284,7 +284,7 @@ export function makeCfg(home: string): OpenClawConfig {
|
||||
return withFastReplyConfig({
|
||||
agents: {
|
||||
defaults: {
|
||||
model: { primary: "anthropic/claude-opus-4-6" },
|
||||
model: { primary: "anthropic/claude-opus-4-7" },
|
||||
workspace: join(home, "openclaw"),
|
||||
// Test harness: avoid 1s coalescer idle sleeps that dominate trigger suites.
|
||||
blockStreamingCoalesce: { idleMs: 1 },
|
||||
|
||||
@@ -205,10 +205,10 @@ describe("scripts/openclaw-cross-os-release-checks", () => {
|
||||
OPENCLAW_CROSS_OS_MODEL: "openai/gpt-5.4-nano",
|
||||
})?.model,
|
||||
).toBe("openai/gpt-5.4-nano");
|
||||
expect(resolveProviderConfig("openai", {})?.model).toBe("openai/gpt-5.4");
|
||||
expect(resolveProviderConfig("openai", {})?.model).toBe("openai/gpt-5.5");
|
||||
});
|
||||
|
||||
it("keeps release cross-OS OpenAI smoke on GPT-5.4", () => {
|
||||
it("keeps release cross-OS OpenAI smoke on GPT-5.5", () => {
|
||||
const workflow = readFileSync(
|
||||
".github/workflows/openclaw-cross-os-release-checks-reusable.yml",
|
||||
"utf8",
|
||||
@@ -216,9 +216,9 @@ describe("scripts/openclaw-cross-os-release-checks", () => {
|
||||
const releaseChecks = readFileSync(".github/workflows/openclaw-release-checks.yml", "utf8");
|
||||
|
||||
expect(workflow).toContain(
|
||||
"OPENCLAW_CROSS_OS_OPENAI_MODEL: ${{ inputs.openai_model || vars.OPENCLAW_CROSS_OS_OPENAI_MODEL || 'openai/gpt-5.4' }}",
|
||||
"OPENCLAW_CROSS_OS_OPENAI_MODEL: ${{ inputs.openai_model || vars.OPENCLAW_CROSS_OS_OPENAI_MODEL || 'openai/gpt-5.5' }}",
|
||||
);
|
||||
expect(releaseChecks).toContain("openai_model: openai/gpt-5.4");
|
||||
expect(releaseChecks).toContain("openai_model: openai/gpt-5.5");
|
||||
});
|
||||
|
||||
it("keeps release smoke plugin allowlists focused on agent-turn essentials", () => {
|
||||
|
||||
@@ -413,7 +413,7 @@ describe("package artifact reuse", () => {
|
||||
expect(workflow).toContain("suite_id: native-live-src-gateway-profiles-anthropic-opus");
|
||||
expect(workflow).toContain("suite_id: native-live-src-gateway-profiles-anthropic-sonnet-haiku");
|
||||
expect(workflow).toContain("suite_group: native-live-src-gateway-profiles-anthropic");
|
||||
expect(workflow).toContain("anthropic/claude-opus-4-7,anthropic/claude-opus-4-6");
|
||||
expect(workflow).toContain("OPENCLAW_LIVE_GATEWAY_MODELS=anthropic/claude-opus-4-7");
|
||||
expect(workflow).toContain("anthropic/claude-sonnet-4-6,anthropic/claude-haiku-4-5");
|
||||
expect(workflow).toMatch(
|
||||
/suite_id: native-live-src-gateway-profiles-fireworks[\s\S]*?advisory: true/u,
|
||||
|
||||
Reference in New Issue
Block a user