diff --git a/.agents/skills/openclaw-docker-e2e-authoring/SKILL.md b/.agents/skills/openclaw-docker-e2e-authoring/SKILL.md new file mode 100644 index 000000000000..8703e75e822f --- /dev/null +++ b/.agents/skills/openclaw-docker-e2e-authoring/SKILL.md @@ -0,0 +1,64 @@ +--- +name: openclaw-docker-e2e-authoring +description: "Author OpenClaw Docker E2E and live provider Docker lanes." +--- + +# OpenClaw Docker E2E Authoring + +Use this when adding or changing Docker E2E lanes, release-path Docker tests, +or live-provider Docker proof. + +## Lane Choice + +- Deterministic Docker: fake the dependency/server and assert the exact runtime + contract crossing the boundary. +- Live Docker: use real provider credentials/model only when user-visible + behavior needs the real service. +- Prefer both when they prove different risks: deterministic for byte/payload + routing, live for actual provider behavior. + +## Authoring Rules + +- Test-only helpers live in `test/helpers` or `scripts/e2e/lib/<lane>/`, not + `src/**`, unless production imports them. +- Package-installed app runs from `/app`; mount only explicit harness/helper + paths read-only. +- Fake servers should log boundary requests as JSONL and clients should assert + the real dependency payload, not just process success. +- Add the package script and `scripts/lib/docker-e2e-scenarios.mjs` lane in the + same change. +- If a lane installs a plugin from npm, default the spec via env so published + and local override paths are both testable. + +## Media And Vision + +- Expected answer must exist only in pixels or provider output being tested. +- Use neutral filenames, neutral prompts, and no metadata leaks. +- Random bitmap/OCR tokens reuse the repo OCR-safe alphabet `24567ACEF` unless + the test owns a stronger glyph set. +- Make the expected answer unique per run when proving real image + understanding. + +## `chat.send` E2E + +- Require `chat.send` to return `status: "started"` and a string `runId`. 
+- Wait for completion with `agent.wait`. +- Assert final user-visible text via `chat.history` when event ordering is not + the behavior under test. +- Keep originating channel/account metadata only when the bug path needs queued + inbound/channel context. + +## Verification + +Run the smallest proof that covers the touched lane: + +```bash +pnpm exec oxfmt --write <changed-files> +node --check <changed-script.mjs> +bash -n <changed-script.sh> +node scripts/run-vitest.mjs test/scripts/docker-e2e-plan.test.ts +OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:<lane> +``` + +For real-provider lanes, run the matching live Docker script after deterministic +Docker is green. Finish with `$autoreview` before commit/PR. diff --git a/config/knip.config.ts b/config/knip.config.ts index 8f0ed5e14e59..77d74f5222a2 100644 --- a/config/knip.config.ts +++ b/config/knip.config.ts @@ -125,7 +125,7 @@ const config = { "**/*.test-helpers.ts", "**/*.test-mocks.ts", "**/*.test-utils.ts", - "src/gateway/live-image-probe.ts", + "test/helpers/live-image-probe.ts", "src/secrets/credential-matrix.ts", "src/agents/claude-cli-runner.ts", "src/agents/pi-auth-json.ts", diff --git a/docs/help/testing-live.md b/docs/help/testing-live.md index f12f2237dc4c..0b8810ec57f0 100644 --- a/docs/help/testing-live.md +++ b/docs/help/testing-live.md @@ -103,7 +103,7 @@ Live tests are split into two layers so we can isolate failures: - `read` probe: the test writes a nonce file in the workspace and asks the agent to `read` it and echo the nonce back. - `exec+read` probe: the test asks the agent to `exec`-write a nonce into a temp file, then `read` it back. - image probe: the test attaches a generated PNG (cat + randomized code) and expects the model to return `cat <CODE>`. - - Implementation reference: `src/gateway/gateway-models.profiles.live.test.ts` and `src/gateway/live-image-probe.ts`. + - Implementation reference: `src/gateway/gateway-models.profiles.live.test.ts` and `test/helpers/live-image-probe.ts`. 
- How to enable: - `pnpm test:live` (or `OPENCLAW_LIVE_TEST=1` if invoking Vitest directly) - How to select models: @@ -117,7 +117,7 @@ Live tests are split into two layers so we can isolate failures: - `read` probe + `exec+read` probe (tool stress) - image probe runs when the model advertises image input support - Flow (high level): - - Test generates a tiny PNG with "CAT" + random code (`src/gateway/live-image-probe.ts`) + - Test generates a tiny PNG with "CAT" + random code (`test/helpers/live-image-probe.ts`) - Sends it via `agent` `attachments: [{ mimeType: "image/png", content: "" }]` - Gateway parses attachments into `images[]` (`src/gateway/server-methods/agent.ts` + `src/gateway/chat-attachments.ts`) - Embedded agent forwards a multimodal user message to the model diff --git a/package.json b/package.json index 830173168af0..a1706b384066 100644 --- a/package.json +++ b/package.json @@ -1603,6 +1603,7 @@ "test:docker:crestodian-planner": "bash scripts/e2e/crestodian-planner-docker.sh", "test:docker:crestodian-rescue": "bash scripts/e2e/crestodian-rescue-docker.sh", "test:docker:cron-mcp-cleanup": "bash scripts/e2e/cron-mcp-cleanup-docker.sh", + "test:docker:codex-media-path": "bash scripts/e2e/codex-media-path-docker.sh", "test:docker:doctor-switch": "bash scripts/e2e/doctor-install-switch-docker.sh", "test:docker:e2e-build": "bash scripts/e2e/build-image.sh", "test:docker:gateway-network": "bash scripts/e2e/gateway-network-docker.sh", @@ -1624,6 +1625,7 @@ "test:docker:live-cli-backend:gemini:resume": "OPENCLAW_LIVE_CLI_BACKEND_MODEL=google-gemini-cli/gemini-3-flash-preview OPENCLAW_LIVE_CLI_BACKEND_RESUME_PROBE=1 bash scripts/test-live-cli-backend-docker.sh", "test:docker:live-codex-bind": "OPENCLAW_LIVE_CODEX_BIND=1 OPENCLAW_LIVE_CODEX_TEST_FILES=src/gateway/gateway-codex-bind.live.test.ts bash scripts/test-live-codex-harness-docker.sh", "test:docker:live-codex-harness": "bash scripts/test-live-codex-harness-docker.sh", + 
"test:docker:live-codex-media-path": "OPENCLAW_LIVE_CODEX_HARNESS_AUTH=api-key OPENCLAW_LIVE_CODEX_HARNESS_CHAT_IMAGE_PROBE=1 OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE=0 OPENCLAW_LIVE_CODEX_HARNESS_MCP_PROBE=0 OPENCLAW_LIVE_CODEX_HARNESS_SUBAGENT_PROBE=0 OPENCLAW_LIVE_CODEX_HARNESS_GUARDIAN_PROBE=0 bash scripts/test-live-codex-harness-docker.sh", "test:docker:live-codex-npm-plugin": "bash scripts/e2e/codex-npm-plugin-live-docker.sh", "test:docker:live-plugin-tool": "bash scripts/e2e/live-plugin-tool-docker.sh", "test:docker:live-subagent-announce": "bash scripts/test-live-subagent-announce-docker.sh", diff --git a/scripts/e2e/codex-media-path-docker.sh b/scripts/e2e/codex-media-path-docker.sh new file mode 100644 index 000000000000..e9e77f5ab966 --- /dev/null +++ b/scripts/e2e/codex-media-path-docker.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +source "$ROOT_DIR/scripts/lib/docker-e2e-image.sh" + +IMAGE_NAME="$(docker_e2e_resolve_image "openclaw-codex-media-path-e2e" OPENCLAW_CODEX_MEDIA_PATH_E2E_IMAGE)" +PORT="${OPENCLAW_CODEX_MEDIA_PATH_PORT:-18790}" +TOKEN="codex-media-path-e2e-$$" +CODEX_PLUGIN_SPEC="${OPENCLAW_CODEX_MEDIA_PATH_PLUGIN_SPEC:-npm:@openclaw/codex}" + +docker_e2e_build_or_reuse "$IMAGE_NAME" codex-media-path "$ROOT_DIR/scripts/e2e/Dockerfile" "$ROOT_DIR" +OPENCLAW_TEST_STATE_SCRIPT_B64="$(docker_e2e_test_state_shell_b64 codex-media-path empty)" + +echo "Running Codex media-path Docker E2E..." 
+docker_e2e_run_logged_with_harness codex-media-path \ + -e COREPACK_ENABLE_DOWNLOAD_PROMPT=0 \ + -e "OPENCLAW_CODEX_MEDIA_PATH_PLUGIN_SPEC=$CODEX_PLUGIN_SPEC" \ + -e "OPENCLAW_CODEX_MEDIA_PATH_TIMEOUT_SECONDS=${OPENCLAW_CODEX_MEDIA_PATH_TIMEOUT_SECONDS:-180}" \ + -e "OPENCLAW_ALLOW_INSECURE_PRIVATE_WS=1" \ + -e "OPENCLAW_GATEWAY_TOKEN=$TOKEN" \ + -e "OPENCLAW_TEST_STATE_SCRIPT_B64=$OPENCLAW_TEST_STATE_SCRIPT_B64" \ + -e "PORT=$PORT" \ + -v "$ROOT_DIR/src:/app/src:ro" \ + -v "$ROOT_DIR/test/helpers:/app/test/helpers:ro" \ + "$IMAGE_NAME" \ + bash scripts/e2e/lib/codex-media-path/scenario.sh diff --git a/scripts/e2e/lib/codex-media-path/client.mjs b/scripts/e2e/lib/codex-media-path/client.mjs new file mode 100644 index 000000000000..124e47fbf85b --- /dev/null +++ b/scripts/e2e/lib/codex-media-path/client.mjs @@ -0,0 +1,259 @@ +import { createHash, randomBytes, randomUUID } from "node:crypto"; +import fs from "node:fs"; +import { setTimeout as delay } from "node:timers/promises"; +import { WebSocket } from "ws"; +import { PROTOCOL_VERSION } from "../../../../dist/gateway/protocol/index.js"; +import { renderBitmapTextPngBase64 } from "../../../../test/helpers/live-image-probe.ts"; + +const port = process.env.PORT; +const token = process.env.OPENCLAW_GATEWAY_TOKEN; +const appServerLog = + process.env.OPENCLAW_CODEX_MEDIA_PATH_APP_SERVER_LOG ?? + "/tmp/openclaw-codex-media-path-app-server.jsonl"; +const timeoutSeconds = Number.parseInt( + process.env.OPENCLAW_CODEX_MEDIA_PATH_TIMEOUT_SECONDS ?? 
"180", + 10, +); + +if (!port || !token) { + throw new Error("missing PORT/OPENCLAW_GATEWAY_TOKEN"); +} + +function assert(condition, message) { + if (!condition) { + throw new Error(message); + } +} + +function sha256Base64(data) { + return createHash("sha256").update(Buffer.from(data, "base64")).digest("hex"); +} + +function readLoggedRequests() { + if (!fs.existsSync(appServerLog)) { + return []; + } + return fs + .readFileSync(appServerLog, "utf8") + .split("\n") + .filter(Boolean) + .map((line) => JSON.parse(line)); +} + +async function waitFor(label, predicate, timeoutMs) { + const started = Date.now(); + while (Date.now() - started < timeoutMs) { + const value = await predicate(); + if (value !== undefined) { + return value; + } + await delay(50); + } + throw new Error(`timeout waiting for ${label}`); +} + +function wsDataToString(data) { + if (typeof data === "string") { + return data; + } + if (Buffer.isBuffer(data)) { + return data.toString("utf8"); + } + if (Array.isArray(data)) { + return Buffer.concat(data).toString("utf8"); + } + return Buffer.from(data).toString("utf8"); +} + +async function connectGateway() { + const ws = new WebSocket(`ws://127.0.0.1:${port}`); + await new Promise((resolve, reject) => { + const timer = setTimeout(() => reject(new Error("gateway ws open timeout")), 45_000); + timer.unref?.(); + ws.once("open", () => { + clearTimeout(timer); + resolve(); + }); + ws.once("error", (error) => { + clearTimeout(timer); + reject(error); + }); + }); + + const events = []; + const pending = new Map(); + ws.on("message", (data) => { + let frame; + try { + frame = JSON.parse(wsDataToString(data)); + } catch { + return; + } + if (frame?.type === "event" && typeof frame.event === "string") { + events.push({ + event: frame.event, + payload: frame.payload && typeof frame.payload === "object" ? 
frame.payload : {}, + }); + return; + } + if (frame?.type !== "res" || typeof frame.id !== "string") { + return; + } + const match = pending.get(frame.id); + if (!match) { + return; + } + pending.delete(frame.id); + if (frame.ok === true) { + match.resolve(frame.payload ?? frame.result); + return; + } + match.reject(new Error(frame.error?.message ?? "gateway request failed")); + }); + ws.once("close", (code, reason) => { + const error = new Error(`gateway closed (${code}): ${wsDataToString(reason)}`); + for (const entry of pending.values()) { + entry.reject(error); + } + pending.clear(); + }); + + function request(method, params, opts = {}) { + const id = randomUUID(); + const timeoutMs = opts.timeoutMs ?? 60_000; + return new Promise((resolve, reject) => { + const timer = setTimeout(() => { + pending.delete(id); + reject(new Error(`gateway request timeout: ${method}`)); + }, timeoutMs); + timer.unref?.(); + pending.set(id, { + resolve: (value) => { + clearTimeout(timer); + resolve(value); + }, + reject: (error) => { + clearTimeout(timer); + reject(error); + }, + }); + ws.send(JSON.stringify({ type: "req", id, method, params: params ?? 
{} })); + }); + } + + await request( + "connect", + { + minProtocol: PROTOCOL_VERSION, + maxProtocol: PROTOCOL_VERSION, + client: { + id: "gateway-client", + displayName: "docker-codex-media-path", + version: "1.0.0", + platform: process.platform, + mode: "backend", + }, + role: "operator", + scopes: ["operator.read", "operator.write", "operator.admin"], + caps: [], + auth: { token }, + }, + { timeoutMs: 60_000 }, + ); + await request("sessions.subscribe", {}, { timeoutMs: 60_000 }); + + return { + events, + request, + async close() { + if (ws.readyState === WebSocket.CLOSED) { + return; + } + await new Promise((resolve) => { + const timer = setTimeout(resolve, 2_000); + timer.unref?.(); + ws.once("close", () => { + clearTimeout(timer); + resolve(); + }); + ws.close(); + }); + }, + }; +} + +const gateway = await connectGateway(); + +function randomBitmapTextToken(length = 6) { + const alphabet = "24567ACEF"; + return [...randomBytes(length)].map((byte) => alphabet[byte % alphabet.length]).join(""); +} + +try { + const expectedToken = randomBitmapTextToken(); + const imageBase64 = renderBitmapTextPngBase64(expectedToken); + const expectedHash = sha256Base64(imageBase64); + const runId = `codex-media-path-${randomUUID()}`; + const started = Date.now(); + + const response = await gateway.request( + "chat.send", + { + sessionKey: "agent:main:codex-media-path-e2e", + idempotencyKey: runId, + message: "Read the code printed in the attached image. 
Reply only the code.", + attachments: [ + { + mimeType: "image/png", + fileName: "codex-media-path-probe.png", + content: imageBase64, + }, + ], + originatingChannel: "codex-media-path-e2e", + originatingTo: "codex-media-path-e2e", + originatingAccountId: "codex-media-path-e2e", + }, + { timeoutMs: timeoutSeconds * 1000 }, + ); + assert(response?.status === "started", `chat.send did not start: ${JSON.stringify(response)}`); + + const turnRequest = await waitFor( + "Codex turn/start image input", + () => + readLoggedRequests().find((request) => { + if (request.method !== "turn/start") { + return undefined; + } + const imageInput = request.params?.input?.find?.( + (entry) => entry?.type === "image" && typeof entry.url === "string", + ); + return imageInput ? request : undefined; + }), + timeoutSeconds * 1000, + ); + + const imageInput = turnRequest.params.input.find((entry) => entry?.type === "image"); + const imageUrl = imageInput.url; + assert( + imageUrl.startsWith("data:image/png;base64,"), + `turn/start image input is not an inline PNG: ${JSON.stringify(imageInput)}`, + ); + const actualBase64 = imageUrl.slice("data:image/png;base64,".length); + const actualHash = sha256Base64(actualBase64); + assert( + actualHash === expectedHash, + `forwarded PNG hash mismatch: expected ${expectedHash}, got ${actualHash}`, + ); + + await delay(50); + console.log( + JSON.stringify({ + ok: true, + elapsedMs: Date.now() - started, + expectedToken, + imageSha256: actualHash, + }), + ); +} finally { + await gateway.close(); +} diff --git a/scripts/e2e/lib/codex-media-path/fake-codex-app-server.mjs b/scripts/e2e/lib/codex-media-path/fake-codex-app-server.mjs new file mode 100644 index 000000000000..0aaab844112a --- /dev/null +++ b/scripts/e2e/lib/codex-media-path/fake-codex-app-server.mjs @@ -0,0 +1,87 @@ +import fs from "node:fs"; +import readline from "node:readline"; + +const requestLog = + process.env.OPENCLAW_CODEX_MEDIA_PATH_APP_SERVER_LOG ?? 
+ "/tmp/openclaw-codex-media-path-app-server.jsonl"; +let turnCount = 0; + +function appendRequest(request) { + fs.appendFileSync(requestLog, `${JSON.stringify(request)}\n`); +} + +function send(id, result) { + process.stdout.write(`${JSON.stringify({ id, result })}\n`); +} + +const rl = readline.createInterface({ input: process.stdin }); +rl.on("line", (line) => { + if (!line.trim()) { + return; + } + const request = JSON.parse(line); + appendRequest(request); + const { id, method, params } = request; + if (method === "initialize") { + send(id, { + protocolVersion: "2", + serverInfo: { name: "openclaw-codex-media-path-e2e", version: "0.125.0" }, + userAgent: "openclaw-codex-media-path-e2e/0.125.0 (Docker; test)", + }); + return; + } + if (method === "thread/start") { + const now = Date.now(); + send(id, { + thread: { + id: "thread-codex-media-path-e2e", + sessionId: "session-codex-media-path-e2e", + forkedFromId: null, + preview: "", + ephemeral: false, + modelProvider: "openai", + createdAt: now, + updatedAt: now, + cwd: params?.cwd ?? process.cwd(), + status: { type: "idle" }, + path: null, + cliVersion: "0.125.0", + source: "unknown", + agentNickname: null, + agentRole: null, + gitInfo: null, + name: null, + turns: [], + }, + model: params?.model ?? "gpt-5.5", + modelProvider: "openai", + serviceTier: null, + cwd: params?.cwd ?? process.cwd(), + instructionSources: [], + approvalPolicy: params?.approvalPolicy ?? "never", + approvalsReviewer: params?.approvalsReviewer ?? 
"user", + sandbox: { type: "dangerFullAccess" }, + permissionProfile: null, + reasoningEffort: null, + }); + return; + } + if (method === "turn/start") { + turnCount += 1; + send(id, { + turn: { + id: `turn-codex-media-path-e2e-${turnCount}`, + status: "completed", + items: [ + { + type: "agentMessage", + id: `msg-codex-media-path-e2e-${turnCount}`, + text: "CODEX_MEDIA_PATH_E2E_OK", + }, + ], + }, + }); + return; + } + send(id, {}); +}); diff --git a/scripts/e2e/lib/codex-media-path/scenario.sh b/scripts/e2e/lib/codex-media-path/scenario.sh new file mode 100644 index 000000000000..a98883a69fce --- /dev/null +++ b/scripts/e2e/lib/codex-media-path/scenario.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +set -euo pipefail + +source scripts/lib/openclaw-e2e-instance.sh +openclaw_e2e_eval_test_state_from_b64 "${OPENCLAW_TEST_STATE_SCRIPT_B64:?missing OPENCLAW_TEST_STATE_SCRIPT_B64}" +export OPENCLAW_SKIP_CHANNELS=1 +export OPENCLAW_SKIP_GMAIL_WATCHER=1 +export OPENCLAW_SKIP_CRON=1 +export OPENCLAW_SKIP_CANVAS_HOST=1 +export OPENCLAW_SKIP_BROWSER_CONTROL_SERVER=1 +export OPENCLAW_SKIP_ACPX_RUNTIME=1 +export OPENCLAW_SKIP_ACPX_RUNTIME_PROBE=1 +export OPENCLAW_AGENT_HARNESS_FALLBACK=none +export OPENCLAW_CODEX_MEDIA_PATH_APP_SERVER_LOG="/tmp/openclaw-codex-media-path-app-server.jsonl" + +PORT="${PORT:?missing PORT}" +TOKEN="${OPENCLAW_GATEWAY_TOKEN:?missing OPENCLAW_GATEWAY_TOKEN}" +PLUGIN_SPEC="${OPENCLAW_CODEX_MEDIA_PATH_PLUGIN_SPEC:?missing OPENCLAW_CODEX_MEDIA_PATH_PLUGIN_SPEC}" +GATEWAY_LOG="/tmp/openclaw-codex-media-path-gateway.log" +CLIENT_LOG="/tmp/openclaw-codex-media-path-client.log" +PLUGIN_INSTALL_LOG="/tmp/openclaw-codex-media-path-plugin-install.log" +PLUGIN_INSPECT_LOG="/tmp/openclaw-codex-media-path-plugin-inspect.json" +gateway_pid="" + +cleanup() { + openclaw_e2e_stop_process "$gateway_pid" +} +trap cleanup EXIT + +dump_debug_logs() { + local status="$1" + echo "Codex media-path Docker E2E failed with exit code $status" >&2 + openclaw_e2e_dump_logs 
"$PLUGIN_INSTALL_LOG" "$PLUGIN_INSPECT_LOG" "$GATEWAY_LOG" "$CLIENT_LOG" "$OPENCLAW_CODEX_MEDIA_PATH_APP_SERVER_LOG" +} +trap 'status=$?; dump_debug_logs "$status"; exit "$status"' ERR + +entry="$(openclaw_e2e_resolve_entrypoint)" +mkdir -p "$OPENCLAW_STATE_DIR" "$OPENCLAW_TEST_WORKSPACE_DIR" +rm -f "$OPENCLAW_CODEX_MEDIA_PATH_APP_SERVER_LOG" + +echo "Installing Codex plugin: $PLUGIN_SPEC" +openclaw plugins install "$PLUGIN_SPEC" --force >"$PLUGIN_INSTALL_LOG" 2>&1 +openclaw plugins inspect codex --runtime --json >"$PLUGIN_INSPECT_LOG" + +node scripts/e2e/lib/codex-media-path/write-config.mjs + +gateway_pid="$(openclaw_e2e_start_gateway "$entry" "$PORT" "$GATEWAY_LOG")" +openclaw_e2e_wait_gateway_ready "$gateway_pid" "$GATEWAY_LOG" 480 + +PORT="$PORT" OPENCLAW_GATEWAY_TOKEN="$TOKEN" \ + tsx scripts/e2e/lib/codex-media-path/client.mjs >"$CLIENT_LOG" 2>&1 + +cat "$CLIENT_LOG" +echo "Codex media-path Docker E2E passed" diff --git a/scripts/e2e/lib/codex-media-path/write-config.mjs b/scripts/e2e/lib/codex-media-path/write-config.mjs new file mode 100644 index 000000000000..c146daeea304 --- /dev/null +++ b/scripts/e2e/lib/codex-media-path/write-config.mjs @@ -0,0 +1,76 @@ +import fs from "node:fs"; +import path from "node:path"; + +function requireEnv(name) { + const value = process.env[name]; + if (!value) { + throw new Error(`missing ${name}`); + } + return value; +} + +const configPath = requireEnv("OPENCLAW_CONFIG_PATH"); +const stateDir = requireEnv("OPENCLAW_STATE_DIR"); +const workspaceDir = requireEnv("OPENCLAW_TEST_WORKSPACE_DIR"); +const token = requireEnv("OPENCLAW_GATEWAY_TOKEN"); +const timeoutSeconds = Number.parseInt( + process.env.OPENCLAW_CODEX_MEDIA_PATH_TIMEOUT_SECONDS ?? "180", + 10, +); + +const config = { + gateway: { + port: Number.parseInt(process.env.PORT ?? 
"18790", 10), + bind: "loopback", + auth: { mode: "token", token }, + controlUi: { enabled: false }, + }, + plugins: { + enabled: true, + allow: ["codex"], + entries: { + codex: { + enabled: true, + config: { + appServer: { + mode: "yolo", + command: "node", + args: ["scripts/e2e/lib/codex-media-path/fake-codex-app-server.mjs"], + requestTimeoutMs: timeoutSeconds * 1000, + turnCompletionIdleTimeoutMs: timeoutSeconds * 1000, + }, + }, + }, + }, + }, + agents: { + defaults: { + agentRuntime: { id: "codex" }, + model: { primary: "codex/gpt-5.5", fallbacks: [] }, + models: { + "codex/gpt-5.5": { + agentRuntime: { id: "codex" }, + }, + }, + workspace: workspaceDir, + skipBootstrap: true, + timeoutSeconds, + sandbox: { mode: "off" }, + }, + list: [ + { + id: "main", + default: true, + agentRuntime: { id: "codex" }, + model: { primary: "codex/gpt-5.5", fallbacks: [] }, + workspace: workspaceDir, + }, + ], + }, + skills: { allowBundled: [] }, +}; + +fs.mkdirSync(path.dirname(configPath), { recursive: true }); +fs.mkdirSync(workspaceDir, { recursive: true }); +fs.writeFileSync(configPath, `${JSON.stringify(config, null, 2)}\n`); +fs.mkdirSync(path.join(stateDir, "logs"), { recursive: true }); diff --git a/scripts/lib/docker-e2e-scenarios.mjs b/scripts/lib/docker-e2e-scenarios.mjs index 39be8fe4ab18..cb3518e8f825 100644 --- a/scripts/lib/docker-e2e-scenarios.mjs +++ b/scripts/lib/docker-e2e-scenarios.mjs @@ -223,6 +223,15 @@ export const mainLanes = [ stateScenario: "empty", weight: 3, }), + serviceLane( + "codex-media-path", + "OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:codex-media-path", + { + resources: ["npm"], + stateScenario: "empty", + weight: 3, + }, + ), npmLane( "npm-onboard-channel-agent", "OPENCLAW_SKIP_DOCKER_BUILD=1 pnpm test:docker:npm-onboard-channel-agent", @@ -436,6 +445,20 @@ export const tailLanes = [ timeoutMs: LIVE_ACP_TIMEOUT_MS, weight: 3, }), + liveLane( + "live-codex-media-path", + liveDockerScriptCommand( + "test-live-codex-harness-docker.sh", + 
"OPENCLAW_LIVE_CODEX_HARNESS_AUTH=api-key OPENCLAW_LIVE_CODEX_HARNESS_CHAT_IMAGE_PROBE=1 OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE=0 OPENCLAW_LIVE_CODEX_HARNESS_MCP_PROBE=0 OPENCLAW_LIVE_CODEX_HARNESS_SUBAGENT_PROBE=0 OPENCLAW_LIVE_CODEX_HARNESS_GUARDIAN_PROBE=0", + ), + { + cacheKey: "codex-harness", + provider: "openai", + resources: ["npm"], + timeoutMs: LIVE_ACP_TIMEOUT_MS, + weight: 3, + }, + ), liveLane( "live-subagent-announce", liveDockerScriptCommand("test-live-subagent-announce-docker.sh"), diff --git a/scripts/test-live-codex-harness-docker.sh b/scripts/test-live-codex-harness-docker.sh index 96c754b0f7d9..cc1437a644f6 100644 --- a/scripts/test-live-codex-harness-docker.sh +++ b/scripts/test-live-codex-harness-docker.sh @@ -286,6 +286,7 @@ OPENCLAW_LIVE_DOCKER_REPO_ROOT="$ROOT_DIR" "$TRUSTED_HARNESS_DIR/scripts/test-li echo "==> Run Codex harness live test in Docker" echo "==> Model: ${OPENCLAW_LIVE_CODEX_HARNESS_MODEL:-codex/gpt-5.5}" +echo "==> Chat image probe: ${OPENCLAW_LIVE_CODEX_HARNESS_CHAT_IMAGE_PROBE:-0}" echo "==> Image probe: ${OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE:-1}" echo "==> MCP probe: ${OPENCLAW_LIVE_CODEX_HARNESS_MCP_PROBE:-1}" echo "==> Subagent probe: ${OPENCLAW_LIVE_CODEX_HARNESS_SUBAGENT_PROBE:-1}" @@ -316,6 +317,7 @@ DOCKER_RUN_ARGS=(docker run --rm -t \ -e OPENCLAW_LIVE_DOCKER_SOURCE_STAGE_MODE="${OPENCLAW_LIVE_DOCKER_SOURCE_STAGE_MODE:-copy}" \ -e OPENCLAW_LIVE_CODEX_HARNESS_AUTH="$CODEX_HARNESS_AUTH_MODE" \ -e OPENCLAW_LIVE_CODEX_HARNESS=1 \ + -e OPENCLAW_LIVE_CODEX_HARNESS_CHAT_IMAGE_PROBE="${OPENCLAW_LIVE_CODEX_HARNESS_CHAT_IMAGE_PROBE:-0}" \ -e OPENCLAW_LIVE_CODEX_HARNESS_DEBUG="${OPENCLAW_LIVE_CODEX_HARNESS_DEBUG:-}" \ -e OPENCLAW_LIVE_CODEX_HARNESS_GUARDIAN_PROBE="${OPENCLAW_LIVE_CODEX_HARNESS_GUARDIAN_PROBE:-1}" \ -e OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE="${OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE:-1}" \ diff --git a/src/gateway/gateway-acp-bind.live.test.ts b/src/gateway/gateway-acp-bind.live.test.ts index 
87f623c9f5ab..d0218668ce06 100644 --- a/src/gateway/gateway-acp-bind.live.test.ts +++ b/src/gateway/gateway-acp-bind.live.test.ts @@ -4,6 +4,7 @@ import net from "node:net"; import os from "node:os"; import path from "node:path"; import { describe, expect, it } from "vitest"; +import { renderCatFacePngBase64 } from "../../test/helpers/live-image-probe.js"; import { getAcpRuntimeBackend } from "../acp/runtime/registry.js"; import { isLiveTestEnabled } from "../agents/live-test-helpers.js"; import { @@ -32,7 +33,6 @@ import { runOpenClawCliJson, shouldRunLiveImageProbe, } from "./live-agent-probes.js"; -import { renderCatFacePngBase64 } from "./live-image-probe.js"; import { startGatewayServer } from "./server.js"; const LIVE = isLiveTestEnabled(); diff --git a/src/gateway/gateway-cli-backend.live-probe-helpers.ts b/src/gateway/gateway-cli-backend.live-probe-helpers.ts index ecda8a6be519..ec8dc16551bb 100644 --- a/src/gateway/gateway-cli-backend.live-probe-helpers.ts +++ b/src/gateway/gateway-cli-backend.live-probe-helpers.ts @@ -1,4 +1,5 @@ import { randomUUID } from "node:crypto"; +import { renderCatFacePngBase64 } from "../../test/helpers/live-image-probe.js"; import { isTruthyEnvValue } from "../infra/env.js"; import { normalizeLowercaseStringOrEmpty } from "../shared/string-coerce.js"; import type { GatewayClient } from "./client.js"; @@ -15,7 +16,6 @@ import { runOpenClawCliJson, type CronListJob, } from "./live-agent-probes.js"; -import { renderCatFacePngBase64 } from "./live-image-probe.js"; import { getActiveMcpLoopbackRuntime } from "./mcp-http.js"; import { resolveMcpLoopbackBearerToken } from "./mcp-http.loopback-runtime.js"; import { extractPayloadText } from "./test-helpers.agent-results.js"; diff --git a/src/gateway/gateway-codex-bind.live.test.ts b/src/gateway/gateway-codex-bind.live.test.ts index 7855a7536124..a45d2c522f22 100644 --- a/src/gateway/gateway-codex-bind.live.test.ts +++ b/src/gateway/gateway-codex-bind.live.test.ts @@ -3,6 +3,7 @@ import 
fs from "node:fs/promises"; import os from "node:os"; import path from "node:path"; import { describe, expect, it } from "vitest"; +import { renderCatFacePngBase64 } from "../../test/helpers/live-image-probe.js"; import { isLiveTestEnabled } from "../agents/live-test-helpers.js"; import type { ChannelOutboundContext } from "../channels/plugins/types.public.js"; import { clearConfigCache, clearRuntimeConfigSnapshot } from "../config/config.js"; @@ -22,7 +23,6 @@ import { createTestRegistry } from "../test-utils/channel-plugins.js"; import { sleep } from "../utils.js"; import type { GatewayClient } from "./client.js"; import { connectTestGatewayClient } from "./gateway-cli-backend.live-helpers.js"; -import { renderCatFacePngBase64 } from "./live-image-probe.js"; import { startGatewayServer } from "./server.js"; const LIVE = isLiveTestEnabled(); diff --git a/src/gateway/gateway-codex-harness.live.test.ts b/src/gateway/gateway-codex-harness.live.test.ts index 8691df68fe80..6a3568042fa1 100644 --- a/src/gateway/gateway-codex-harness.live.test.ts +++ b/src/gateway/gateway-codex-harness.live.test.ts @@ -5,10 +5,15 @@ import os from "node:os"; import path from "node:path"; import { setTimeout as delay } from "node:timers/promises"; import { describe, expect, it } from "vitest"; +import { + renderBitmapTextPngBase64, + renderSolidColorPngBase64, +} from "../../test/helpers/live-image-probe.js"; import { isLiveTestEnabled } from "../agents/live-test-helpers.js"; import type { OpenClawConfig } from "../config/config.js"; import type { ContextEngine } from "../context-engine/types.js"; import { isTruthyEnvValue } from "../infra/env.js"; +import { extractFirstTextBlock } from "../shared/chat-message-content.js"; import type { CallGatewayOptions } from "./call.js"; import type { GatewayClient } from "./client.js"; import { @@ -30,7 +35,6 @@ import { type CronListJob, } from "./live-agent-probes.js"; import { restoreLiveEnv, snapshotLiveEnv, type LiveEnvSnapshot } from 
"./live-env-test-helpers.js"; -import { renderSolidColorPngBase64 } from "./live-image-probe.js"; import type { EventFrame } from "./protocol/index.js"; const LIVE = isLiveTestEnabled(); @@ -39,6 +43,9 @@ const CODEX_HARNESS_DEBUG = isTruthyEnvValue(process.env.OPENCLAW_LIVE_CODEX_HAR const CODEX_HARNESS_IMAGE_PROBE = isTruthyEnvValue( process.env.OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE, ); +const CODEX_HARNESS_CHAT_IMAGE_PROBE = isTruthyEnvValue( + process.env.OPENCLAW_LIVE_CODEX_HARNESS_CHAT_IMAGE_PROBE, +); const CODEX_HARNESS_MCP_PROBE = isTruthyEnvValue(process.env.OPENCLAW_LIVE_CODEX_HARNESS_MCP_PROBE); const CODEX_HARNESS_SUBAGENT_PROBE = isTruthyEnvValue( process.env.OPENCLAW_LIVE_CODEX_HARNESS_SUBAGENT_PROBE, @@ -51,6 +58,7 @@ const CODEX_HARNESS_CODE_MODE_ONLY = isTruthyEnvValue( ); const CODEX_HARNESS_SUBAGENT_ONLY = CODEX_HARNESS_SUBAGENT_PROBE && + !CODEX_HARNESS_CHAT_IMAGE_PROBE && !CODEX_HARNESS_IMAGE_PROBE && !CODEX_HARNESS_MCP_PROBE && !CODEX_HARNESS_GUARDIAN_PROBE && @@ -408,6 +416,22 @@ async function waitForChatFinalText(params: { throw new Error(`timed out waiting for chat final for ${params.runId}`); } +async function waitForChatAgentRunOk(client: GatewayClient, runId: string): Promise { + const result: { status?: string } = await client.request( + "agent.wait", + { + runId, + timeoutMs: CODEX_HARNESS_REQUEST_TIMEOUT_MS, + }, + { + timeoutMs: CODEX_HARNESS_REQUEST_TIMEOUT_MS + 5_000, + }, + ); + if (result?.status !== "ok") { + throw new Error(`agent.wait failed for ${runId}: status=${String(result?.status)}`); + } +} + function extractChatFinalText(event: EventFrame, runId: string): string | undefined { if (event.event !== "chat") { return undefined; @@ -438,6 +462,69 @@ function extractChatFinalText(event: EventFrame, runId: string): string | undefi .trim(); } +function extractAssistantTexts(messages: unknown[]): string[] { + const texts: string[] = []; + for (const entry of messages) { + if (!entry || typeof entry !== "object") { + 
continue; + } + if ((entry as { role?: unknown }).role !== "assistant") { + continue; + } + const text = extractFirstTextBlock(entry); + if (typeof text === "string" && text.trim().length > 0) { + texts.push(text); + } + } + return texts; +} + +function formatAssistantTextPreview(texts: string[], maxChars = 800): string { + const combined = texts.join("\n\n").trim(); + if (!combined) { + return ""; + } + return combined.length > maxChars ? `${combined.slice(0, maxChars)}...` : combined; +} + +async function waitForAssistantText(params: { + client: GatewayClient; + sessionKey: string; + contains: string; + timeoutMs?: number; +}): Promise { + const timeoutMs = params.timeoutMs ?? 60_000; + const startedAt = Date.now(); + while (Date.now() - startedAt < timeoutMs) { + const history: { messages?: unknown[] } = await params.client.request("chat.history", { + sessionKey: params.sessionKey, + limit: 24, + }); + const assistantTexts = extractAssistantTexts(history.messages ?? []); + const normalizedContains = params.contains.toUpperCase(); + const matched = assistantTexts.find((text) => + text + .toUpperCase() + .replace(/[^A-F0-9]/g, "") + .includes(normalizedContains), + ); + if (matched) { + return matched; + } + await delay(500); + } + + const finalHistory: { messages?: unknown[] } = await params.client.request("chat.history", { + sessionKey: params.sessionKey, + limit: 24, + }); + throw new Error( + `timed out waiting for assistant text containing ${params.contains}: ${formatAssistantTextPreview( + extractAssistantTexts(finalHistory.messages ?? 
[]), + )}`, + ); +} + async function verifyCodexImageProbe(params: { client: GatewayClient; sessionKey: string; @@ -491,6 +578,51 @@ async function verifyCodexImageProbe(params: { expect(events.map((event) => event.stream)).toContain("codex_app_server.lifecycle"); } +async function verifyCodexChatImageProbe(params: { + client: GatewayClient; + sessionKey: string; +}): Promise { + const token = randomBitmapTextToken(); + const runId = `idem-${randomUUID()}-codex-chat-image`; + const started: { runId?: string; status?: string } = await params.client.request( + "chat.send", + { + sessionKey: params.sessionKey, + idempotencyKey: runId, + message: "Read the code printed in the attached image. Reply with only that code.", + attachments: [ + { + mimeType: "image/png", + fileName: "codex-chat-image-probe.png", + content: renderBitmapTextPngBase64(token), + }, + ], + originatingChannel: "codex-harness-live", + originatingTo: "codex-harness-live", + originatingAccountId: "codex-harness-live", + }, + { timeoutMs: CODEX_HARNESS_REQUEST_TIMEOUT_MS }, + ); + if (started?.status !== "started" || typeof started.runId !== "string") { + throw new Error(`codex chat image probe did not start correctly: ${JSON.stringify(started)}`); + } + await waitForChatAgentRunOk(params.client, started.runId); + const text = await waitForAssistantText({ + client: params.client, + sessionKey: params.sessionKey, + contains: token, + }); + const normalized = text.toUpperCase().replace(/[^A-F0-9]/g, ""); + expect(normalized, `Expected Codex to read bitmap token ${token}; received:\n${text}`).toContain( + token, + ); +} + +function randomBitmapTextToken(length = 6): string { + const alphabet = "24567ACEF"; + return [...randomBytes(length)].map((byte) => alphabet[byte % alphabet.length]).join(""); +} + function findGuardianReviewStatus(events: CapturedAgentEvent[]): "approved" | "denied" | undefined { const status = events.findLast((event) => event.data?.phase === "completed" && event.data?.status) 
?.data?.status; @@ -964,6 +1096,12 @@ describeLive("gateway live (Codex harness)", () => { }); logCodexLiveStep("codex-models-command", { modelsText }); + if (CODEX_HARNESS_CHAT_IMAGE_PROBE) { + logCodexLiveStep("chat-image-probe:start", { sessionKey }); + await verifyCodexChatImageProbe({ client, sessionKey }); + logCodexLiveStep("chat-image-probe:done"); + } + if (CODEX_HARNESS_IMAGE_PROBE) { logCodexLiveStep("image-probe:start", { sessionKey }); await verifyCodexImageProbe({ client, sessionKey }); diff --git a/src/gateway/gateway-models.profiles.live.test.ts b/src/gateway/gateway-models.profiles.live.test.ts index b4638d31caf1..9d625583d9ac 100644 --- a/src/gateway/gateway-models.profiles.live.test.ts +++ b/src/gateway/gateway-models.profiles.live.test.ts @@ -14,6 +14,7 @@ import { type ModelThinkingLevel, } from "@earendil-works/pi-ai"; import { afterEach, describe, expect, it } from "vitest"; +import { renderCatNoncePngBase64 } from "../../test/helpers/live-image-probe.js"; import { resolveAgentWorkspaceDir, resolveDefaultAgentDir } from "../agents/agent-scope.js"; import { ensureAuthProfileStore, @@ -54,7 +55,6 @@ import { stripAssistantInternalScaffolding } from "../shared/text/assistant-visi import { containsFinalTag, stripFinalTags } from "../shared/text/final-tags.js"; import { GATEWAY_CLIENT_MODES, GATEWAY_CLIENT_NAMES } from "../utils/message-channel.js"; import { GatewayClient } from "./client.js"; -import { renderCatNoncePngBase64 } from "./live-image-probe.js"; import { hasExpectedSingleNonce, hasExpectedToolNonce, @@ -1086,7 +1086,7 @@ async function runAnthropicRefusalProbe(params: { function randomImageProbeCode(len = 6): string { // Chosen to avoid common OCR confusions in our 5x7 bitmap font. // Notably: 0↔8, B↔8, 6↔9, 3↔B, D↔0. - // Must stay within the glyph set in `src/gateway/live-image-probe.ts`. + // Must stay within the glyph set in `test/helpers/live-image-probe.ts`. 
const alphabet = "24567ACEF"; const bytes = randomBytes(len); let out = ""; diff --git a/src/gateway/live-image-probe.test.ts b/test/helpers/live-image-probe.test.ts similarity index 100% rename from src/gateway/live-image-probe.test.ts rename to test/helpers/live-image-probe.test.ts diff --git a/src/gateway/live-image-probe.ts b/test/helpers/live-image-probe.ts similarity index 87% rename from src/gateway/live-image-probe.ts rename to test/helpers/live-image-probe.ts index 43c00816d297..ac080f96f4ec 100644 --- a/src/gateway/live-image-probe.ts +++ b/test/helpers/live-image-probe.ts @@ -1,4 +1,4 @@ -import { encodePngRgba, fillPixel } from "../media/png-encode.js"; +import { encodePngRgba, fillPixel } from "../../src/media/png-encode.js"; const GLYPH_ROWS_5X7: Record = { "0": [0b01110, 0b10001, 0b10011, 0b10101, 0b11001, 0b10001, 0b01110], @@ -89,6 +89,52 @@ function measureTextWidthPx(text: string, scale: number) { return text.length * 6 * scale - scale; // 5px glyph + 1px space } +export function renderBitmapTextPngBase64( + text: string, + options: { + background?: { r: number; g: number; b: number; a?: number }; + foreground?: { r: number; g: number; b: number; a?: number }; + padding?: number; + scale?: number; + } = {}, +): string { + const normalized = text.trim().toUpperCase(); + if (!normalized) { + throw new Error("bitmap text image requires non-empty text"); + } + const unsupported = [...normalized].filter((ch) => !(ch in GLYPH_ROWS_5X7)); + if (unsupported.length > 0) { + throw new Error(`bitmap text image contains unsupported glyphs: ${unsupported.join(",")}`); + } + const scale = Math.max(1, Math.floor(options.scale ?? 4)); + const padding = Math.max(0, Math.floor(options.padding ?? 8)); + const width = measureTextWidthPx(normalized, scale) + padding * 2; + const height = 7 * scale + padding * 2; + const background = options.background ?? { r: 245, g: 247, b: 250, a: 255 }; + const foreground = options.foreground ?? 
{ r: 18, g: 24, b: 33, a: 255 }; + const buf = Buffer.alloc(width * height * 4); + fillRect({ + buf, + width, + height, + x: 0, + y: 0, + w: width, + h: height, + color: background, + }); + drawText({ + buf, + width, + x: padding, + y: padding, + text: normalized, + scale, + color: foreground, + }); + return encodePngRgba(buf, width, height).toString("base64"); +} + function fillRect(params: { buf: Buffer; width: number;