From b08e0da25b8a217b55393cac22d6709b0c8d6672 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 16 May 2026 01:08:07 +0100 Subject: [PATCH] fix: clarify provider timeout ceiling --- CHANGELOG.md | 1 + docs/concepts/agent-loop.md | 4 ++-- docs/concepts/model-providers.md | 2 +- docs/gateway/doctor.md | 2 +- docs/gateway/local-models.md | 3 ++- .../run.timeout-triggered-compaction.test.ts | 5 +++-- src/agents/pi-embedded-runner/run.ts | 3 ++- src/commands/doctor/shared/config-flow-steps.test.ts | 4 +++- src/commands/doctor/shared/deprecation-compat.ts | 2 +- src/commands/doctor/shared/legacy-config-migrate.test.ts | 2 +- .../doctor/shared/legacy-config-migrate.validation.test.ts | 2 +- .../doctor/shared/legacy-config-migrations.runtime.agents.ts | 2 +- 12 files changed, 19 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d7e6067a4f16..9f61e83cc152 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ Docs: https://docs.openclaw.ai ### Fixes - Telegram: let authorized text `/stop` commands use the fast-abort path before queued agent work, so active turns stop immediately instead of processing the abort after the turn finishes. Fixes #82162. Thanks @civiltox. +- Agents/timeouts: clarify model idle-timeout errors and docs so provider `timeoutSeconds` is shown as bounded by the whole agent/run timeout ceiling. - Release tooling: align the published launcher Node floor, `npm start`, package script checks, sharded lint locking, Vitest root project coverage, and plugin-SDK declaration build cache metadata so release/package validation does not silently skip or ship stale surfaces. - Cron/agents: honor configured subagent model fallbacks for isolated scheduled runs and forward that fallback policy into embedded agent timeout failover. Fixes #74985. Thanks @chrisgwynne. 
- Codex app-server/MCP: scope user MCP servers to specific OpenClaw agent ids through an optional `mcp.servers..codex.agents` list and accept `codex.defaultToolsApprovalMode` (`auto`/`prompt`/`approve`) for native Codex approval defaults; OpenClaw strips the `codex` block before handing `mcp_servers` config to Codex. (#82180) Thanks @sercada. diff --git a/docs/concepts/agent-loop.md b/docs/concepts/agent-loop.md index 90c65945fde1..0b7d6fc945d7 100644 --- a/docs/concepts/agent-loop.md +++ b/docs/concepts/agent-loop.md @@ -166,8 +166,8 @@ surfaces, while Codex native hooks remain a separate lower-level Codex mechanism - Agent runtime: `agents.defaults.timeoutSeconds` default 172800s (48 hours); enforced in `runEmbeddedPiAgent` abort timer. - Cron runtime: isolated agent-turn `timeoutSeconds` is owned by cron. The scheduler starts that timer when execution begins, aborts the underlying run at the configured deadline, then runs bounded cleanup before recording the timeout so a stale child session cannot keep the lane stuck. - Session liveness diagnostics: with diagnostics enabled, `diagnostics.stuckSessionWarnMs` classifies long `processing` sessions that have no observed reply, tool, status, block, or ACP progress. Active embedded runs, model calls, and tool calls report as `session.long_running`; active work with no recent progress reports as `session.stalled`; `session.stuck` is reserved for stale session bookkeeping with no active work. Stale session bookkeeping releases the affected session lane immediately; stalled embedded runs are abort-drained only after `diagnostics.stuckSessionAbortMs` (default: at least 5 minutes and 3x the warning threshold) so queued work can resume without cutting off merely slow runs. Recovery emits structured requested/completed outcomes, and diagnostic state is marked idle only if the same processing generation is still current. Repeated `session.stuck` diagnostics back off while the session remains unchanged. 
-- Model idle timeout: OpenClaw aborts a model request when no response chunks arrive before the idle window. `models.providers.<id>.timeoutSeconds` extends this idle watchdog for slow local/self-hosted providers; otherwise OpenClaw uses `agents.defaults.timeoutSeconds` when configured, capped at 120s by default. Cron-triggered runs with no explicit model or agent timeout disable the idle watchdog and rely on the cron outer timeout. -- Provider HTTP request timeout: `models.providers.<id>.timeoutSeconds` applies to that provider's model HTTP fetches, including connect, headers, body, SDK request timeout, total guarded-fetch abort handling, and model stream idle watchdog. Use this for slow local/self-hosted providers such as Ollama before raising the whole agent runtime timeout. +- Model idle timeout: OpenClaw aborts a model request when no response chunks arrive before the idle window. `models.providers.<id>.timeoutSeconds` extends this idle watchdog for slow local/self-hosted providers, but it is still bounded by any lower `agents.defaults.timeoutSeconds` or run-specific timeout because those control the whole agent run. Without a provider timeout, OpenClaw uses `agents.defaults.timeoutSeconds` when configured, capped at 120s by default. Cron-triggered runs with no explicit model or agent timeout disable the idle watchdog and rely on the cron outer timeout. +- Provider HTTP request timeout: `models.providers.<id>.timeoutSeconds` applies to that provider's model HTTP fetches, including connect, headers, body, SDK request timeout, total guarded-fetch abort handling, and model stream idle watchdog. Use this for slow local/self-hosted providers such as Ollama before raising the whole agent runtime timeout, and keep the agent/runtime timeout at least as high when the model request needs to run longer. 
## Where things can end early diff --git a/docs/concepts/model-providers.md b/docs/concepts/model-providers.md index 31ec64a5dd47..bd7efee17027 100644 --- a/docs/concepts/model-providers.md +++ b/docs/concepts/model-providers.md @@ -688,7 +688,7 @@ Example (OpenAI-compatible): - Proxy-style OpenAI-compatible routes also skip native OpenAI-only request shaping: no `service_tier`, no Responses `store`, no Completions `store`, no prompt-cache hints, no OpenAI reasoning-compat payload shaping, and no hidden OpenClaw attribution headers. - For OpenAI-compatible Completions proxies that need vendor-specific fields, set `agents.defaults.models["provider/model"].params.extra_body` (or `extraBody`) to merge extra JSON into the outbound request body. - For vLLM chat-template controls, set `agents.defaults.models["provider/model"].params.chat_template_kwargs`. The bundled vLLM plugin automatically sends `enable_thinking: false` and `force_nonempty_content: true` for `vllm/nemotron-3-*` when the session thinking level is off. - - For slow local models or remote LAN/tailnet hosts, set `models.providers.<id>.timeoutSeconds`. This extends provider model HTTP request handling, including connect, headers, body streaming, and the total guarded-fetch abort, without increasing the whole agent runtime timeout. + - For slow local models or remote LAN/tailnet hosts, set `models.providers.<id>.timeoutSeconds`. This extends provider model HTTP request handling, including connect, headers, body streaming, and the total guarded-fetch abort, without increasing the whole agent runtime timeout. If `agents.defaults.timeoutSeconds` or a run-specific timeout is lower, raise that ceiling too; provider timeouts cannot extend the whole run. - Model provider HTTP calls allow Surge, Clash, and sing-box fake-IP DNS answers in `198.18.0.0/15` and `fc00::/7` only for the configured provider `baseUrl` hostname. 
Custom/local provider endpoints also trust that exact configured `scheme://host:port` origin for guarded model requests, including loopback, LAN, and tailnet hosts. This is not a new config option; the `baseUrl` you configure extends the request policy only for that origin. Fake-IP hostname allowance and exact-origin trust are independent mechanisms. Other private, loopback, link-local, metadata destinations, and different ports still require an explicit `models.providers.<id>.request.allowPrivateNetwork: true` opt-in. Set `models.providers.<id>.request.allowPrivateNetwork: false` to opt out of the exact-origin trust. - If `baseUrl` is empty/omitted, OpenClaw keeps the default OpenAI behavior (which resolves to `api.openai.com`). - For safety, an explicit `compat.supportsDeveloperRole: true` is still overridden on non-native `openai-completions` endpoints. diff --git a/docs/gateway/doctor.md b/docs/gateway/doctor.md index ecbc13ccdc8a..cf37ed8719bf 100644 --- a/docs/gateway/doctor.md +++ b/docs/gateway/doctor.md @@ -218,7 +218,7 @@ That stages grounded durable candidates into the short-term dreaming store while - `identity` → `agents.list[].identity` - `agent.*` → `agents.defaults` + `tools.*` (tools/elevated/exec/sandbox/subagents) - `agent.model`/`allowedModels`/`modelAliases`/`modelFallbacks`/`imageModelFallbacks` → `agents.defaults.models` + `agents.defaults.model.primary/fallbacks` + `agents.defaults.imageModel.primary/fallbacks` - - remove `agents.defaults.llm`; use `models.providers.<id>.timeoutSeconds` for slow provider/model timeouts + - remove `agents.defaults.llm`; use `models.providers.<id>.timeoutSeconds` for slow provider/model timeouts, and keep the agent/run timeout above that value when the whole run must last longer - `browser.ssrfPolicy.allowPrivateNetwork` → `browser.ssrfPolicy.dangerouslyAllowPrivateNetwork` - `browser.profiles.*.driver: "extension"` → `"existing-session"` - remove `browser.relayBindHost` (legacy extension relay setting) diff --git 
a/docs/gateway/local-models.md b/docs/gateway/local-models.md index 2685ab7a1fe6..7b4107a5260c 100644 --- a/docs/gateway/local-models.md +++ b/docs/gateway/local-models.md @@ -198,7 +198,8 @@ Keep `models.mode: "merge"` so hosted models stay available as fallbacks. Use `models.providers.<id>.timeoutSeconds` for slow local or remote model servers before raising `agents.defaults.timeoutSeconds`. The provider timeout applies only to model HTTP requests, including connect, headers, body streaming, -and the total guarded-fetch abort. +and the total guarded-fetch abort. If the agent or run timeout is lower, raise +that ceiling too because provider timeouts cannot extend the whole agent run. For custom OpenAI-compatible providers, persisting a non-secret local marker such as `apiKey: "ollama-local"` is accepted when `baseUrl` resolves to loopback, a private LAN, `.local`, or a bare hostname. OpenClaw treats it as a valid local credential instead of reporting a missing key. Use a real value for any provider that accepts a public hostname. 
diff --git a/src/agents/pi-embedded-runner/run.timeout-triggered-compaction.test.ts b/src/agents/pi-embedded-runner/run.timeout-triggered-compaction.test.ts index 451bb5af9d6d..93d20f90565c 100644 --- a/src/agents/pi-embedded-runner/run.timeout-triggered-compaction.test.ts +++ b/src/agents/pi-embedded-runner/run.timeout-triggered-compaction.test.ts @@ -300,7 +300,7 @@ describe("timeout-triggered compaction", () => { expect(result.payloads?.[0]?.text).toContain("timed out"); }); - it("points idle-timeout errors at the provider timeout config key", async () => { + it("points idle-timeout errors at provider timeout and the agent runtime ceiling", async () => { mockedRunEmbeddedAttempt.mockResolvedValueOnce( makeAttemptResult({ timedOut: true, @@ -316,7 +316,8 @@ describe("timeout-triggered compaction", () => { expect(mockedCompactDirect).not.toHaveBeenCalled(); expect(result.payloads?.[0]?.isError).toBe(true); expect(result.payloads?.[0]?.text).toContain("models.providers..timeoutSeconds"); - expect(result.payloads?.[0]?.text).not.toContain("agents.defaults.timeoutSeconds"); + expect(result.payloads?.[0]?.text).toContain("agents.defaults.timeoutSeconds"); + expect(result.payloads?.[0]?.text).toContain("provider timeouts cannot extend"); }); it("retries one silent idle timeout before surfacing an error", async () => { diff --git a/src/agents/pi-embedded-runner/run.ts b/src/agents/pi-embedded-runner/run.ts index 697cc63b3a6a..f8d3c2651995 100644 --- a/src/agents/pi-embedded-runner/run.ts +++ b/src/agents/pi-embedded-runner/run.ts @@ -2541,7 +2541,8 @@ export async function runEmbeddedPiAgent( if (timedOutDuringPrompt && !hasMessagingToolDeliveryEvidence(attempt)) { const timeoutText = idleTimedOut ? "The model did not produce a response before the model idle timeout. " + - "Please try again, or increase `models.providers..timeoutSeconds` for slow local or self-hosted providers." 
+ "Please try again, or increase `models.providers..timeoutSeconds` for slow local or self-hosted providers. " + + "If `agents.defaults.timeoutSeconds` or a run-specific timeout is lower, raise that ceiling too; provider timeouts cannot extend the whole agent run." : "Request timed out before a response was generated. " + "Please try again, or increase `agents.defaults.timeoutSeconds` in your config."; const replayInvalid = resolveReplayInvalidForAttempt(null); diff --git a/src/commands/doctor/shared/config-flow-steps.test.ts b/src/commands/doctor/shared/config-flow-steps.test.ts index f06bb7948be8..509fbce82ae9 100644 --- a/src/commands/doctor/shared/config-flow-steps.test.ts +++ b/src/commands/doctor/shared/config-flow-steps.test.ts @@ -105,7 +105,9 @@ describe("doctor config flow steps", () => { const migratedConfig = { agents: { defaults: { model: { primary: "openai/gpt-5.4" } } } }; migrateLegacyConfigMock.mockReturnValueOnce({ config: migratedConfig, - changes: ["Removed agents.defaults.llm; model idle timeout now follows models.providers."], + changes: [ + "Removed agents.defaults.llm; model idle timeout now follows models.providers within the agent/run timeout ceiling.", + ], partiallyValid: true, }); diff --git a/src/commands/doctor/shared/deprecation-compat.ts b/src/commands/doctor/shared/deprecation-compat.ts index 7cacc5c94467..fca3d8896594 100644 --- a/src/commands/doctor/shared/deprecation-compat.ts +++ b/src/commands/doctor/shared/deprecation-compat.ts @@ -67,7 +67,7 @@ const DOCTOR_DEPRECATION_COMPAT_RECORDS = [ docsPath: "/gateway/config-agents", tests: ["src/commands/doctor/shared/legacy-config-migrate.test.ts"], notes: - "The old agent-level idle timeout knob was collapsed into provider request timeout handling.", + "The old agent-level idle timeout knob was collapsed into provider request timeout handling, bounded by the agent/run timeout ceiling.", }), deprecatedCompatRecord({ code: "doctor-agent-runtime-embedded-harness", diff --git 
a/src/commands/doctor/shared/legacy-config-migrate.test.ts b/src/commands/doctor/shared/legacy-config-migrate.test.ts index c4b169b5c465..9850b5f19981 100644 --- a/src/commands/doctor/shared/legacy-config-migrate.test.ts +++ b/src/commands/doctor/shared/legacy-config-migrate.test.ts @@ -412,7 +412,7 @@ describe("legacy migrate sandbox scope aliases", () => { }); expect(res.changes).toStrictEqual([ - "Removed agents.defaults.llm; model idle timeout now follows models.providers..timeoutSeconds.", + "Removed agents.defaults.llm; model idle timeout now follows models.providers..timeoutSeconds within the agent/run timeout ceiling.", ]); expect(res.config?.agents?.defaults).toEqual({ model: { primary: "openai/gpt-5.4" }, diff --git a/src/commands/doctor/shared/legacy-config-migrate.validation.test.ts b/src/commands/doctor/shared/legacy-config-migrate.validation.test.ts index 55b7cf81f473..9ebb361b2fc0 100644 --- a/src/commands/doctor/shared/legacy-config-migrate.validation.test.ts +++ b/src/commands/doctor/shared/legacy-config-migrate.validation.test.ts @@ -62,7 +62,7 @@ describe("legacy config migrate validation", () => { expect(res.partiallyValid).toBe(true); expect(res.changes).toStrictEqual([ - "Removed agents.defaults.llm; model idle timeout now follows models.providers..timeoutSeconds.", + "Removed agents.defaults.llm; model idle timeout now follows models.providers..timeoutSeconds within the agent/run timeout ceiling.", "Migration applied; other validation issues remain — run doctor to review.", ]); expect(res.config?.agents?.defaults).toEqual({ diff --git a/src/commands/doctor/shared/legacy-config-migrations.runtime.agents.ts b/src/commands/doctor/shared/legacy-config-migrations.runtime.agents.ts index c5877eb67c78..00bd7041dee2 100644 --- a/src/commands/doctor/shared/legacy-config-migrations.runtime.agents.ts +++ b/src/commands/doctor/shared/legacy-config-migrations.runtime.agents.ts @@ -326,7 +326,7 @@ export const LEGACY_CONFIG_MIGRATIONS_RUNTIME_AGENTS: 
LegacyConfigMigrationSpec[ } delete defaults.llm; changes.push( - "Removed agents.defaults.llm; model idle timeout now follows models.providers..timeoutSeconds.", + "Removed agents.defaults.llm; model idle timeout now follows models.providers..timeoutSeconds within the agent/run timeout ceiling.", ); }, }),