From b08e0da25b8a217b55393cac22d6709b0c8d6672 Mon Sep 17 00:00:00 2001
From: Peter Steinberger <peter@steipete.me>
Date: Sat, 16 May 2026 01:08:07 +0100
Subject: [PATCH] fix: clarify provider timeout ceiling

---
 CHANGELOG.md                                                 | 1 +
 docs/concepts/agent-loop.md                                  | 4 ++--
 docs/concepts/model-providers.md                             | 2 +-
 docs/gateway/doctor.md                                       | 2 +-
 docs/gateway/local-models.md                                 | 3 ++-
 .../run.timeout-triggered-compaction.test.ts                 | 5 +++--
 src/agents/pi-embedded-runner/run.ts                         | 3 ++-
 src/commands/doctor/shared/config-flow-steps.test.ts         | 4 +++-
 src/commands/doctor/shared/deprecation-compat.ts             | 2 +-
 src/commands/doctor/shared/legacy-config-migrate.test.ts     | 2 +-
 .../doctor/shared/legacy-config-migrate.validation.test.ts   | 2 +-
 .../doctor/shared/legacy-config-migrations.runtime.agents.ts | 2 +-
 12 files changed, 19 insertions(+), 13 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d7e6067a4f16..9f61e83cc152 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ Docs: https://docs.openclaw.ai
 ### Fixes
 
 - Telegram: let authorized text `/stop` commands use the fast-abort path before queued agent work, so active turns stop immediately instead of processing the abort after the turn finishes. Fixes #82162. Thanks @civiltox.
+- Agents/timeouts: clarify model idle-timeout errors and docs so provider `timeoutSeconds` is shown as bounded by the whole agent/run timeout ceiling.
 - Release tooling: align the published launcher Node floor, `npm start`, package script checks, sharded lint locking, Vitest root project coverage, and plugin-SDK declaration build cache metadata so release/package validation does not silently skip or ship stale surfaces.
 - Cron/agents: honor configured subagent model fallbacks for isolated scheduled runs and forward that fallback policy into embedded agent timeout failover. Fixes #74985. Thanks @chrisgwynne.
 - Codex app-server/MCP: scope user MCP servers to specific OpenClaw agent ids through an optional `mcp.servers.<name>.codex.agents` list and accept `codex.defaultToolsApprovalMode` (`auto`/`prompt`/`approve`) for native Codex approval defaults; OpenClaw strips the `codex` block before handing `mcp_servers` config to Codex. (#82180) Thanks @sercada.
diff --git a/docs/concepts/agent-loop.md b/docs/concepts/agent-loop.md
index 90c65945fde1..0b7d6fc945d7 100644
--- a/docs/concepts/agent-loop.md
+++ b/docs/concepts/agent-loop.md
@@ -166,8 +166,8 @@ surfaces, while Codex native hooks remain a separate lower-level Codex mechanism
 - Agent runtime: `agents.defaults.timeoutSeconds` default 172800s (48 hours); enforced in `runEmbeddedPiAgent` abort timer.
 - Cron runtime: isolated agent-turn `timeoutSeconds` is owned by cron. The scheduler starts that timer when execution begins, aborts the underlying run at the configured deadline, then runs bounded cleanup before recording the timeout so a stale child session cannot keep the lane stuck.
 - Session liveness diagnostics: with diagnostics enabled, `diagnostics.stuckSessionWarnMs` classifies long `processing` sessions that have no observed reply, tool, status, block, or ACP progress. Active embedded runs, model calls, and tool calls report as `session.long_running`; active work with no recent progress reports as `session.stalled`; `session.stuck` is reserved for stale session bookkeeping with no active work. Stale session bookkeeping releases the affected session lane immediately; stalled embedded runs are abort-drained only after `diagnostics.stuckSessionAbortMs` (default: at least 5 minutes and 3x the warning threshold) so queued work can resume without cutting off merely slow runs. Recovery emits structured requested/completed outcomes, and diagnostic state is marked idle only if the same processing generation is still current. Repeated `session.stuck` diagnostics back off while the session remains unchanged.
-- Model idle timeout: OpenClaw aborts a model request when no response chunks arrive before the idle window. `models.providers.<id>.timeoutSeconds` extends this idle watchdog for slow local/self-hosted providers; otherwise OpenClaw uses `agents.defaults.timeoutSeconds` when configured, capped at 120s by default. Cron-triggered runs with no explicit model or agent timeout disable the idle watchdog and rely on the cron outer timeout.
-- Provider HTTP request timeout: `models.providers.<id>.timeoutSeconds` applies to that provider's model HTTP fetches, including connect, headers, body, SDK request timeout, total guarded-fetch abort handling, and model stream idle watchdog. Use this for slow local/self-hosted providers such as Ollama before raising the whole agent runtime timeout.
+- Model idle timeout: OpenClaw aborts a model request when no response chunks arrive before the idle window. `models.providers.<id>.timeoutSeconds` extends this idle watchdog for slow local/self-hosted providers, but it is still bounded by any lower `agents.defaults.timeoutSeconds` or run-specific timeout because those control the whole agent run. When `models.providers.<id>.timeoutSeconds` is not set, OpenClaw uses `agents.defaults.timeoutSeconds` when configured, capped at 120s by default. Cron-triggered runs with no explicit model or agent timeout disable the idle watchdog and rely on the cron outer timeout.
+- Provider HTTP request timeout: `models.providers.<id>.timeoutSeconds` applies to that provider's model HTTP fetches, including connect, headers, body, SDK request timeout, total guarded-fetch abort handling, and model stream idle watchdog. Use this for slow local/self-hosted providers such as Ollama before raising the whole agent runtime timeout, and keep the agent/run timeout at least as high when the model request needs to run longer.
 
 ## Where things can end early
 
diff --git a/docs/concepts/model-providers.md b/docs/concepts/model-providers.md
index 31ec64a5dd47..bd7efee17027 100644
--- a/docs/concepts/model-providers.md
+++ b/docs/concepts/model-providers.md
@@ -688,7 +688,7 @@ Example (OpenAI-compatible):
     - Proxy-style OpenAI-compatible routes also skip native OpenAI-only request shaping: no `service_tier`, no Responses `store`, no Completions `store`, no prompt-cache hints, no OpenAI reasoning-compat payload shaping, and no hidden OpenClaw attribution headers.
     - For OpenAI-compatible Completions proxies that need vendor-specific fields, set `agents.defaults.models["provider/model"].params.extra_body` (or `extraBody`) to merge extra JSON into the outbound request body.
     - For vLLM chat-template controls, set `agents.defaults.models["provider/model"].params.chat_template_kwargs`. The bundled vLLM plugin automatically sends `enable_thinking: false` and `force_nonempty_content: true` for `vllm/nemotron-3-*` when the session thinking level is off.
-    - For slow local models or remote LAN/tailnet hosts, set `models.providers.<id>.timeoutSeconds`. This extends provider model HTTP request handling, including connect, headers, body streaming, and the total guarded-fetch abort, without increasing the whole agent runtime timeout.
+    - For slow local models or remote LAN/tailnet hosts, set `models.providers.<id>.timeoutSeconds`. This extends provider model HTTP request handling, including connect, headers, body streaming, and the total guarded-fetch abort, without increasing the whole agent runtime timeout. If `agents.defaults.timeoutSeconds` or a run-specific timeout is lower, raise that ceiling too; provider timeouts cannot extend the whole run.
     - Model provider HTTP calls allow Surge, Clash, and sing-box fake-IP DNS answers in `198.18.0.0/15` and `fc00::/7` only for the configured provider `baseUrl` hostname. Custom/local provider endpoints also trust that exact configured `scheme://host:port` origin for guarded model requests, including loopback, LAN, and tailnet hosts. This is not a new config option; the `baseUrl` you configure extends the request policy only for that origin. Fake-IP hostname allowance and exact-origin trust are independent mechanisms. Other private, loopback, link-local, metadata destinations, and different ports still require an explicit `models.providers.<id>.request.allowPrivateNetwork: true` opt-in. Set `models.providers.<id>.request.allowPrivateNetwork: false` to opt out of the exact-origin trust.
     - If `baseUrl` is empty/omitted, OpenClaw keeps the default OpenAI behavior (which resolves to `api.openai.com`).
     - For safety, an explicit `compat.supportsDeveloperRole: true` is still overridden on non-native `openai-completions` endpoints.
diff --git a/docs/gateway/doctor.md b/docs/gateway/doctor.md
index ecbc13ccdc8a..cf37ed8719bf 100644
--- a/docs/gateway/doctor.md
+++ b/docs/gateway/doctor.md
@@ -218,7 +218,7 @@ That stages grounded durable candidates into the short-term dreaming store while
     - `identity` → `agents.list[].identity`
     - `agent.*` → `agents.defaults` + `tools.*` (tools/elevated/exec/sandbox/subagents)
     - `agent.model`/`allowedModels`/`modelAliases`/`modelFallbacks`/`imageModelFallbacks` → `agents.defaults.models` + `agents.defaults.model.primary/fallbacks` + `agents.defaults.imageModel.primary/fallbacks`
-    - remove `agents.defaults.llm`; use `models.providers.<id>.timeoutSeconds` for slow provider/model timeouts
+    - remove `agents.defaults.llm`; use `models.providers.<id>.timeoutSeconds` for slow provider/model timeouts, and keep the agent/run timeout above that value when the whole run must last longer
     - `browser.ssrfPolicy.allowPrivateNetwork` → `browser.ssrfPolicy.dangerouslyAllowPrivateNetwork`
     - `browser.profiles.*.driver: "extension"` → `"existing-session"`
     - remove `browser.relayBindHost` (legacy extension relay setting)
diff --git a/docs/gateway/local-models.md b/docs/gateway/local-models.md
index 2685ab7a1fe6..7b4107a5260c 100644
--- a/docs/gateway/local-models.md
+++ b/docs/gateway/local-models.md
@@ -198,7 +198,8 @@ Keep `models.mode: "merge"` so hosted models stay available as fallbacks.
 Use `models.providers.<id>.timeoutSeconds` for slow local or remote model
 servers before raising `agents.defaults.timeoutSeconds`. The provider timeout
 applies only to model HTTP requests, including connect, headers, body streaming,
-and the total guarded-fetch abort.
+and the total guarded-fetch abort. If the agent or run timeout is lower, raise
+that ceiling too because provider timeouts cannot extend the whole agent run.
 
 <Note>
 For custom OpenAI-compatible providers, persisting a non-secret local marker such as `apiKey: "ollama-local"` is accepted when `baseUrl` resolves to loopback, a private LAN, `.local`, or a bare hostname. OpenClaw treats it as a valid local credential instead of reporting a missing key. Use a real value for any provider that accepts a public hostname.
diff --git a/src/agents/pi-embedded-runner/run.timeout-triggered-compaction.test.ts b/src/agents/pi-embedded-runner/run.timeout-triggered-compaction.test.ts
index 451bb5af9d6d..93d20f90565c 100644
--- a/src/agents/pi-embedded-runner/run.timeout-triggered-compaction.test.ts
+++ b/src/agents/pi-embedded-runner/run.timeout-triggered-compaction.test.ts
@@ -300,7 +300,7 @@ describe("timeout-triggered compaction", () => {
     expect(result.payloads?.[0]?.text).toContain("timed out");
   });
 
-  it("points idle-timeout errors at the provider timeout config key", async () => {
+  it("points idle-timeout errors at provider timeout and the agent runtime ceiling", async () => {
     mockedRunEmbeddedAttempt.mockResolvedValueOnce(
       makeAttemptResult({
         timedOut: true,
@@ -316,7 +316,8 @@ describe("timeout-triggered compaction", () => {
     expect(mockedCompactDirect).not.toHaveBeenCalled();
     expect(result.payloads?.[0]?.isError).toBe(true);
     expect(result.payloads?.[0]?.text).toContain("models.providers.<id>.timeoutSeconds");
-    expect(result.payloads?.[0]?.text).not.toContain("agents.defaults.timeoutSeconds");
+    expect(result.payloads?.[0]?.text).toContain("agents.defaults.timeoutSeconds");
+    expect(result.payloads?.[0]?.text).toContain("provider timeouts cannot extend");
   });
 
   it("retries one silent idle timeout before surfacing an error", async () => {
diff --git a/src/agents/pi-embedded-runner/run.ts b/src/agents/pi-embedded-runner/run.ts
index 697cc63b3a6a..f8d3c2651995 100644
--- a/src/agents/pi-embedded-runner/run.ts
+++ b/src/agents/pi-embedded-runner/run.ts
@@ -2541,7 +2541,8 @@ export async function runEmbeddedPiAgent(
           if (timedOutDuringPrompt && !hasMessagingToolDeliveryEvidence(attempt)) {
             const timeoutText = idleTimedOut
               ? "The model did not produce a response before the model idle timeout. " +
-                "Please try again, or increase `models.providers.<id>.timeoutSeconds` for slow local or self-hosted providers."
+                "Please try again, or increase `models.providers.<id>.timeoutSeconds` for slow local or self-hosted providers. " +
+                "If `agents.defaults.timeoutSeconds` or a run-specific timeout is lower, raise that ceiling too; provider timeouts cannot extend the whole agent run."
               : "Request timed out before a response was generated. " +
                 "Please try again, or increase `agents.defaults.timeoutSeconds` in your config.";
             const replayInvalid = resolveReplayInvalidForAttempt(null);
diff --git a/src/commands/doctor/shared/config-flow-steps.test.ts b/src/commands/doctor/shared/config-flow-steps.test.ts
index f06bb7948be8..509fbce82ae9 100644
--- a/src/commands/doctor/shared/config-flow-steps.test.ts
+++ b/src/commands/doctor/shared/config-flow-steps.test.ts
@@ -105,7 +105,9 @@ describe("doctor config flow steps", () => {
     const migratedConfig = { agents: { defaults: { model: { primary: "openai/gpt-5.4" } } } };
     migrateLegacyConfigMock.mockReturnValueOnce({
       config: migratedConfig,
-      changes: ["Removed agents.defaults.llm; model idle timeout now follows models.providers."],
+      changes: [
+        "Removed agents.defaults.llm; model idle timeout now follows models.providers within the agent/run timeout ceiling.",
+      ],
       partiallyValid: true,
     });
 
diff --git a/src/commands/doctor/shared/deprecation-compat.ts b/src/commands/doctor/shared/deprecation-compat.ts
index 7cacc5c94467..fca3d8896594 100644
--- a/src/commands/doctor/shared/deprecation-compat.ts
+++ b/src/commands/doctor/shared/deprecation-compat.ts
@@ -67,7 +67,7 @@ const DOCTOR_DEPRECATION_COMPAT_RECORDS = [
     docsPath: "/gateway/config-agents",
     tests: ["src/commands/doctor/shared/legacy-config-migrate.test.ts"],
     notes:
-      "The old agent-level idle timeout knob was collapsed into provider request timeout handling.",
+      "The old agent-level idle timeout knob was collapsed into provider request timeout handling, bounded by the agent/run timeout ceiling.",
   }),
   deprecatedCompatRecord({
     code: "doctor-agent-runtime-embedded-harness",
diff --git a/src/commands/doctor/shared/legacy-config-migrate.test.ts b/src/commands/doctor/shared/legacy-config-migrate.test.ts
index c4b169b5c465..9850b5f19981 100644
--- a/src/commands/doctor/shared/legacy-config-migrate.test.ts
+++ b/src/commands/doctor/shared/legacy-config-migrate.test.ts
@@ -412,7 +412,7 @@ describe("legacy migrate sandbox scope aliases", () => {
     });
 
     expect(res.changes).toStrictEqual([
-      "Removed agents.defaults.llm; model idle timeout now follows models.providers.<id>.timeoutSeconds.",
+      "Removed agents.defaults.llm; model idle timeout now follows models.providers.<id>.timeoutSeconds within the agent/run timeout ceiling.",
     ]);
     expect(res.config?.agents?.defaults).toEqual({
       model: { primary: "openai/gpt-5.4" },
diff --git a/src/commands/doctor/shared/legacy-config-migrate.validation.test.ts b/src/commands/doctor/shared/legacy-config-migrate.validation.test.ts
index 55b7cf81f473..9ebb361b2fc0 100644
--- a/src/commands/doctor/shared/legacy-config-migrate.validation.test.ts
+++ b/src/commands/doctor/shared/legacy-config-migrate.validation.test.ts
@@ -62,7 +62,7 @@ describe("legacy config migrate validation", () => {
 
     expect(res.partiallyValid).toBe(true);
     expect(res.changes).toStrictEqual([
-      "Removed agents.defaults.llm; model idle timeout now follows models.providers.<id>.timeoutSeconds.",
+      "Removed agents.defaults.llm; model idle timeout now follows models.providers.<id>.timeoutSeconds within the agent/run timeout ceiling.",
       "Migration applied; other validation issues remain — run doctor to review.",
     ]);
     expect(res.config?.agents?.defaults).toEqual({
diff --git a/src/commands/doctor/shared/legacy-config-migrations.runtime.agents.ts b/src/commands/doctor/shared/legacy-config-migrations.runtime.agents.ts
index c5877eb67c78..00bd7041dee2 100644
--- a/src/commands/doctor/shared/legacy-config-migrations.runtime.agents.ts
+++ b/src/commands/doctor/shared/legacy-config-migrations.runtime.agents.ts
@@ -326,7 +326,7 @@ export const LEGACY_CONFIG_MIGRATIONS_RUNTIME_AGENTS: LegacyConfigMigrationSpec[
       }
       delete defaults.llm;
       changes.push(
-        "Removed agents.defaults.llm; model idle timeout now follows models.providers.<id>.timeoutSeconds.",
+        "Removed agents.defaults.llm; model idle timeout now follows models.providers.<id>.timeoutSeconds within the agent/run timeout ceiling.",
       );
     },
   }),