fix(auth): add force re-login recovery and fallback auth skips

Summary:
- Add forced provider re-login support that clears cached auth profiles before running provider login again.
- Add provider-auth remediation guidance and a session-scoped skip cache for known-bad fallback auth attempts.
- Wire session ids through agent command, auto-reply, and embedded compaction fallback callers so the skip cache applies on real run paths.
- Fail closed when forced auth profile removal cannot update the profile store.

Verification:
- Local format, lint, diff-check, focused Vitest shards, and autoreview passed.
- PR CI, CodeQL Security High, and Critical Quality agent-runtime-boundary passed on head 1b4e9e753e.

Co-authored-by: Mert Basar <MertBasar0@users.noreply.github.com>
This commit is contained in:
Mert Başar
2026-05-31 21:01:51 +03:00
committed by GitHub
parent db0209ac5d
commit 0ff5fe3a80
18 changed files with 1039 additions and 2 deletions

View File

@@ -67,6 +67,27 @@ OpenClaw separates the selected provider/model from why it was selected. That so
The auto fallback primary-probe interval is five minutes and is not configurable. OpenClaw remembers recent probes per session and primary model so a failing primary is not retried on every turn. OpenClaw sends a visible notice when a session moves onto fallback and another notice when it returns to the selected primary; it does not repeat the notice on every sticky fallback turn.
## Auth failure skip cache
By default, every new turn keeps the existing fallback retry behavior: OpenClaw
will try each configured fallback candidate again, including non-primary
candidates that recently failed with `auth` or `auth_permanent`.
Operators who prefer to suppress those repeat auth failures can opt in with:
```bash
OPENCLAW_FALLBACK_SKIP_TTL_MS=60000
```
When enabled, OpenClaw records an in-memory, session-scoped skip marker for a
non-primary fallback candidate after an auth-class failure. The marker is keyed
by session id, provider, and model. Primary candidates are never skipped, so an
explicit user model selection still surfaces the real auth error. The cache is
process-local and clears on Gateway restart.
The value is a TTL in milliseconds. `0` or an unset value disables the cache.
Positive values are clamped between 1 second and 10 minutes.
## User-visible fallback notices
When a session moves onto an auto-selected fallback, OpenClaw sends a status notice in the same reply surface:

View File

@@ -206,6 +206,17 @@ openclaw models auth login --provider openai --profile-id openai:lain
This is the easiest way to keep multiple OAuth logins for the same provider
separate inside one agent.
Use `--force` when a saved provider profile is stuck, expired, or tied to the
wrong account and the normal login command keeps reusing it. `--force` deletes
the saved auth profiles for that provider in the selected agent directory, then
runs the same provider auth flow again. It does not revoke credentials at the
provider; rotate or revoke them in the provider dashboard when you need
provider-side invalidation.
```bash
openclaw models auth login --provider anthropic --force
```
### Per-session (chat command)
Use `/model <alias-or-id>@<profileId>` to pin a specific provider credential for the current session (example profile ids: `anthropic:default`, `anthropic:work`).

View File

@@ -704,6 +704,7 @@ beforeAll(async () => {
type FallbackRunnerParams = {
provider: string;
model: string;
sessionId?: string;
run: (provider: string, model: string) => Promise<unknown>;
onFallbackStep?: (step: Record<string, unknown>) => void | Promise<void>;
classifyResult?: (params: {
@@ -969,6 +970,7 @@ describe("agentCommand LiveSessionModelSwitchError retry", () => {
const secondCall = mockCallArg(state.runWithModelFallbackMock, 1) as FallbackRunnerParams;
expect(secondCall.provider).toBe("openai");
expect(secondCall.model).toBe("gpt-5.4");
expect(secondCall.sessionId).toBe("session-1");
const lifecycleEndCalls = state.emitAgentEventMock.mock.calls.filter((call: unknown[]) => {
const arg = call[0] as { stream?: string; data?: { phase?: string } };

View File

@@ -1513,6 +1513,7 @@ async function agentCommandInternal(
runId,
agentDir,
agentId: sessionAgentId,
sessionId,
sessionKey: sessionKey ?? sessionId,
prepareAgentHarnessRuntime: async ({
provider: providerValue,

View File

@@ -445,6 +445,7 @@ export async function compactEmbeddedAgentSessionDirect(
runId: params.runId ?? params.sessionId,
agentDir: params.agentDir,
agentId: fallbackAgentId,
sessionId: params.sessionId,
sessionKey: fallbackSessionKey,
prepareAgentHarnessRuntime: async ({ provider, model, agentHarnessRuntimeOverride }) => {
await ensureSelectedAgentHarnessPlugin({

View File

@@ -1,6 +1,8 @@
import { describe, expect, it } from "vitest";
import { classifyFailoverSignal } from "./embedded-agent-helpers/errors.js";
import {
buildFailoverRemediationHint,
buildProviderReauthCommand,
coerceToFailoverError,
describeFailoverError,
FailoverError,
@@ -1240,3 +1242,72 @@ describe("failover-error", () => {
});
});
});
describe("buildFailoverRemediationHint", () => {
it("returns a copy-pasteable login command for auth failures", () => {
const err = new FailoverError("missing token", {
reason: "auth",
provider: "anthropic",
model: "claude-opus-4-7",
});
expect(buildFailoverRemediationHint(err)).toBe(
"Re-authenticate with: openclaw models auth login --provider 'anthropic' --force",
);
});
it("returns a hint for auth_permanent as well", () => {
const err = new FailoverError("revoked", {
reason: "auth_permanent",
provider: "google-gemini-cli",
model: "gemini-3.1-pro-preview",
});
expect(buildFailoverRemediationHint(err)).toBe(
"Re-authenticate with: openclaw models auth login --provider 'google-gemini-cli' --force",
);
});
it("quotes provider ids that contain shell metacharacters", () => {
expect(buildProviderReauthCommand("custom;touch /tmp/pwned")).toBe(
"openclaw models auth login --provider 'custom;touch /tmp/pwned' --force",
);
expect(buildProviderReauthCommand("custom'provider")).toBe(
"openclaw models auth login --provider 'custom'\\''provider' --force",
);
});
it("refuses control characters in rendered provider commands", () => {
expect(buildProviderReauthCommand("custom\nprovider")).toBeUndefined();
});
it("wraps rendered provider commands in the standard CLI formatter", () => {
expect(buildProviderReauthCommand("anthropic", { OPENCLAW_PROFILE: "work" })).toBe(
"openclaw --profile work models auth login --provider 'anthropic' --force",
);
expect(buildProviderReauthCommand("anthropic", { OPENCLAW_CONTAINER_HINT: "dev" })).toBe(
"openclaw --container dev models auth login --provider 'anthropic' --force",
);
});
it("returns undefined for non-auth reasons", () => {
const err = new FailoverError("429", {
reason: "rate_limit",
provider: "openai",
model: "gpt-5",
});
expect(buildFailoverRemediationHint(err)).toBeUndefined();
});
it("returns undefined when provider is not attributed", () => {
const err = new FailoverError("no token", {
reason: "auth",
model: "claude-opus-4-7",
});
expect(buildFailoverRemediationHint(err)).toBeUndefined();
});
it("returns undefined for non-FailoverError inputs", () => {
expect(buildFailoverRemediationHint(new Error("oops"))).toBeUndefined();
expect(buildFailoverRemediationHint(undefined)).toBeUndefined();
expect(buildFailoverRemediationHint("just a string")).toBeUndefined();
});
});

View File

@@ -1,4 +1,5 @@
import { parseStrictNonNegativeInteger } from "@openclaw/normalization-core/number-coercion";
import { formatCliCommand } from "../cli/command-format.js";
import { readErrorName } from "../infra/errors.js";
import {
classifyFailoverSignal,
@@ -535,6 +536,59 @@ export function resolveFailoverReasonFromError(
);
}
/**
 * Produce a short, copy-pasteable re-authentication hint for a failover error.
 *
 * A hint is only returned when `err` is a `FailoverError` whose reason is
 * `auth` or `auth_permanent`, it carries a non-blank provider, and a re-auth
 * command can be rendered for that provider. Every other input yields
 * `undefined`, so callers can append the hint opportunistically without
 * inspecting the failure reason themselves.
 *
 * @param err - Any thrown value; non-failover inputs are tolerated.
 * @returns `Re-authenticate with: <command>` or `undefined`.
 */
export function buildFailoverRemediationHint(err: unknown): string | undefined {
  if (!isFailoverError(err)) {
    return undefined;
  }
  const isAuthFailure = err.reason === "auth" || err.reason === "auth_permanent";
  // The provider is only consulted for auth-class failures.
  const providerId = isAuthFailure ? err.provider?.trim() : undefined;
  if (!providerId) {
    return undefined;
  }
  const reauthCommand = buildProviderReauthCommand(providerId);
  if (!reauthCommand) {
    return undefined;
  }
  return `Re-authenticate with: ${reauthCommand}`;
}
/**
 * Wrap `value` in single quotes for a POSIX shell. Each embedded single quote
 * is rewritten as `'\''` (close quote, escaped quote, reopen quote).
 */
function quotePosixShellArg(value: string): string {
  const escaped = value.split("'").join("'\\''");
  return "'" + escaped + "'";
}
/**
 * Render the CLI command that forces a fresh login for `provider`.
 *
 * The provider id is trimmed and single-quoted for a POSIX shell before being
 * embedded, and the resulting command is passed through `formatCliCommand`
 * together with `env`.
 *
 * @param provider - Provider id to embed in the command.
 * @param env - Environment handed to `formatCliCommand`; defaults to `process.env`.
 * @returns The rendered command, or `undefined` when the provider is blank
 *   after trimming or contains a control character.
 */
export function buildProviderReauthCommand(
  provider: string,
  env: Record<string, string | undefined> = process.env as Record<string, string | undefined>,
): string | undefined {
  const providerId = provider.trim();
  const renderable = providerId.length > 0 && !hasControlCharacter(providerId);
  if (!renderable) {
    return undefined;
  }
  const quotedProvider = quotePosixShellArg(providerId);
  return formatCliCommand(`openclaw models auth login --provider ${quotedProvider} --force`, env);
}
/**
 * Report whether `value` contains any Unicode control character (general
 * category Cc): the C0 range U+0000–U+001F, DEL U+007F, and the C1 range
 * U+0080–U+009F.
 *
 * The C1 range is included because it holds single-character equivalents of
 * escape sequences (for example CSI, U+009B) that should not be rendered into
 * a command string shown to operators.
 */
function hasControlCharacter(value: string): boolean {
  for (let i = 0; i < value.length; i += 1) {
    const code = value.charCodeAt(i);
    if (code < 0x20 || (code >= 0x7f && code <= 0x9f)) {
      return true;
    }
  }
  return false;
}
export function describeFailoverError(err: unknown): {
message: string;
rawError?: string;

View File

@@ -0,0 +1,334 @@
import { afterEach, beforeEach, describe, expect, it } from "vitest";
import {
DEFAULT_FALLBACK_SKIP_TTL_MS,
resetFallbackSkipCacheForTest,
clearFallbackSkipCacheForSession,
getFallbackCandidateSkipReason,
isFallbackCandidateSkipped,
markFallbackCandidateSkipped,
} from "./fallback-skip-cache.js";
describe("fallback-skip-cache", () => {
beforeEach(() => {
resetFallbackSkipCacheForTest();
});
afterEach(() => {
resetFallbackSkipCacheForTest();
});
it("returns false for an unknown (session, provider, model) triple", () => {
expect(
isFallbackCandidateSkipped({
sessionId: "s1",
provider: "anthropic",
model: "claude-opus-4-7",
now: 1_000,
}),
).toBe(false);
});
it("treats falsy sessionId as a no-op for both mark and check", () => {
  // Pass an explicit positive TTL: with the default TTL of 0 every mark is a
  // no-op, so without it this test could not tell a falsy-session guard from
  // a disabled cache.
  markFallbackCandidateSkipped({
    sessionId: undefined,
    provider: "anthropic",
    model: "claude-opus-4-7",
    reason: "auth",
    now: 1_000,
    ttlMs: 60_000,
  });
  markFallbackCandidateSkipped({
    sessionId: "",
    provider: "anthropic",
    model: "claude-opus-4-7",
    reason: "auth",
    now: 1_000,
    ttlMs: 60_000,
  });
  expect(
    isFallbackCandidateSkipped({
      sessionId: undefined,
      provider: "anthropic",
      model: "claude-opus-4-7",
      now: 1_000,
    }),
  ).toBe(false);
  expect(
    isFallbackCandidateSkipped({
      sessionId: "",
      provider: "anthropic",
      model: "claude-opus-4-7",
      now: 1_000,
    }),
  ).toBe(false);
});
it("marks then sees a candidate as skipped within the TTL", () => {
markFallbackCandidateSkipped({
sessionId: "s1",
provider: "anthropic",
model: "claude-opus-4-7",
reason: "auth",
now: 1_000,
ttlMs: 60_000,
});
expect(
isFallbackCandidateSkipped({
sessionId: "s1",
provider: "anthropic",
model: "claude-opus-4-7",
now: 30_000,
}),
).toBe(true);
expect(
getFallbackCandidateSkipReason({
sessionId: "s1",
provider: "anthropic",
model: "claude-opus-4-7",
now: 30_000,
}),
).toBe("auth");
});
it("expires entries after the TTL elapses", () => {
  // Marked at 1_000 with a 10_000 ms TTL, so the marker expires at 11_000.
  markFallbackCandidateSkipped({
    sessionId: "s1",
    provider: "anthropic",
    model: "claude-opus-4-7",
    reason: "auth_permanent",
    now: 1_000,
    ttlMs: 10_000,
  });
  // Before expiry, still skipped.
  expect(
    isFallbackCandidateSkipped({
      sessionId: "s1",
      provider: "anthropic",
      model: "claude-opus-4-7",
      now: 10_000,
    }),
  ).toBe(true);
  // Exactly at expiry, no longer skipped (the boundary is exclusive).
  expect(
    isFallbackCandidateSkipped({
      sessionId: "s1",
      provider: "anthropic",
      model: "claude-opus-4-7",
      now: 11_000,
    }),
  ).toBe(false);
  // After expiry, no longer skipped.
  expect(
    isFallbackCandidateSkipped({
      sessionId: "s1",
      provider: "anthropic",
      model: "claude-opus-4-7",
      now: 11_001,
    }),
  ).toBe(false);
  expect(
    getFallbackCandidateSkipReason({
      sessionId: "s1",
      provider: "anthropic",
      model: "claude-opus-4-7",
      now: 11_001,
    }),
  ).toBeUndefined();
});
it("isolates entries across sessions", () => {
  // An explicit TTL is required: the default TTL is 0, which would turn the
  // mark into a no-op and make the isolation check pass vacuously.
  markFallbackCandidateSkipped({
    sessionId: "s1",
    provider: "anthropic",
    model: "claude-opus-4-7",
    reason: "auth",
    now: 1_000,
    ttlMs: 60_000,
  });
  // Positive control: the marker really exists for the session that wrote it.
  expect(
    isFallbackCandidateSkipped({
      sessionId: "s1",
      provider: "anthropic",
      model: "claude-opus-4-7",
      now: 30_000,
    }),
  ).toBe(true);
  expect(
    isFallbackCandidateSkipped({
      sessionId: "s2",
      provider: "anthropic",
      model: "claude-opus-4-7",
      now: 30_000,
    }),
  ).toBe(false);
});
it("isolates entries across (provider, model) pairs", () => {
  // An explicit TTL is required: the default TTL is 0, which would turn the
  // mark into a no-op and make the isolation checks pass vacuously.
  markFallbackCandidateSkipped({
    sessionId: "s1",
    provider: "anthropic",
    model: "claude-opus-4-7",
    reason: "auth",
    now: 1_000,
    ttlMs: 60_000,
  });
  // Positive control: the exact pair that was marked is skipped.
  expect(
    isFallbackCandidateSkipped({
      sessionId: "s1",
      provider: "anthropic",
      model: "claude-opus-4-7",
      now: 30_000,
    }),
  ).toBe(true);
  // Same provider, different model.
  expect(
    isFallbackCandidateSkipped({
      sessionId: "s1",
      provider: "anthropic",
      model: "claude-sonnet-4-6",
      now: 30_000,
    }),
  ).toBe(false);
  // Same model, different provider.
  expect(
    isFallbackCandidateSkipped({
      sessionId: "s1",
      provider: "google",
      model: "claude-opus-4-7",
      now: 30_000,
    }),
  ).toBe(false);
});
it("clearFallbackSkipCacheForSession drops every marker for that session", () => {
  // An explicit TTL is required: the default TTL is 0, so without it nothing
  // would be stored and there would be nothing for the clear to drop.
  markFallbackCandidateSkipped({
    sessionId: "s1",
    provider: "anthropic",
    model: "claude-opus-4-7",
    reason: "auth",
    now: 1_000,
    ttlMs: 60_000,
  });
  markFallbackCandidateSkipped({
    sessionId: "s1",
    provider: "google",
    model: "gemini-3.1-pro-preview",
    reason: "auth",
    now: 1_000,
    ttlMs: 60_000,
  });
  // Positive control: both markers are live before the clear.
  expect(
    isFallbackCandidateSkipped({
      sessionId: "s1",
      provider: "anthropic",
      model: "claude-opus-4-7",
      now: 30_000,
    }),
  ).toBe(true);
  expect(
    isFallbackCandidateSkipped({
      sessionId: "s1",
      provider: "google",
      model: "gemini-3.1-pro-preview",
      now: 30_000,
    }),
  ).toBe(true);
  clearFallbackSkipCacheForSession("s1");
  expect(
    isFallbackCandidateSkipped({
      sessionId: "s1",
      provider: "anthropic",
      model: "claude-opus-4-7",
      now: 30_000,
    }),
  ).toBe(false);
  expect(
    isFallbackCandidateSkipped({
      sessionId: "s1",
      provider: "google",
      model: "gemini-3.1-pro-preview",
      now: 30_000,
    }),
  ).toBe(false);
});
it("re-marking the same triple refreshes the TTL", () => {
markFallbackCandidateSkipped({
sessionId: "s1",
provider: "anthropic",
model: "claude-opus-4-7",
reason: "auth",
now: 1_000,
ttlMs: 10_000,
});
// Re-mark just before the original entry would expire.
markFallbackCandidateSkipped({
sessionId: "s1",
provider: "anthropic",
model: "claude-opus-4-7",
reason: "auth_permanent",
now: 10_000,
ttlMs: 10_000,
});
// Without refresh, this point would be past expiry. With refresh it lives.
expect(
isFallbackCandidateSkipped({
sessionId: "s1",
provider: "anthropic",
model: "claude-opus-4-7",
now: 19_000,
}),
).toBe(true);
// The most recent reason wins.
expect(
getFallbackCandidateSkipReason({
sessionId: "s1",
provider: "anthropic",
model: "claude-opus-4-7",
now: 19_000,
}),
).toBe("auth_permanent");
});
it("prunes expired buckets from sessions that are never queried again", async () => {
const { peekFallbackSkipBucketsForTest } = await import("./fallback-skip-cache.js");
// Two short-lived sessions write markers, then never come back.
markFallbackCandidateSkipped({
sessionId: "one-off-1",
provider: "anthropic",
model: "claude-opus-4-7",
reason: "auth",
now: 1_000,
ttlMs: 10_000,
});
markFallbackCandidateSkipped({
sessionId: "one-off-2",
provider: "google",
model: "gemini-3.1-pro-preview",
reason: "auth",
now: 1_000,
ttlMs: 10_000,
});
expect(peekFallbackSkipBucketsForTest().size).toBe(2);
// A third session writes well after the first two have expired. The
// opportunistic global prune must drop the stale buckets even though
// those original sessions are never re-queried.
markFallbackCandidateSkipped({
sessionId: "later",
provider: "anthropic",
model: "claude-opus-4-7",
reason: "auth",
now: 100_000,
ttlMs: 10_000,
});
const buckets = peekFallbackSkipBucketsForTest();
expect(buckets.has("one-off-1")).toBe(false);
expect(buckets.has("one-off-2")).toBe(false);
expect(buckets.has("later")).toBe(true);
});
it("does not skip by default when ttlMs is omitted", () => {
  // Isolate the test from the ambient environment: an inherited
  // OPENCLAW_FALLBACK_SKIP_TTL_MS would otherwise enable the cache here.
  const previous = process.env.OPENCLAW_FALLBACK_SKIP_TTL_MS;
  delete process.env.OPENCLAW_FALLBACK_SKIP_TTL_MS;
  try {
    markFallbackCandidateSkipped({
      sessionId: "s1",
      provider: "anthropic",
      model: "claude-opus-4-7",
      reason: "auth",
      now: 1_000,
    });
    expect(
      isFallbackCandidateSkipped({
        sessionId: "s1",
        provider: "anthropic",
        model: "claude-opus-4-7",
        now: 1_000,
      }),
    ).toBe(false);
    expect(DEFAULT_FALLBACK_SKIP_TTL_MS).toBe(0);
  } finally {
    if (previous !== undefined) {
      process.env.OPENCLAW_FALLBACK_SKIP_TTL_MS = previous;
    }
  }
});
it("uses OPENCLAW_FALLBACK_SKIP_TTL_MS as an opt-in default TTL", () => {
const previous = process.env.OPENCLAW_FALLBACK_SKIP_TTL_MS;
process.env.OPENCLAW_FALLBACK_SKIP_TTL_MS = "60000";
try {
markFallbackCandidateSkipped({
sessionId: "s1",
provider: "anthropic",
model: "claude-opus-4-7",
reason: "auth",
now: 1_000,
});
expect(
isFallbackCandidateSkipped({
sessionId: "s1",
provider: "anthropic",
model: "claude-opus-4-7",
now: 60_000,
}),
).toBe(true);
expect(
isFallbackCandidateSkipped({
sessionId: "s1",
provider: "anthropic",
model: "claude-opus-4-7",
now: 61_001,
}),
).toBe(false);
} finally {
if (previous === undefined) {
delete process.env.OPENCLAW_FALLBACK_SKIP_TTL_MS;
} else {
process.env.OPENCLAW_FALLBACK_SKIP_TTL_MS = previous;
}
}
});
});

View File

@@ -0,0 +1,246 @@
/**
* Session-scoped "known-bad candidate" cache for the model fallback chain.
*
* When explicitly enabled and a fallback candidate fails with a non-transient
* credential error (`auth` / `auth_permanent`), the chain can avoid retrying
* the same candidate on every subsequent turn until the user fixes their auth.
*
* This module records skip markers per `(sessionId, provider, model)` with a
* short TTL. The cache is intentionally in-memory only: a process restart
* clears it so a freshly-restarted gateway always tries every candidate at
* least once before deciding to skip again.
*
* The cache is global, not per-config, so any caller running fallbacks for the
* same `sessionId` shares the same skip set. Tests can reset state via
* `resetFallbackSkipCacheForTest()`.
*/
import { modelKey } from "./model-selection-normalize.js";
/**
 * Default time-to-live for a skip marker. Disabled by default so existing
 * fallback retry behavior stays unchanged unless an operator opts in with
 * OPENCLAW_FALLBACK_SKIP_TTL_MS.
 */
export const DEFAULT_FALLBACK_SKIP_TTL_MS = 0;
const FALLBACK_SKIP_TTL_ENV = "OPENCLAW_FALLBACK_SKIP_TTL_MS";
const FALLBACK_SKIP_TTL_MIN_MS = 1_000;
const FALLBACK_SKIP_TTL_MAX_MS = 10 * 60_000;

/**
 * Resolve the skip-marker TTL (milliseconds) from the environment.
 *
 * The value must be a plain non-negative decimal integer (surrounding
 * whitespace is ignored). Anything else — unset, empty, negative, fractional,
 * exponent notation, or a number with trailing text such as `60s` — resolves
 * to {@link DEFAULT_FALLBACK_SKIP_TTL_MS}, so a malformed value never silently
 * enables the cache with an unintended TTL.
 *
 * @param env - Environment to read; defaults to `process.env`.
 * @returns `0` when disabled, otherwise the TTL clamped to
 *   [`FALLBACK_SKIP_TTL_MIN_MS`, `FALLBACK_SKIP_TTL_MAX_MS`].
 */
function resolveConfiguredSkipTtlMs(env: NodeJS.ProcessEnv = process.env): number {
  const trimmed = env[FALLBACK_SKIP_TTL_ENV]?.trim();
  // Digits only: `Number.parseInt` would accept a numeric prefix ("60s" -> 60,
  // "1e4" -> 1) and turn a typo into a real, clamped TTL.
  if (!trimmed || !/^\d+$/.test(trimmed)) {
    return DEFAULT_FALLBACK_SKIP_TTL_MS;
  }
  const parsed = Number(trimmed);
  if (parsed === 0) {
    return 0;
  }
  return Math.min(FALLBACK_SKIP_TTL_MAX_MS, Math.max(FALLBACK_SKIP_TTL_MIN_MS, parsed));
}
/** A single skip marker stored for one candidate within one session. */
type SkipEntry = {
  // Timestamp (ms) at or after which the marker is treated as expired.
  expiresAtMs: number;
  // Failure reason supplied by the caller when the marker was written.
  reason: string;
};
// Outer key: session id. Inner key: the candidate key built from provider + model.
type SkipBySession = Map<string, Map<string, SkipEntry>>;
/** Cache state kept on `globalThis`. */
type SkipCacheState = {
  buckets: SkipBySession;
  // `now` value of the most recent global prune; starts at 0.
  lastGlobalPruneAtMs: number;
};
/**
 * Minimum interval between two opportunistic global prunes. Keeps the
 * worst-case cost of the hot write/check paths amortized: even if a gateway
 * tracks thousands of sessions, the whole cache is walked at most once per
 * `GLOBAL_PRUNE_INTERVAL_MS`, not on every call.
 */
const GLOBAL_PRUNE_INTERVAL_MS = 5_000;
/**
 * Return the cache state stored on `globalThis`, creating it on first use.
 *
 * If only the legacy `openclawFallbackSkipCache` map is present, that map is
 * adopted as the bucket table so existing markers are kept; both globals then
 * point at the same map.
 */
function getState(): SkipCacheState {
  const store = globalThis as typeof globalThis & {
    openclawFallbackSkipCache?: SkipBySession;
    openclawFallbackSkipCacheState?: SkipCacheState;
  };
  const existing = store.openclawFallbackSkipCacheState;
  if (existing) {
    return existing;
  }
  const buckets: SkipBySession = store.openclawFallbackSkipCache ?? new Map();
  const created: SkipCacheState = { buckets, lastGlobalPruneAtMs: 0 };
  store.openclawFallbackSkipCacheState = created;
  store.openclawFallbackSkipCache = buckets;
  return created;
}
/** Return the per-session marker table held in the shared cache state. */
function getBuckets(): SkipBySession {
  const { buckets } = getState();
  return buckets;
}
/**
 * Look up the marker map for `sessionId`.
 *
 * @param create - When true, a missing map is created and registered; when
 *   false, a missing map yields `undefined`.
 */
function sessionBucket(sessionId: string, create: boolean): Map<string, SkipEntry> | undefined {
  const allBuckets = getBuckets();
  const existing = allBuckets.get(sessionId);
  if (existing || !create) {
    return existing;
  }
  const fresh = new Map<string, SkipEntry>();
  allBuckets.set(sessionId, fresh);
  return fresh;
}
/** Build the per-session map key for a candidate by delegating to `modelKey`. */
function candidateKey(provider: string, model: string): string {
  const key = modelKey(provider, model);
  return key;
}
/** Delete every marker in `bucket` whose expiry is at or before `now`. */
function pruneExpired(bucket: Map<string, SkipEntry>, now: number): void {
  bucket.forEach((entry, key) => {
    if (entry.expiresAtMs <= now) {
      bucket.delete(key);
    }
  });
}
/**
 * Sweep the whole cache: drop expired markers from every session bucket and
 * remove buckets left empty. Rate-limited by `GLOBAL_PRUNE_INTERVAL_MS`, so
 * calls arriving sooner than that after the previous sweep return immediately.
 *
 * Invoked from the mark/check paths so markers written by sessions that are
 * never queried again still get cleaned up; the per-bucket prune alone only
 * runs when the same session is checked again.
 */
function pruneAllExpired(now: number): void {
  const state = getState();
  const elapsedSinceLastSweep = now - state.lastGlobalPruneAtMs;
  if (elapsedSinceLastSweep < GLOBAL_PRUNE_INTERVAL_MS) {
    return;
  }
  state.lastGlobalPruneAtMs = now;
  state.buckets.forEach((bucket, sessionId) => {
    pruneExpired(bucket, now);
    if (bucket.size === 0) {
      state.buckets.delete(sessionId);
    }
  });
}
/**
 * Record that `(sessionId, provider, model)` should be skipped until the TTL
 * elapses. Re-marking the same triple overwrites the previous marker, which
 * refreshes both the expiry and the reason.
 *
 * The call is a no-op when `sessionId`, `provider`, or `model` is falsy, or
 * when the effective TTL is not a positive number (including `NaN`), so
 * callers do not need to guard themselves.
 *
 * @param params.reason - Failure reason stored with the marker.
 * @param params.now - Current time in ms; defaults to `Date.now()`.
 * @param params.ttlMs - Marker lifetime in ms; defaults to the TTL configured
 *   through the environment.
 */
export function markFallbackCandidateSkipped(params: {
  sessionId: string | undefined;
  provider: string;
  model: string;
  reason: string;
  now?: number;
  ttlMs?: number;
}): void {
  const { sessionId, provider, model, reason } = params;
  if (!sessionId || !provider || !model) {
    return;
  }
  const now = params.now ?? Date.now();
  const ttlMs = params.ttlMs ?? resolveConfiguredSkipTtlMs();
  // `!(ttlMs > 0)` rejects zero, negatives, and NaN. A NaN TTL would otherwise
  // store a marker with a NaN expiry that is never pruned and never skipped.
  if (!(ttlMs > 0)) {
    return;
  }
  pruneAllExpired(now);
  const bucket = sessionBucket(sessionId, true);
  if (!bucket) {
    return;
  }
  bucket.set(candidateKey(provider, model), {
    expiresAtMs: now + ttlMs,
    reason,
  });
}
/**
 * Tell whether `(sessionId, provider, model)` currently carries a live skip
 * marker. As a side effect, expired markers are pruned and a session bucket
 * that ends up empty is removed, which keeps the cache from growing unbounded.
 *
 * @param params.now - Current time in ms; defaults to `Date.now()`.
 * @returns `false` when any identifier is falsy or no unexpired marker exists.
 */
export function isFallbackCandidateSkipped(params: {
  sessionId: string | undefined;
  provider: string;
  model: string;
  now?: number;
}): boolean {
  const { sessionId, provider, model } = params;
  if (!sessionId || !provider || !model) {
    return false;
  }
  const now = params.now ?? Date.now();
  pruneAllExpired(now);
  const bucket = sessionBucket(sessionId, false);
  if (bucket === undefined) {
    return false;
  }
  pruneExpired(bucket, now);
  if (bucket.size === 0) {
    getBuckets().delete(sessionId);
    return false;
  }
  const marker = bucket.get(candidateKey(provider, model));
  if (!marker) {
    return false;
  }
  return marker.expiresAtMs > now;
}
/**
 * Fetch the failure reason stored with the skip marker for a
 * `(sessionId, provider, model)` triple, so the fallback chain can report the
 * original failure reason. This lookup is read-only and prunes nothing.
 *
 * @param params.now - Current time in ms; defaults to `Date.now()`.
 * @returns The recorded reason, or `undefined` when an identifier is falsy,
 *   no marker exists, or the marker has expired.
 */
export function getFallbackCandidateSkipReason(params: {
  sessionId: string | undefined;
  provider: string;
  model: string;
  now?: number;
}): string | undefined {
  const { sessionId, provider, model } = params;
  if (!sessionId || !provider || !model) {
    return undefined;
  }
  const bucket = sessionBucket(sessionId, false);
  if (bucket === undefined) {
    return undefined;
  }
  const now = params.now ?? Date.now();
  const marker = bucket.get(candidateKey(provider, model));
  if (!marker) {
    return undefined;
  }
  const expired = marker.expiresAtMs <= now;
  return expired ? undefined : marker.reason;
}
/** Remove the whole marker bucket of `sessionId`; a falsy id is ignored. */
export function clearFallbackSkipCacheForSession(sessionId: string | undefined): void {
  if (sessionId) {
    getBuckets().delete(sessionId);
  }
}
/**
 * Test-only reset: empties every session bucket and rewinds the global-prune
 * timestamp to 0. Production code must not call this, because the global
 * cache is meant to outlive individual fallback runs.
 */
export function resetFallbackSkipCacheForTest(): void {
  const cache = getState();
  cache.lastGlobalPruneAtMs = 0;
  cache.buckets.clear();
}
/**
 * Test-only accessor that returns the live session-bucket map (not a copy).
 * Production code must not read it: the bucket layout is an implementation
 * detail of the cache and may change shape.
 */
export function peekFallbackSkipBucketsForTest(): SkipBySession {
  const { buckets } = getState();
  return buckets;
}

View File

@@ -19,6 +19,7 @@ import type { AuthProfileStore } from "./auth-profiles/types.js";
import { classifyEmbeddedAgentRunResultForModelFallback } from "./embedded-agent-runner/result-fallback-classifier.js";
import type { EmbeddedAgentRunResult } from "./embedded-agent-runner/types.js";
import { FailoverError } from "./failover-error.js";
import { resetFallbackSkipCacheForTest } from "./fallback-skip-cache.js";
import { MissingAgentHarnessError } from "./harness/errors.js";
import { LiveSessionModelSwitchError } from "./live-model-switch-error.js";
import {
@@ -180,6 +181,7 @@ afterAll(() => {
});
function resetModelFallbackTestState(): void {
resetFallbackSkipCacheForTest();
authRuntimeMock.clear();
authRuntimeMock.runtime.ensureAuthProfileStore.mockClear();
authRuntimeMock.runtime.loadAuthProfileStoreForRuntime.mockClear();
@@ -514,6 +516,75 @@ const INSUFFICIENT_QUOTA_PAYLOAD =
'{"type":"error","error":{"type":"insufficient_quota","message":"Your account has insufficient quota balance to run this request."}}';
describe("runWithModelFallback", () => {
it("uses the opt-in auth skip cache on the second turn for the same session", async () => {
const previous = process.env.OPENCLAW_FALLBACK_SKIP_TTL_MS;
process.env.OPENCLAW_FALLBACK_SKIP_TTL_MS = "60000";
try {
const cfg = makeCfg({
agents: {
defaults: {
model: {
primary: "openai/gpt-5.4",
fallbacks: ["anthropic/claude-opus-4-7", "google/gemini-3.1-pro-preview"],
},
},
},
});
const run = vi.fn(async (provider: string, model: string) => {
if (provider === "openai") {
throw new FailoverError("primary rate limited", {
provider,
model,
reason: "rate_limit",
});
}
if (provider === "anthropic") {
throw new FailoverError("fallback auth failed", {
provider,
model,
reason: "auth",
});
}
return "ok";
});
const first = await runWithModelFallback({
cfg,
provider: "openai",
model: "gpt-5.4",
sessionId: "session:auth-skip",
run,
});
const second = await runWithModelFallback({
cfg,
provider: "openai",
model: "gpt-5.4",
sessionId: "session:auth-skip",
run,
});
expect(first.result).toBe("ok");
expect(second.result).toBe("ok");
expect(run.mock.calls.map(([provider, model]) => `${provider}/${model}`)).toEqual([
"openai/gpt-5.4",
"anthropic/claude-opus-4-7",
"google/gemini-3.1-pro-preview",
"openai/gpt-5.4",
"google/gemini-3.1-pro-preview",
]);
expect(second.attempts.some((attempt) => attempt.provider === "anthropic")).toBe(true);
expect(second.attempts.find((attempt) => attempt.provider === "anthropic")?.error).toContain(
"recent auth failure",
);
} finally {
if (previous === undefined) {
delete process.env.OPENCLAW_FALLBACK_SKIP_TTL_MS;
} else {
process.env.OPENCLAW_FALLBACK_SKIP_TTL_MS = previous;
}
}
});
it("skips auth store bootstrap when no auth profile sources exist", async () => {
authSourceCheckMock.hasAnyAuthProfileStoreSource.mockReturnValue(false);
const run = vi.fn().mockResolvedValueOnce("ok");

View File

@@ -28,6 +28,8 @@ import { isLikelyContextOverflowError } from "./embedded-agent-helpers/errors.js
import type { FailoverReason } from "./embedded-agent-helpers/types.js";
import {
FailoverError,
buildFailoverRemediationHint,
buildProviderReauthCommand,
coerceToFailoverError,
describeFailoverError,
isFailoverError,
@@ -39,6 +41,11 @@ import {
shouldPreserveTransientCooldownProbeSlot,
shouldUseTransientCooldownProbeSlot,
} from "./failover-policy.js";
import {
getFallbackCandidateSkipReason,
isFallbackCandidateSkipped,
markFallbackCandidateSkipped,
} from "./fallback-skip-cache.js";
import { MissingAgentHarnessError, isMissingAgentHarnessError } from "./harness/errors.js";
import { resolveAgentHarnessPolicy } from "./harness/policy.js";
import { getRegisteredAgentHarness } from "./harness/registry.js";
@@ -562,8 +569,12 @@ function throwFallbackFailureSummary(params: {
const summary =
params.attempts.length > 0 ? params.attempts.map(params.formatAttempt).join(" | ") : "unknown";
const remediation = buildFailoverRemediationHint(params.lastError);
const message = remediation
? `All ${params.label} failed (${params.attempts.length || params.candidates.length}): ${summary}. ${remediation}`
: `All ${params.label} failed (${params.attempts.length || params.candidates.length}): ${summary}`;
throw new FallbackSummaryError(
`All ${params.label} failed (${params.attempts.length || params.candidates.length}): ${summary}`,
message,
params.attempts,
params.soonestCooldownExpiry ?? null,
params.lastError instanceof Error ? params.lastError : undefined,
@@ -1203,6 +1214,58 @@ export async function runWithModelFallback<T>(
const requestedModel = requestedCandidate
? sameModelCandidate(candidate, requestedCandidate)
: false;
// Skip-known-bad cache: when a previous turn in this session failed this
// candidate with `auth` / `auth_permanent` (e.g. missing or expired
// credentials), suppress repeat attempts for the cache TTL so we do not
// burn latency on the same broken candidate every turn. Primary is never
// skipped — if the user explicitly requested it we should still surface
// the auth error rather than silently jumping past it.
if (!isPrimary && params.sessionId) {
const skipped = isFallbackCandidateSkipped({
sessionId: params.sessionId,
provider: candidate.provider,
model: candidate.model,
});
if (skipped) {
const skipReason =
getFallbackCandidateSkipReason({
sessionId: params.sessionId,
provider: candidate.provider,
model: candidate.model,
}) ?? "auth";
const reauthCommand = buildProviderReauthCommand(candidate.provider);
const reauthHint = reauthCommand
? `run \`${reauthCommand}\` to re-authenticate`
: "re-authenticate that provider";
const error = `Skipping ${candidate.provider}/${candidate.model}: recent ${skipReason} failure in this session (${reauthHint})`;
attempts.push({
provider: candidate.provider,
model: candidate.model,
error,
reason: skipReason as FailoverReason,
});
await observeDecision({
decision: "skip_candidate",
runId: params.runId,
sessionId: params.sessionId,
lane: params.lane,
requestedProvider: params.provider,
requestedModel: params.model,
candidate,
attempt: i + 1,
total: candidates.length,
reason: skipReason as FailoverReason,
error,
nextCandidate: candidates[i + 1],
isPrimary,
requestedModelMatched: requestedModel,
fallbackConfigured: hasFallbackCandidates,
});
continue;
}
}
let runOptions: ModelFallbackRunOptions | undefined;
let attemptedDuringCooldown = false;
let transientProbeProviderForAttempt: string | null = null;
@@ -1498,6 +1561,23 @@ export async function runWithModelFallback<T>(
throw err;
}
// Record auth-class failures in the session-scoped skip cache so the
// next turn does not re-attempt the same broken candidate. Only mark
// for non-primary candidates — see the skip-check above for rationale.
if (
isKnownFailover &&
!isPrimary &&
params.sessionId &&
(normalized.reason === "auth" || normalized.reason === "auth_permanent")
) {
markFallbackCandidateSkipped({
sessionId: params.sessionId,
provider: candidate.provider,
model: candidate.model,
reason: normalized.reason,
});
}
lastError = isKnownFailover ? normalized : err;
await observeFailedCandidate({
attempts,

View File

@@ -211,6 +211,7 @@ async function getApplyFallbackCandidateSelectionToEntry() {
type FallbackRunnerParams = {
provider: string;
model: string;
sessionId?: string;
abortSignal?: AbortSignal;
run: (provider: string, model: string) => Promise<unknown>;
classifyResult?: (params: {
@@ -1152,6 +1153,7 @@ describe("runAgentTurnWithFallback", () => {
"runEmbeddedAgent params",
);
expect(fallbackCall.abortSignal).toBe(replyOperation.abortSignal);
expect(fallbackCall.sessionId).toBe("session");
expect(embeddedCall.abortSignal).toBe(replyOperation.abortSignal);
});

View File

@@ -72,6 +72,7 @@ type ModelFallbackParams = {
model?: string;
abortSignal?: AbortSignal;
agentId?: string;
sessionId?: string;
sessionKey?: string;
fallbacksOverride?: unknown[];
resolveAgentHarnessRuntimeOverride?: (provider: string, model: string) => string | undefined;
@@ -775,6 +776,7 @@ describe("runMemoryFlushIfNeeded", () => {
expect(fallbackCall.provider).toBe("ollama");
expect(fallbackCall.model).toBe("qwen3:8b");
expect(fallbackCall.abortSignal).toBe(replyOperation.abortSignal);
expect(fallbackCall.sessionId).toBe("session");
expect(fallbackCall.fallbacksOverride).toEqual([]);
expect(runEmbeddedAgentMock).toHaveBeenCalledTimes(1);
const agentCall = requireEmbeddedAgentCall();

View File

@@ -1501,6 +1501,7 @@ describe("createFollowupRunner runtime config", () => {
const call = requireLastMockCallArg(runEmbeddedAgentMock, "run embedded agent");
expect(fallbackCall.abortSignal).toBeInstanceOf(AbortSignal);
expect(fallbackCall.abortSignal).not.toBe(abortController.signal);
expect(fallbackCall.sessionId).toBe("session");
expect(call.abortSignal).toBe(fallbackCall.abortSignal);
});

View File

@@ -663,6 +663,7 @@ export function createFollowupRunner(params: {
...resolveModelFallbackOptions(run, runtimeConfig),
cfg: runtimeConfig,
runId,
sessionId: run.sessionId,
abortSignal: runAbortSignal,
resolveAgentHarnessRuntimeOverride: (provider) =>
resolveSessionRuntimeOverrideForProvider({

View File

@@ -348,6 +348,11 @@ export function registerModelsCli(program: Command) {
.option("--device-code", "Use the provider device-code auth method", false)
.option("--profile-id <id>", "Auth profile id override for single-profile login methods")
.option("--set-default", "Apply the provider's default model recommendation", false)
.option(
"--force",
"Remove existing profiles for the provider before logging in (use when a cached OAuth profile is stuck or you want to switch accounts)",
false,
)
.action(async (opts, command) => {
if (opts.deviceCode && typeof opts.method === "string" && opts.method !== "device-code") {
throw new Error(
@@ -363,6 +368,7 @@ export function registerModelsCli(program: Command) {
method: opts.deviceCode ? "device-code" : (opts.method as string | undefined),
profileId: opts.profileId as string | undefined,
setDefault: Boolean(opts.setDefault),
force: Boolean(opts.force),
agent,
},
defaultRuntime,

View File

@@ -48,6 +48,7 @@ const mocks = vi.hoisted(() => ({
resolveDefaultAgentWorkspaceDir: vi.fn(),
upsertAuthProfile: vi.fn(),
upsertAuthProfileWithLock: vi.fn(),
removeProviderAuthProfilesWithLock: vi.fn(),
resolvePluginProviders: vi.fn(),
createClackPrompter: vi.fn(),
loadValidConfigOrThrow: vi.fn(),
@@ -67,6 +68,7 @@ const mocks = vi.hoisted(() => ({
// Replace the auth-profiles `profiles.js` module with the shared `mocks`
// stubs so each test can program and inspect these helpers. The factory only
// references `mocks`, which is created through `vi.hoisted`, so the stubs are
// already defined when this factory runs.
// NOTE(review): the login command appears to import
// `removeProviderAuthProfilesWithLock` via the `auth-profiles.js` barrel;
// presumably that barrel re-exports from this module — confirm, otherwise this
// stub would not intercept the call.
vi.mock("../../agents/auth-profiles/profiles.js", () => ({
listProfilesForProvider: mocks.listProfilesForProvider,
promoteAuthProfileInOrder: mocks.promoteAuthProfileInOrder,
removeProviderAuthProfilesWithLock: mocks.removeProviderAuthProfilesWithLock,
upsertAuthProfile: mocks.upsertAuthProfile,
upsertAuthProfileWithLock: mocks.upsertAuthProfileWithLock,
}));
@@ -362,6 +364,8 @@ describe("modelsAuthLoginCommand", () => {
mocks.upsertAuthProfileWithLock.mockReset();
mocks.upsertAuthProfileWithLock.mockResolvedValue({ version: 1, profiles: {} });
mocks.promoteAuthProfileInOrder.mockReset();
mocks.removeProviderAuthProfilesWithLock.mockReset();
mocks.removeProviderAuthProfilesWithLock.mockResolvedValue({ version: 1, profiles: {} });
mocks.resolveDefaultAgentId.mockReturnValue("main");
mocks.resolveAgentDir.mockReturnValue("/tmp/openclaw/agents/main");
@@ -1177,6 +1181,95 @@ describe("modelsAuthLoginCommand", () => {
expect(runProviderAuth).toHaveBeenCalledOnce();
});
it("--force purges cached profiles for the provider before login", async () => {
  // With `force: true`, the purge helper must be invoked for the provider and
  // agent directory, the auth flow must still run once, and the operator must
  // be told that cached profiles were removed.
  const rt = createRuntime();
  const expectedPurgeArgs = {
    provider: "openai",
    agentDir: "/tmp/openclaw/agents/main",
  };
  const expectedLogFragment = 'Removed cached auth profiles for provider "openai"';

  await modelsAuthLoginCommand({ provider: "openai", force: true }, rt);

  expect(mocks.removeProviderAuthProfilesWithLock).toHaveBeenCalledWith(expectedPurgeArgs);
  expect(runProviderAuth).toHaveBeenCalledOnce();
  expect(rt.log).toHaveBeenCalledWith(expect.stringContaining(expectedLogFragment));
});
it("--force does not purge when omitted", async () => {
  // Baseline: without the `force` option the purge helper is never touched,
  // while the auth flow still runs exactly once.
  const rt = createRuntime();

  await modelsAuthLoginCommand({ provider: "openai" }, rt);

  const purgeMock = mocks.removeProviderAuthProfilesWithLock;
  expect(purgeMock).not.toHaveBeenCalled();
  expect(runProviderAuth).toHaveBeenCalledOnce();
});
it("--force fails before login when purge throws", async () => {
  // A rejected purge must surface as a thrown error that carries the
  // underlying message; nothing is reported through runtime.error and the
  // auth flow is never started.
  const rt = createRuntime();
  const purgeFailure = new Error("disk full");
  mocks.removeProviderAuthProfilesWithLock.mockRejectedValueOnce(purgeFailure);

  const pendingLogin = modelsAuthLoginCommand({ provider: "openai", force: true }, rt);

  await expect(pendingLogin).rejects.toThrow(
    'Could not clear cached profiles for "openai" before re-login: disk full',
  );
  expect(rt.error).not.toHaveBeenCalled();
  expect(runProviderAuth).not.toHaveBeenCalled();
});
it("--force fails before login when purge cannot update the profile store", async () => {
  // The purge helper resolving to `null` is treated as a failed store update:
  // the command must reject with the wrapped message, leave runtime.error
  // untouched, and never start the auth flow.
  const rt = createRuntime();
  mocks.removeProviderAuthProfilesWithLock.mockResolvedValueOnce(null);

  const pendingLogin = modelsAuthLoginCommand({ provider: "openai", force: true }, rt);

  await expect(pendingLogin).rejects.toThrow(
    'Could not clear cached profiles for "openai" before re-login: profile store update failed',
  );
  expect(rt.error).not.toHaveBeenCalled();
  expect(runProviderAuth).not.toHaveBeenCalled();
});
it("--force does NOT purge cached profiles when the requested auth method is unknown", async () => {
  const rt = createRuntime();
  const oauthRun = vi.fn().mockResolvedValue({ profiles: [] });
  const apiKeyRun = vi.fn().mockResolvedValue({ profiles: [] });

  // Provider exposing two auth methods; note the API-key method is registered
  // under the id "api-key" (hyphen).
  const openaiProvider = createProvider({
    id: "openai",
    label: "OpenAI",
    run: oauthRun as ProviderPlugin["auth"][number]["run"],
    auth: [
      {
        id: "oauth",
        label: "ChatGPT Login",
        kind: "oauth",
        run: oauthRun,
      },
      {
        id: "api-key",
        label: "OpenAI API Key",
        kind: "api_key",
        run: apiKeyRun,
      },
    ],
  });
  mocks.resolvePluginSetupProvider.mockReturnValue(openaiProvider);

  // Requesting "api_key" (underscore) does not match the registered "api-key",
  // so method selection yields nothing and the command throws
  // "Unknown auth method". The purge must not have happened by then: deleting
  // working credentials before any auth flow can start would leave the user
  // with no usable profile.
  const pendingLogin = modelsAuthLoginCommand(
    { provider: "openai", method: "api_key", force: true },
    rt,
  );

  await expect(pendingLogin).rejects.toThrow("Unknown auth method");
  expect(mocks.removeProviderAuthProfilesWithLock).not.toHaveBeenCalled();
  expect(oauthRun).not.toHaveBeenCalled();
  expect(apiKeyRun).not.toHaveBeenCalled();
});
it("reports loaded plugin providers when requested provider is unavailable", async () => {
const runtime = createRuntime();

View File

@@ -20,7 +20,10 @@ import {
resolveAgentWorkspaceDir,
resolveDefaultAgentId,
} from "../../agents/agent-scope.js";
import { externalCliDiscoveryForProviderAuth } from "../../agents/auth-profiles.js";
import {
externalCliDiscoveryForProviderAuth,
removeProviderAuthProfilesWithLock,
} from "../../agents/auth-profiles.js";
import {
listProfilesForProvider,
promoteAuthProfileInOrder,
@@ -874,6 +877,13 @@ type LoginOptions = {
setDefault?: boolean;
yes?: boolean;
agent?: string;
/**
* When true, remove any existing auth profiles for the resolved provider
* before invoking the auth flow. This is the escape hatch for stuck
* cached OAuth profiles where the standard `auth login` short-circuits
* because credentials already exist on disk.
*/
force?: boolean;
};
/**
@@ -982,6 +992,7 @@ export async function modelsAuthLoginCommand(opts: LoginOptions, runtime: Runtim
`Unknown provider. Run ${formatCliCommand("openclaw models status")} or ${formatCliCommand("openclaw plugins list")} to see available provider plugins.`,
);
}
const chosenMethod = await pickProviderAuthMethod({
provider: selectedProvider,
requestedMethod: opts.method,
@@ -994,6 +1005,35 @@ export async function modelsAuthLoginCommand(opts: LoginOptions, runtime: Runtim
);
}
if (opts.force) {
// Purge existing profiles for this provider only after we have a valid
// auth method to invoke. Running the purge earlier (before method
// resolution) would delete the user's working credentials and then
// throw on an unresolvable `--method`, leaving them without a usable
// profile and no auth flow started. This is the documented escape
// hatch for stuck OAuth credentials (expired token, swapped account,
// etc.) where `auth login` would otherwise short-circuit on the cached
// profile.
try {
const clearedStore = await removeProviderAuthProfilesWithLock({
provider: selectedProvider.id,
agentDir,
});
if (!clearedStore) {
throw new Error("profile store update failed");
}
runtime.log(
`Removed cached auth profiles for provider "${selectedProvider.id}" (--force). Running fresh auth flow.`,
);
} catch (err) {
const message = err instanceof Error ? err.message : String(err);
throw new Error(
`Could not clear cached profiles for "${selectedProvider.id}" before re-login: ${message}. Re-login was not started because --force must remove cached profiles first.`,
{ cause: err },
);
}
}
await runProviderAuthMethod({
config,
agentDir,