fix(agents): honor OpenAI-compatible cache retention

Carry over #82973 and fix #81281 by preserving explicit cacheRetention for OpenAI-compatible completions providers that opt into prompt-cache-key support.

The change keeps explicit cacheRetention suppressed for OpenAI-compatible providers without compat.supportsPromptCacheKey, adds regression coverage for both paths, and updates prompt-caching docs for prompt_cache_key / prompt_cache_retention behavior.

Fixes #81281.
Supersedes #82973.

Co-authored-by: lonexreb <reach2shubhankar@gmail.com>
This commit is contained in:
Peter Steinberger
2026-05-27 13:21:23 +01:00
committed by GitHub
parent 517ce3df75
commit 3e351b718e
7 changed files with 237 additions and 5 deletions

View File

@@ -104,8 +104,8 @@ Per-agent heartbeat is supported at `agents.list[].heartbeat`.
### OpenAI (direct API)
- Prompt caching is automatic on supported recent models. OpenClaw does not need to inject block-level cache markers.
- OpenClaw uses `prompt_cache_key` to keep cache routing stable across turns and uses `prompt_cache_retention: "24h"` only when `cacheRetention: "long"` is selected on direct OpenAI hosts.
- OpenAI-compatible Completions providers receive `prompt_cache_key` only when their model config explicitly sets `compat.supportsPromptCacheKey: true`; `cacheRetention: "none"` still suppresses it.
- OpenClaw uses `prompt_cache_key` to keep cache routing stable across turns. Direct OpenAI hosts use `prompt_cache_retention: "24h"` when `cacheRetention: "long"` is selected.
- OpenAI-compatible Completions providers receive `prompt_cache_key` only when their model config explicitly sets `compat.supportsPromptCacheKey: true`; with that same opt-in, explicit `cacheRetention: "long"` also forwards `prompt_cache_retention: "24h"`, and `cacheRetention: "none"` suppresses both fields.
- OpenAI responses expose cached prompt tokens via `usage.prompt_tokens_details.cached_tokens` (or `input_tokens_details.cached_tokens` on Responses API events). OpenClaw maps that to `cacheRead`.
- OpenAI does not expose a separate cache-write token counter, so `cacheWrite` stays `0` on OpenAI paths even when the provider is warming a cache.
- OpenAI returns useful tracing and rate-limit headers such as `x-request-id`, `openai-processing-ms`, and `x-ratelimit-*`, but cache-hit accounting should come from the usage payload, not from headers.

View File

@@ -4637,6 +4637,67 @@ describe("openai transport stream", () => {
expect(notOptedIn.prompt_cache_key).toBeUndefined();
});
// Opt-in path: the model sets compat.supportsPromptCacheKey and the caller asks
// for cacheRetention "long", so the built completions params are expected to
// carry both prompt_cache_key and prompt_cache_retention.
it("emits prompt_cache_retention=24h for completions when cacheRetention is long", () => {
const model = {
id: "custom-model",
name: "Custom Model",
api: "openai-completions",
provider: "custom-cpa",
baseUrl: "https://proxy.example.com/v1",
// Explicit opt-in; this test depends on it being true.
compat: { supportsPromptCacheKey: true },
reasoning: false,
input: ["text"],
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
contextWindow: 32768,
maxTokens: 8192,
} as unknown as Model<"openai-completions">;
const context = {
systemPrompt: "system",
messages: [],
tools: [],
} as never;
const longRetention = buildOpenAICompletionsParams(model, context, {
sessionId: "session-123",
cacheRetention: "long",
}) as { prompt_cache_key?: string; prompt_cache_retention?: string };
// The session id is used as the cache key; "long" is expected as "24h".
expect(longRetention.prompt_cache_key).toBe("session-123");
expect(longRetention.prompt_cache_retention).toBe("24h");
});
// Counterpart to the "long" case: with the same opted-in model, a "short" or
// absent cacheRetention must not add prompt_cache_retention to the params.
it("omits prompt_cache_retention for completions when cacheRetention is short or unset", () => {
const model = {
id: "custom-model",
name: "Custom Model",
api: "openai-completions",
provider: "custom-cpa",
baseUrl: "https://proxy.example.com/v1",
compat: { supportsPromptCacheKey: true },
reasoning: false,
input: ["text"],
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
contextWindow: 32768,
maxTokens: 8192,
} as unknown as Model<"openai-completions">;
const context = {
systemPrompt: "system",
messages: [],
tools: [],
} as never;
// Explicit "short" retention.
const shortRetention = buildOpenAICompletionsParams(model, context, {
sessionId: "session-123",
cacheRetention: "short",
});
// No cacheRetention option at all.
const defaultRetention = buildOpenAICompletionsParams(model, context, {
sessionId: "session-123",
});
// not.toHaveProperty checks the key is absent, not merely undefined-valued.
expect(shortRetention).not.toHaveProperty("prompt_cache_retention");
expect(defaultRetention).not.toHaveProperty("prompt_cache_retention");
});
it("sorts Chat Completions tools by function name for stable prompt-cache payloads", () => {
const model = {
id: "custom-model",

View File

@@ -3499,6 +3499,15 @@ export function buildOpenAICompletionsParams(
}
if (compat.supportsPromptCacheKey && cacheRetention !== "none" && options?.sessionId) {
params.prompt_cache_key = options.sessionId;
// When the caller explicitly opted into long retention, forward the
// canonical prompt_cache_retention value alongside the cache key so
// OpenAI-compatible completions backends (oMLX, llama.cpp, official
// OpenAI, etc.) can honor the 24h prefix-cache lifetime. Without this
// the key reaches the wire but the retention preference is silently
// dropped (issue #81281).
if (cacheRetention === "long") {
params.prompt_cache_retention = "24h";
}
}
if (options?.temperature !== undefined) {
params.temperature = options.temperature;

View File

@@ -2647,6 +2647,55 @@ describe("applyExtraParamsToAgent", () => {
expect(calls[0]?.cacheRetention).toBe("long");
});
// Wrapper-level regression for issue #81281: a configured cacheRetention of
// "long" should reach the wrapped stream call when the call-time model opts
// in via compat.supportsPromptCacheKey.
it("passes through explicit cacheRetention for prompt-cache-key openai-completions providers", () => {
// NOTE(review): `calls` presumably records the options each stream call
// receives — confirm against createOptionsCaptureAgent.
const { calls, agent } = createOptionsCaptureAgent();
const cfg = buildModelConfig("omlx-local/local_model", {
cacheRetention: "long",
});
applyExtraParamsToAgent(agent, cfg, "omlx-local", "local_model");
const model = {
api: "openai-completions",
provider: "omlx-local",
id: "local_model",
compat: { supportsPromptCacheKey: true },
} as unknown as Model<"openai-completions">;
const context: Context = { messages: [] };
// Only sessionId is passed by the caller; cacheRetention comes from cfg.
void agent.streamFn?.(model, context, {
sessionId: "session-81281",
});
expect(calls).toHaveLength(1);
expect(calls[0]?.cacheRetention).toBe("long");
expect(calls[0]?.sessionId).toBe("session-81281");
});
// Negative counterpart: same config and provider, but the call-time model has
// no compat.supportsPromptCacheKey, so the configured cacheRetention must not
// be forwarded while the caller's sessionId still is.
it("keeps explicit cacheRetention off openai-completions providers without prompt-cache-key support", () => {
const { calls, agent } = createOptionsCaptureAgent();
const cfg = buildModelConfig("omlx-local/local_model", {
cacheRetention: "long",
});
applyExtraParamsToAgent(agent, cfg, "omlx-local", "local_model");
// Deliberately no `compat` field on this model.
const model = {
api: "openai-completions",
provider: "omlx-local",
id: "local_model",
} as Model<"openai-completions">;
const context: Context = { messages: [] };
void agent.streamFn?.(model, context, {
sessionId: "session-81281",
});
expect(calls).toHaveLength(1);
expect(calls[0]?.cacheRetention).toBeUndefined();
expect(calls[0]?.sessionId).toBe("session-81281");
});
it("passes through explicit cacheRetention for custom anthropic-messages providers", () => {
const { calls, agent } = createOptionsCaptureAgent();
const cfg = {

View File

@@ -494,11 +494,20 @@ function createStreamFnWithExtraParams(
streamParams.seed = resolvedSeed;
}
// Detects the explicit prompt-cache-key opt-in on a loosely typed model value.
// Yields true only when `compat` is a non-null object whose
// `supportsPromptCacheKey` property is strictly `true`; every other shape
// (nullish model, missing/primitive/function compat, truthy non-boolean flag)
// yields false.
const readSupportsPromptCacheKey = (m: unknown): boolean => {
  const compatValue = (m as { compat?: unknown })?.compat;
  const isCompatObject = typeof compatValue === "object" && compatValue !== null;
  return isCompatObject
    ? (compatValue as Record<string, unknown>).supportsPromptCacheKey === true
    : false;
};
const initialCacheRetention = resolveCacheRetention(
extraParams,
provider,
typeof model?.api === "string" ? model.api : undefined,
typeof model?.id === "string" ? model.id : undefined,
readSupportsPromptCacheKey(model),
);
if (Object.keys(streamParams).length > 0 || initialCacheRetention) {
const debugParams = initialCacheRetention
@@ -514,6 +523,7 @@ function createStreamFnWithExtraParams(
provider,
typeof callModel.api === "string" ? callModel.api : undefined,
typeof callModel.id === "string" ? callModel.id : undefined,
readSupportsPromptCacheKey(callModel),
);
const hasStreamParams = Object.keys(streamParams).length > 0 || cacheRetention;
if (!hasStreamParams) {

View File

@@ -30,6 +30,100 @@ describe("prompt cache retention", () => {
).toBeUndefined();
});
it("passes explicit cacheRetention through for openai-completions providers when supportsPromptCacheKey (issue #81281)", () => {
// Regression: openai-completions providers with prefix-caching backends
// (oMLX, llama.cpp, etc.) set compat.supportsPromptCacheKey: true and
// cacheRetention: "long" but the wrapper was silently dropping the
// user's explicit cacheRetention because the provider is neither in the
// anthropic family nor google-eligible.
// The trailing `true` in each call is the supportsPromptCacheKey argument;
// every explicit value ("long", "short", "none") must come back unchanged.
expect(
resolveCacheRetention(
{ cacheRetention: "long" },
"omlx-local",
"openai-completions",
"local_model",
true,
),
).toBe("long");
expect(
resolveCacheRetention(
{ cacheRetention: "short" },
"omlx-local",
"openai-completions",
"local_model",
true,
),
).toBe("short");
expect(
resolveCacheRetention(
{ cacheRetention: "none" },
"omlx-local",
"openai-completions",
"local_model",
true,
),
).toBe("none");
});
it("does not honor explicit cacheRetention for openai-completions without supportsPromptCacheKey", () => {
// Providers that route via openai-completions but do not advertise prompt
// caching (e.g. amazon-bedrock proxying amazon.* nova models) must keep
// the explicit cacheRetention from leaking into the outgoing payload.
// Case 1: the supportsPromptCacheKey argument is omitted entirely.
expect(
resolveCacheRetention(
{ cacheRetention: "long" },
"amazon-bedrock",
"openai-completions",
"amazon.nova-micro-v1:0",
),
).toBeUndefined();
// Case 2: the supportsPromptCacheKey argument is passed as an explicit false.
expect(
resolveCacheRetention(
{ cacheRetention: "long" },
"omlx-local",
"openai-completions",
"local_model",
false,
),
).toBeUndefined();
});
it("returns undefined for openai-completions without explicit cacheRetention", () => {
// Without an explicit user choice, openai-completions providers fall back
// to the transport-level default ("short") rather than receiving a
// wrapper-injected value.
// Covers both "no extra params at all" (undefined) and "empty extra params"
// ({}), each with supportsPromptCacheKey passed as true.
expect(
resolveCacheRetention(undefined, "omlx-local", "openai-completions", "local_model", true),
).toBeUndefined();
expect(
resolveCacheRetention({}, "omlx-local", "openai-completions", "local_model", true),
).toBeUndefined();
});
it("does not map legacy cacheControlTtl for openai-completions prompt-cache-key providers", () => {
// Legacy TTL aliases were Anthropic/Google semantics; OpenAI-compatible
// completions providers need an explicit cacheRetention value before the
// wrapper forwards retention to the transport.
// "1h" would map to "long" and "5m" to "short" for anthropic-family or
// google-eligible models; here both must resolve to undefined even though
// supportsPromptCacheKey is true.
expect(
resolveCacheRetention(
{ cacheControlTtl: "1h" },
"omlx-local",
"openai-completions",
"local_model",
true,
),
).toBeUndefined();
expect(
resolveCacheRetention(
{ cacheControlTtl: "5m" },
"omlx-local",
"openai-completions",
"local_model",
true,
),
).toBeUndefined();
});
it("identifies supported direct Google cache families", () => {
expect(
isGooglePromptCacheEligible({

View File

@@ -19,6 +19,7 @@ export function resolveCacheRetention(
provider: string,
modelApi?: string,
modelId?: string,
supportsPromptCacheKey?: boolean,
): CacheRetention | undefined {
const hasExplicitCacheConfig =
extraParams?.cacheRetention !== undefined || extraParams?.cacheControlTtl !== undefined;
@@ -29,8 +30,16 @@ export function resolveCacheRetention(
hasExplicitCacheConfig,
});
const googleEligible = isGooglePromptCacheEligible({ modelApi, modelId });
// OpenAI-compatible completions backends (oMLX, llama.cpp, etc.) opt into
// prompt caching via `compat.supportsPromptCacheKey: true`. They sit outside
// the anthropic/google family gates, so until this flag was consulted here
// the user's explicit `cacheRetention` was dropped before the transport
// layer could emit it (issue #81281). Proxies that route non-cacheable
// models via the same openai-completions wire (amazon-bedrock + amazon.*
// nova models) leave the flag unset, so the existing family gate still
// applies to them.
const cacheKeyEligible = supportsPromptCacheKey === true;
if (!family && !googleEligible) {
if (!family && !googleEligible && !cacheKeyEligible) {
return undefined;
}
@@ -40,10 +49,10 @@ export function resolveCacheRetention(
}
const legacy = extraParams?.cacheControlTtl;
if (legacy === "5m") {
if (legacy === "5m" && (family || googleEligible)) {
return "short";
}
if (legacy === "1h") {
if (legacy === "1h" && (family || googleEligible)) {
return "long";
}