mirror of
https://github.com/openclaw/openclaw.git
synced 2026-06-06 05:51:15 +08:00
fix(agents): honor OpenAI-compatible cache retention
Carry over #82973 and fix #81281 by preserving explicit cacheRetention for OpenAI-compatible completions providers that opt into prompt-cache-key support. The change keeps explicit cacheRetention suppressed for OpenAI-compatible providers without compat.supportsPromptCacheKey, adds regression coverage for both paths, and updates prompt-caching docs for prompt_cache_key / prompt_cache_retention behavior. Fixes #81281. Supersedes #82973. Co-authored-by: lonexreb <reach2shubhankar@gmail.com>
This commit is contained in:
committed by
GitHub
parent
517ce3df75
commit
3e351b718e
@@ -104,8 +104,8 @@ Per-agent heartbeat is supported at `agents.list[].heartbeat`.
|
||||
### OpenAI (direct API)
|
||||
|
||||
- Prompt caching is automatic on supported recent models. OpenClaw does not need to inject block-level cache markers.
|
||||
- OpenClaw uses `prompt_cache_key` to keep cache routing stable across turns and uses `prompt_cache_retention: "24h"` only when `cacheRetention: "long"` is selected on direct OpenAI hosts.
|
||||
- OpenAI-compatible Completions providers receive `prompt_cache_key` only when their model config explicitly sets `compat.supportsPromptCacheKey: true`; `cacheRetention: "none"` still suppresses it.
|
||||
- OpenClaw uses `prompt_cache_key` to keep cache routing stable across turns. Direct OpenAI hosts use `prompt_cache_retention: "24h"` when `cacheRetention: "long"` is selected.
|
||||
- OpenAI-compatible Completions providers receive `prompt_cache_key` only when their model config explicitly sets `compat.supportsPromptCacheKey: true`; with that same opt-in, explicit `cacheRetention: "long"` also forwards `prompt_cache_retention: "24h"`, and `cacheRetention: "none"` suppresses both fields.
|
||||
- OpenAI responses expose cached prompt tokens via `usage.prompt_tokens_details.cached_tokens` (or `input_tokens_details.cached_tokens` on Responses API events). OpenClaw maps that to `cacheRead`.
|
||||
- OpenAI does not expose a separate cache-write token counter, so `cacheWrite` stays `0` on OpenAI paths even when the provider is warming a cache.
|
||||
- OpenAI returns useful tracing and rate-limit headers such as `x-request-id`, `openai-processing-ms`, and `x-ratelimit-*`, but cache-hit accounting should come from the usage payload, not from headers.
|
||||
|
||||
@@ -4637,6 +4637,67 @@ describe("openai transport stream", () => {
|
||||
expect(notOptedIn.prompt_cache_key).toBeUndefined();
|
||||
});
|
||||
|
||||
it("emits prompt_cache_retention=24h for completions when cacheRetention is long", () => {
|
||||
const model = {
|
||||
id: "custom-model",
|
||||
name: "Custom Model",
|
||||
api: "openai-completions",
|
||||
provider: "custom-cpa",
|
||||
baseUrl: "https://proxy.example.com/v1",
|
||||
compat: { supportsPromptCacheKey: true },
|
||||
reasoning: false,
|
||||
input: ["text"],
|
||||
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
||||
contextWindow: 32768,
|
||||
maxTokens: 8192,
|
||||
} as unknown as Model<"openai-completions">;
|
||||
const context = {
|
||||
systemPrompt: "system",
|
||||
messages: [],
|
||||
tools: [],
|
||||
} as never;
|
||||
|
||||
const longRetention = buildOpenAICompletionsParams(model, context, {
|
||||
sessionId: "session-123",
|
||||
cacheRetention: "long",
|
||||
}) as { prompt_cache_key?: string; prompt_cache_retention?: string };
|
||||
|
||||
expect(longRetention.prompt_cache_key).toBe("session-123");
|
||||
expect(longRetention.prompt_cache_retention).toBe("24h");
|
||||
});
|
||||
|
||||
it("omits prompt_cache_retention for completions when cacheRetention is short or unset", () => {
|
||||
const model = {
|
||||
id: "custom-model",
|
||||
name: "Custom Model",
|
||||
api: "openai-completions",
|
||||
provider: "custom-cpa",
|
||||
baseUrl: "https://proxy.example.com/v1",
|
||||
compat: { supportsPromptCacheKey: true },
|
||||
reasoning: false,
|
||||
input: ["text"],
|
||||
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
||||
contextWindow: 32768,
|
||||
maxTokens: 8192,
|
||||
} as unknown as Model<"openai-completions">;
|
||||
const context = {
|
||||
systemPrompt: "system",
|
||||
messages: [],
|
||||
tools: [],
|
||||
} as never;
|
||||
|
||||
const shortRetention = buildOpenAICompletionsParams(model, context, {
|
||||
sessionId: "session-123",
|
||||
cacheRetention: "short",
|
||||
});
|
||||
const defaultRetention = buildOpenAICompletionsParams(model, context, {
|
||||
sessionId: "session-123",
|
||||
});
|
||||
|
||||
expect(shortRetention).not.toHaveProperty("prompt_cache_retention");
|
||||
expect(defaultRetention).not.toHaveProperty("prompt_cache_retention");
|
||||
});
|
||||
|
||||
it("sorts Chat Completions tools by function name for stable prompt-cache payloads", () => {
|
||||
const model = {
|
||||
id: "custom-model",
|
||||
|
||||
@@ -3499,6 +3499,15 @@ export function buildOpenAICompletionsParams(
|
||||
}
|
||||
if (compat.supportsPromptCacheKey && cacheRetention !== "none" && options?.sessionId) {
|
||||
params.prompt_cache_key = options.sessionId;
|
||||
// When the caller explicitly opted into long retention, forward the
|
||||
// canonical prompt_cache_retention value alongside the cache key so
|
||||
// OpenAI-compatible completions backends (oMLX, llama.cpp, official
|
||||
// OpenAI, etc.) can honor the 24h prefix-cache lifetime. Without this,
|
||||
// the key reaches the wire but the retention preference is silently
|
||||
// dropped (issue #81281).
|
||||
if (cacheRetention === "long") {
|
||||
params.prompt_cache_retention = "24h";
|
||||
}
|
||||
}
|
||||
if (options?.temperature !== undefined) {
|
||||
params.temperature = options.temperature;
|
||||
|
||||
@@ -2647,6 +2647,55 @@ describe("applyExtraParamsToAgent", () => {
|
||||
expect(calls[0]?.cacheRetention).toBe("long");
|
||||
});
|
||||
|
||||
it("passes through explicit cacheRetention for prompt-cache-key openai-completions providers", () => {
|
||||
const { calls, agent } = createOptionsCaptureAgent();
|
||||
const cfg = buildModelConfig("omlx-local/local_model", {
|
||||
cacheRetention: "long",
|
||||
});
|
||||
|
||||
applyExtraParamsToAgent(agent, cfg, "omlx-local", "local_model");
|
||||
|
||||
const model = {
|
||||
api: "openai-completions",
|
||||
provider: "omlx-local",
|
||||
id: "local_model",
|
||||
compat: { supportsPromptCacheKey: true },
|
||||
} as unknown as Model<"openai-completions">;
|
||||
const context: Context = { messages: [] };
|
||||
|
||||
void agent.streamFn?.(model, context, {
|
||||
sessionId: "session-81281",
|
||||
});
|
||||
|
||||
expect(calls).toHaveLength(1);
|
||||
expect(calls[0]?.cacheRetention).toBe("long");
|
||||
expect(calls[0]?.sessionId).toBe("session-81281");
|
||||
});
|
||||
|
||||
it("keeps explicit cacheRetention off openai-completions providers without prompt-cache-key support", () => {
|
||||
const { calls, agent } = createOptionsCaptureAgent();
|
||||
const cfg = buildModelConfig("omlx-local/local_model", {
|
||||
cacheRetention: "long",
|
||||
});
|
||||
|
||||
applyExtraParamsToAgent(agent, cfg, "omlx-local", "local_model");
|
||||
|
||||
const model = {
|
||||
api: "openai-completions",
|
||||
provider: "omlx-local",
|
||||
id: "local_model",
|
||||
} as Model<"openai-completions">;
|
||||
const context: Context = { messages: [] };
|
||||
|
||||
void agent.streamFn?.(model, context, {
|
||||
sessionId: "session-81281",
|
||||
});
|
||||
|
||||
expect(calls).toHaveLength(1);
|
||||
expect(calls[0]?.cacheRetention).toBeUndefined();
|
||||
expect(calls[0]?.sessionId).toBe("session-81281");
|
||||
});
|
||||
|
||||
it("passes through explicit cacheRetention for custom anthropic-messages providers", () => {
|
||||
const { calls, agent } = createOptionsCaptureAgent();
|
||||
const cfg = {
|
||||
|
||||
@@ -494,11 +494,20 @@ function createStreamFnWithExtraParams(
|
||||
streamParams.seed = resolvedSeed;
|
||||
}
|
||||
|
||||
const readSupportsPromptCacheKey = (m: unknown): boolean => {
|
||||
const compat = (m as { compat?: unknown })?.compat;
|
||||
if (!compat || typeof compat !== "object") {
|
||||
return false;
|
||||
}
|
||||
return (compat as Record<string, unknown>).supportsPromptCacheKey === true;
|
||||
};
|
||||
|
||||
const initialCacheRetention = resolveCacheRetention(
|
||||
extraParams,
|
||||
provider,
|
||||
typeof model?.api === "string" ? model.api : undefined,
|
||||
typeof model?.id === "string" ? model.id : undefined,
|
||||
readSupportsPromptCacheKey(model),
|
||||
);
|
||||
if (Object.keys(streamParams).length > 0 || initialCacheRetention) {
|
||||
const debugParams = initialCacheRetention
|
||||
@@ -514,6 +523,7 @@ function createStreamFnWithExtraParams(
|
||||
provider,
|
||||
typeof callModel.api === "string" ? callModel.api : undefined,
|
||||
typeof callModel.id === "string" ? callModel.id : undefined,
|
||||
readSupportsPromptCacheKey(callModel),
|
||||
);
|
||||
const hasStreamParams = Object.keys(streamParams).length > 0 || cacheRetention;
|
||||
if (!hasStreamParams) {
|
||||
|
||||
@@ -30,6 +30,100 @@ describe("prompt cache retention", () => {
|
||||
).toBeUndefined();
|
||||
});
|
||||
|
||||
it("passes explicit cacheRetention through for openai-completions providers when supportsPromptCacheKey (issue #81281)", () => {
|
||||
// Regression: openai-completions providers with prefix-caching backends
|
||||
// (oMLX, llama.cpp, etc.) set compat.supportsPromptCacheKey: true and
|
||||
// cacheRetention: "long", but the wrapper was silently dropping the
|
||||
// user's explicit cacheRetention because the provider is neither in the
|
||||
// anthropic family nor google-eligible.
|
||||
expect(
|
||||
resolveCacheRetention(
|
||||
{ cacheRetention: "long" },
|
||||
"omlx-local",
|
||||
"openai-completions",
|
||||
"local_model",
|
||||
true,
|
||||
),
|
||||
).toBe("long");
|
||||
expect(
|
||||
resolveCacheRetention(
|
||||
{ cacheRetention: "short" },
|
||||
"omlx-local",
|
||||
"openai-completions",
|
||||
"local_model",
|
||||
true,
|
||||
),
|
||||
).toBe("short");
|
||||
expect(
|
||||
resolveCacheRetention(
|
||||
{ cacheRetention: "none" },
|
||||
"omlx-local",
|
||||
"openai-completions",
|
||||
"local_model",
|
||||
true,
|
||||
),
|
||||
).toBe("none");
|
||||
});
|
||||
|
||||
it("does not honor explicit cacheRetention for openai-completions without supportsPromptCacheKey", () => {
|
||||
// Providers that route via openai-completions but do not advertise prompt
|
||||
// caching (e.g. amazon-bedrock proxying amazon.* nova models) must keep
|
||||
// the explicit cacheRetention from leaking into the outgoing payload.
|
||||
expect(
|
||||
resolveCacheRetention(
|
||||
{ cacheRetention: "long" },
|
||||
"amazon-bedrock",
|
||||
"openai-completions",
|
||||
"amazon.nova-micro-v1:0",
|
||||
),
|
||||
).toBeUndefined();
|
||||
expect(
|
||||
resolveCacheRetention(
|
||||
{ cacheRetention: "long" },
|
||||
"omlx-local",
|
||||
"openai-completions",
|
||||
"local_model",
|
||||
false,
|
||||
),
|
||||
).toBeUndefined();
|
||||
});
|
||||
|
||||
it("returns undefined for openai-completions without explicit cacheRetention", () => {
|
||||
// Without an explicit user choice, openai-completions providers fall back
|
||||
// to the transport-level default ("short") rather than receiving a
|
||||
// wrapper-injected value.
|
||||
expect(
|
||||
resolveCacheRetention(undefined, "omlx-local", "openai-completions", "local_model", true),
|
||||
).toBeUndefined();
|
||||
expect(
|
||||
resolveCacheRetention({}, "omlx-local", "openai-completions", "local_model", true),
|
||||
).toBeUndefined();
|
||||
});
|
||||
|
||||
it("does not map legacy cacheControlTtl for openai-completions prompt-cache-key providers", () => {
|
||||
// Legacy TTL aliases were Anthropic/Google semantics; OpenAI-compatible
|
||||
// completions providers need an explicit cacheRetention value before the
|
||||
// wrapper forwards retention to the transport.
|
||||
expect(
|
||||
resolveCacheRetention(
|
||||
{ cacheControlTtl: "1h" },
|
||||
"omlx-local",
|
||||
"openai-completions",
|
||||
"local_model",
|
||||
true,
|
||||
),
|
||||
).toBeUndefined();
|
||||
expect(
|
||||
resolveCacheRetention(
|
||||
{ cacheControlTtl: "5m" },
|
||||
"omlx-local",
|
||||
"openai-completions",
|
||||
"local_model",
|
||||
true,
|
||||
),
|
||||
).toBeUndefined();
|
||||
});
|
||||
|
||||
it("identifies supported direct Google cache families", () => {
|
||||
expect(
|
||||
isGooglePromptCacheEligible({
|
||||
|
||||
@@ -19,6 +19,7 @@ export function resolveCacheRetention(
|
||||
provider: string,
|
||||
modelApi?: string,
|
||||
modelId?: string,
|
||||
supportsPromptCacheKey?: boolean,
|
||||
): CacheRetention | undefined {
|
||||
const hasExplicitCacheConfig =
|
||||
extraParams?.cacheRetention !== undefined || extraParams?.cacheControlTtl !== undefined;
|
||||
@@ -29,8 +30,16 @@ export function resolveCacheRetention(
|
||||
hasExplicitCacheConfig,
|
||||
});
|
||||
const googleEligible = isGooglePromptCacheEligible({ modelApi, modelId });
|
||||
// OpenAI-compatible completions backends (oMLX, llama.cpp, etc.) opt into
|
||||
// prompt caching via `compat.supportsPromptCacheKey: true`. Without that
|
||||
// flag they sit outside the anthropic/google family gates, so issue #81281
|
||||
// dropped the user's explicit `cacheRetention` before the transport layer
|
||||
// could emit it. Proxies that route non-cacheable models via the same
|
||||
// openai-completions wire (amazon-bedrock + amazon.* nova models) leave
|
||||
// the flag unset, so the existing family gate still applies to them.
|
||||
const cacheKeyEligible = supportsPromptCacheKey === true;
|
||||
|
||||
if (!family && !googleEligible) {
|
||||
if (!family && !googleEligible && !cacheKeyEligible) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
@@ -40,10 +49,10 @@ export function resolveCacheRetention(
|
||||
}
|
||||
|
||||
const legacy = extraParams?.cacheControlTtl;
|
||||
if (legacy === "5m") {
|
||||
if (legacy === "5m" && (family || googleEligible)) {
|
||||
return "short";
|
||||
}
|
||||
if (legacy === "1h") {
|
||||
if (legacy === "1h" && (family || googleEligible)) {
|
||||
return "long";
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user