From 1824aa07a0d59f63b42a8b7519b2c1cbd74ac348 Mon Sep 17 00:00:00 2001 From: Alix-007 Date: Tue, 2 Jun 2026 20:52:12 +0800 Subject: [PATCH] fix(mistral): enable prompt cache keys Enable Mistral prompt cache keys without long-retention forwarding. Update cached-read pricing and doctor migration for existing Mistral provider config. Fixes #83709. --- docs/reference/prompt-caching.md | 2 +- extensions/mistral/api.test.ts | 18 ++++ extensions/mistral/api.ts | 8 ++ extensions/mistral/model-definitions.test.ts | 20 ++++- extensions/mistral/openclaw.plugin.json | 16 ++-- packages/llm-core/src/types.ts | 2 + src/agents/openai-transport-stream.test.ts | 35 ++++++++ src/agents/openai-transport-stream.ts | 4 +- .../doctor-legacy-config.migrations.test.ts | 63 +++++++++++++- .../legacy-config-compatibility-base.ts | 4 +- .../shared/legacy-config-core-normalizers.ts | 86 ++++++++++++++++--- src/llm/providers/openai-completions.test.ts | 75 ++++++++++++++++ src/llm/providers/openai-completions.ts | 20 +++-- 13 files changed, 319 insertions(+), 34 deletions(-) diff --git a/docs/reference/prompt-caching.md b/docs/reference/prompt-caching.md index ab468559a9d4..b4060aadec77 100644 --- a/docs/reference/prompt-caching.md +++ b/docs/reference/prompt-caching.md @@ -105,7 +105,7 @@ Per-agent heartbeat is supported at `agents.list[].heartbeat`. - Prompt caching is automatic on supported recent models. OpenClaw does not need to inject block-level cache markers. - OpenClaw uses `prompt_cache_key` to keep cache routing stable across turns. Direct OpenAI hosts use `prompt_cache_retention: "24h"` when `cacheRetention: "long"` is selected. -- OpenAI-compatible Completions providers receive `prompt_cache_key` only when their model config explicitly sets `compat.supportsPromptCacheKey: true`; with that same opt-in, explicit `cacheRetention: "long"` also forwards `prompt_cache_retention: "24h"`, and `cacheRetention: "none"` suppresses both fields. +- OpenAI-compatible Completions providers receive `prompt_cache_key` only when their model config explicitly sets `compat.supportsPromptCacheKey: true`. Long-retention forwarding is a separate capability: explicit `cacheRetention: "long"` sends `prompt_cache_retention: "24h"` only when that compat entry also supports long cache retention. Providers such as Mistral can opt into cache keys while setting `compat.supportsLongCacheRetention: false` to suppress the long-retention field. `cacheRetention: "none"` suppresses both fields. - OpenAI responses expose cached prompt tokens via `usage.prompt_tokens_details.cached_tokens` (or `input_tokens_details.cached_tokens` on Responses API events). OpenClaw maps that to `cacheRead`. - OpenAI does not expose a separate cache-write token counter, so `cacheWrite` stays `0` on OpenAI paths even when the provider is warming a cache. - OpenAI returns useful tracing and rate-limit headers such as `x-request-id`, `openai-processing-ms`, and `x-ratelimit-*`, but cache-hit accounting should come from the usage payload, not from headers. diff --git a/extensions/mistral/api.test.ts b/extensions/mistral/api.test.ts index e6e7dcc38a6f..2663afb65ffb 100644 --- a/extensions/mistral/api.test.ts +++ b/extensions/mistral/api.test.ts @@ -12,6 +12,8 @@ import mistralPlugin from "./index.js"; type MistralCompatShape = { maxTokensField?: "max_completion_tokens" | "max_tokens"; reasoningEffortMap?: Record; + supportsLongCacheRetention?: boolean; + supportsPromptCacheKey?: boolean; supportsReasoningEffort?: boolean; supportsStore?: boolean; }; @@ -24,6 +26,14 @@ function supportsStore(model: unknown): boolean | undefined { return readCompat(model)?.supportsStore; } +function supportsPromptCacheKey(model: unknown): boolean | undefined { + return readCompat(model)?.supportsPromptCacheKey; +} + +function supportsLongCacheRetention(model: unknown): boolean | undefined { + return readCompat(model)?.supportsLongCacheRetention; +} + function supportsReasoningEffort(model: unknown): boolean | undefined { return readCompat(model)?.supportsReasoningEffort; } @@ -51,6 +61,8 @@ describe("resolveMistralCompatPatch", () => { it("enables reasoning_effort mapping for mistral-small-latest", () => { expect(resolveMistralCompatPatch({ id: MISTRAL_SMALL_LATEST_ID })).toEqual({ supportsStore: false, + supportsPromptCacheKey: true, + supportsLongCacheRetention: false, supportsReasoningEffort: true, maxTokensField: "max_tokens", reasoningEffortMap: MISTRAL_REASONING_EFFORT_MAP, @@ -60,6 +72,8 @@ describe("resolveMistralCompatPatch", () => { it("enables reasoning_effort mapping for mistral-medium-3-5", () => { expect(resolveMistralCompatPatch({ id: MISTRAL_MEDIUM_3_5_ID })).toEqual({ supportsStore: false, + supportsPromptCacheKey: true, + supportsLongCacheRetention: false, supportsReasoningEffort: true, maxTokensField: "max_tokens", reasoningEffortMap: MISTRAL_REASONING_EFFORT_MAP, @@ -78,6 +92,8 @@ describe("applyMistralModelCompat", () => { it("applies the Mistral request-shape compat flags", () => { const normalized = applyMistralModelCompat({}); expect(supportsStore(normalized)).toBe(false); + expect(supportsPromptCacheKey(normalized)).toBe(true); + expect(supportsLongCacheRetention(normalized)).toBe(false); expect(supportsReasoningEffort(normalized)).toBe(false); expect(maxTokensField(normalized)).toBe("max_tokens"); expect(reasoningEffortMap(normalized)).toBeUndefined(); @@ -128,6 +144,8 @@ describe("applyMistralModelCompat", () => { const model = { compat: { supportsStore: false, + supportsPromptCacheKey: true, + supportsLongCacheRetention: false, supportsReasoningEffort: false, maxTokensField: "max_tokens" as const, }, diff --git a/extensions/mistral/api.ts b/extensions/mistral/api.ts index 10ccbda906dd..749c24dfe049 100644 --- a/extensions/mistral/api.ts +++ b/extensions/mistral/api.ts @@ -14,9 +14,13 @@ const MISTRAL_MAX_TOKENS_FIELD = "max_tokens"; export const MISTRAL_MODEL_TRANSPORT_PATCH = { supportsStore: false, + supportsPromptCacheKey: true, + supportsLongCacheRetention: false, maxTokensField: MISTRAL_MAX_TOKENS_FIELD, } as const satisfies { supportsStore: boolean; + supportsPromptCacheKey: boolean; + supportsLongCacheRetention: boolean; maxTokensField: "max_tokens"; }; @@ -36,6 +40,8 @@ export const MISTRAL_MEDIUM_3_5_ID = "mistral-medium-3-5"; export function resolveMistralCompatPatch(model: { id?: string }): { supportsStore: boolean; + supportsPromptCacheKey: boolean; + supportsLongCacheRetention: boolean; supportsReasoningEffort: boolean; maxTokensField: "max_tokens"; reasoningEffortMap?: Record; @@ -56,6 +62,8 @@ function compatMatchesResolved( const expected = resolveMistralCompatPatch({ id: modelId }); return ( compat?.supportsStore === expected.supportsStore && + compat?.supportsPromptCacheKey === expected.supportsPromptCacheKey && + compat?.supportsLongCacheRetention === expected.supportsLongCacheRetention && compat?.supportsReasoningEffort === expected.supportsReasoningEffort && compat?.maxTokensField === expected.maxTokensField && compat?.reasoningEffortMap === expected.reasoningEffortMap diff --git a/extensions/mistral/model-definitions.test.ts b/extensions/mistral/model-definitions.test.ts index e4af04fe856d..54a81e3097b8 100644 --- a/extensions/mistral/model-definitions.test.ts +++ b/extensions/mistral/model-definitions.test.ts @@ -27,11 +27,29 @@ describe("mistral model definitions", () => { expect(MISTRAL_DEFAULT_COST).toEqual({ input: 0.5, output: 1.5, - cacheRead: 0, + cacheRead: 0.05, cacheWrite: 0, }); }); + it("prices cached Mistral input tokens at ten percent of standard input tokens", () => { + const models = buildMistralCatalogModels(); + + for (const model of models) { + expect(model.cost.cacheRead).toBeCloseTo(model.cost.input * 0.1, 10); + expect(model.cost.cacheWrite).toBe(0); + } + }); + + it("charges nonzero cost for cached-token usage on the default model", () => { + const model = buildMistralModelDefinition(); + const cacheReadTokens = 20_000; + const cacheReadCost = (model.cost.cacheRead / 1_000_000) * cacheReadTokens; + + expect(cacheReadCost).toBeCloseTo(0.001, 10); + expect(cacheReadCost).toBeGreaterThan(0); + }); + it("publishes a curated set of current Mistral catalog models", () => { const models = buildMistralCatalogModels(); const codestral = catalogModelById(models, "codestral-latest"); diff --git a/extensions/mistral/openclaw.plugin.json b/extensions/mistral/openclaw.plugin.json index 3f962d518162..b216d6051324 100644 --- a/extensions/mistral/openclaw.plugin.json +++ b/extensions/mistral/openclaw.plugin.json @@ -33,7 +33,7 @@ "cost": { "input": 0.3, "output": 0.9, - "cacheRead": 0, + "cacheRead": 0.03, "cacheWrite": 0 } }, @@ -46,7 +46,7 @@ "cost": { "input": 0.4, "output": 2, - "cacheRead": 0, + "cacheRead": 0.04, "cacheWrite": 0 } }, @@ -60,7 +60,7 @@ "cost": { "input": 0.5, "output": 1.5, - "cacheRead": 0, + "cacheRead": 0.05, "cacheWrite": 0 } }, @@ -73,7 +73,7 @@ "cost": { "input": 0.5, "output": 1.5, - "cacheRead": 0, + "cacheRead": 0.05, "cacheWrite": 0 } }, @@ -86,7 +86,7 @@ "cost": { "input": 0.4, "output": 2, - "cacheRead": 0, + "cacheRead": 0.04, "cacheWrite": 0 } }, @@ -100,7 +100,7 @@ "cost": { "input": 1.5, "output": 7.5, - "cacheRead": 0, + "cacheRead": 0.15, "cacheWrite": 0 } }, @@ -114,7 +114,7 @@ "cost": { "input": 0.1, "output": 0.3, - "cacheRead": 0, + "cacheRead": 0.01, "cacheWrite": 0 } }, @@ -127,7 +127,7 @@ "cost": { "input": 2, "output": 6, - "cacheRead": 0, + "cacheRead": 0.2, "cacheWrite": 0 } } diff --git a/packages/llm-core/src/types.ts b/packages/llm-core/src/types.ts index 7e91c8995096..93f2b01bf644 100644 --- a/packages/llm-core/src/types.ts +++ b/packages/llm-core/src/types.ts @@ -402,6 +402,8 @@ export interface OpenAICompletionsCompat { cacheControlFormat?: "anthropic"; /** Whether to send known session-affinity headers (`session_id`, `x-client-request-id`, `x-session-affinity`) from `options.sessionId` when caching is enabled. Default: false. */ sendSessionAffinityHeaders?: boolean; + /** Whether the provider supports OpenAI-style `prompt_cache_key`. Default: false for third-party completions providers. */ + supportsPromptCacheKey?: boolean; /** Whether the provider supports long prompt cache retention (`prompt_cache_retention: "24h"` or Anthropic-style `cache_control.ttl: "1h"`, depending on format). Default: true. */ supportsLongCacheRetention?: boolean; } diff --git a/src/agents/openai-transport-stream.test.ts b/src/agents/openai-transport-stream.test.ts index cec9127c1772..dafce9a199f8 100644 --- a/src/agents/openai-transport-stream.test.ts +++ b/src/agents/openai-transport-stream.test.ts @@ -5700,6 +5700,41 @@ describe("openai transport stream", () => { expect(defaultRetention).not.toHaveProperty("prompt_cache_retention"); }); + it("keeps Mistral prompt cache keys without unsupported long retention", () => { + const model = { + id: "mistral-large-latest", + name: "Mistral Large", + api: "openai-completions", + provider: "mistral", + baseUrl: "https://api.mistral.ai/v1", + compat: { + supportsPromptCacheKey: true, + supportsLongCacheRetention: false, + supportsStore: false, + supportsReasoningEffort: false, + maxTokensField: "max_tokens", + }, + reasoning: false, + input: ["text"], + cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, + contextWindow: 32768, + maxTokens: 8192, + } as unknown as Model<"openai-completions">; + const context = { + systemPrompt: "system", + messages: [], + tools: [], + } as never; + + const params = buildOpenAICompletionsParams(model, context, { + sessionId: "session-123", + cacheRetention: "long", + }) as { prompt_cache_key?: string; prompt_cache_retention?: string }; + + expect(params.prompt_cache_key).toBe("session-123"); + expect(params).not.toHaveProperty("prompt_cache_retention"); + }); + it("sorts Chat Completions tools by function name for stable prompt-cache payloads", () => { const model = { id: "custom-model", diff --git a/src/agents/openai-transport-stream.ts b/src/agents/openai-transport-stream.ts index 24d88c3a1ce7..d3d0e02327bb 100644 --- a/src/agents/openai-transport-stream.ts +++ b/src/agents/openai-transport-stream.ts @@ -3386,6 +3386,7 @@ function getCompat(model: OpenAIModeModel): { vercelGatewayRouting: Record; supportsStrictMode: boolean; supportsPromptCacheKey: boolean; + supportsLongCacheRetention: boolean; requiresStringContent: boolean; strictMessageKeys: boolean; visibleReasoningDetailTypes: string[]; @@ -3418,6 +3419,7 @@ function getCompat(model: OpenAIModeModel): { detected.vercelGatewayRouting, supportsStrictMode: compat.supportsStrictMode ?? detected.supportsStrictMode, supportsPromptCacheKey: compat.supportsPromptCacheKey === true, + supportsLongCacheRetention: compat.supportsLongCacheRetention !== false, requiresStringContent: compat.requiresStringContent ?? false, strictMessageKeys: compat.strictMessageKeys === true, visibleReasoningDetailTypes: @@ -4072,7 +4074,7 @@ export function buildOpenAICompletionsParams( // OpenAI, etc.) can honor the 24h prefix-cache lifetime. Without this // the key reaches the wire but the retention preference is silently // dropped (issue #81281). - if (cacheRetention === "long") { + if (cacheRetention === "long" && compat.supportsLongCacheRetention) { params.prompt_cache_retention = "24h"; } } diff --git a/src/commands/doctor-legacy-config.migrations.test.ts b/src/commands/doctor-legacy-config.migrations.test.ts index cb64ef99b42f..b71ee1006754 100644 --- a/src/commands/doctor-legacy-config.migrations.test.ts +++ b/src/commands/doctor-legacy-config.migrations.test.ts @@ -1648,14 +1648,73 @@ describe("normalizeCompatibilityConfigValues", () => { res.config.models?.providers?.mistral?.models?.map((model) => ({ id: model.id, maxTokens: model.maxTokens, + cacheRead: model.cost.cacheRead, })), ).toEqual([ - { id: "mistral-large-latest", maxTokens: 16384 }, - { id: "magistral-small", maxTokens: 40000 }, + { id: "mistral-large-latest", maxTokens: 16384, cacheRead: 0.05 }, + { id: "magistral-small", maxTokens: 40000, cacheRead: 0.05 }, ]); expect(res.changes).toEqual([ "Normalized models.providers.mistral.models[0].maxTokens (262144 → 16384) to avoid Mistral context-window rejects.", + "Normalized models.providers.mistral.models[0].cost.cacheRead (0 → 0.05) for Mistral prompt-cache billing.", "Normalized models.providers.mistral.models[1].maxTokens (128000 → 40000) to avoid Mistral context-window rejects.", + "Normalized models.providers.mistral.models[1].cost.cacheRead (0 → 0.05) for Mistral prompt-cache billing.", + ]); + }); + + it("normalizes old zero Mistral cacheRead costs while preserving custom costs", () => { + const res = normalizeCompatibilityConfigValues({ + models: { + providers: { + mistral: { + baseUrl: "https://api.mistral.ai/v1", + api: "openai-completions", + models: [ + { + id: "codestral-latest", + name: "Codestral", + reasoning: false, + input: ["text"], + cost: { input: 0.3, output: 0.9, cacheRead: 0, cacheWrite: 0 }, + contextWindow: 256000, + maxTokens: 32000, + }, + { + id: "mistral-medium-3-5", + name: "Mistral Medium 3.5 Custom", + reasoning: false, + input: ["text"], + cost: { input: 1.5, output: 7.5, cacheRead: 0.07, cacheWrite: 0 }, + contextWindow: 128000, + maxTokens: 32000, + }, + { + id: "custom-mistral-model", + name: "Custom Mistral", + reasoning: false, + input: ["text"], + cost: { input: 1, output: 2, cacheRead: 0, cacheWrite: 0 }, + contextWindow: 128000, + maxTokens: 32000, + }, + ], + }, + }, + }, + }); + + expect( + res.config.models?.providers?.mistral?.models?.map((model) => ({ + id: model.id, + cacheRead: model.cost.cacheRead, + })), + ).toEqual([ + { id: "codestral-latest", cacheRead: 0.03 }, + { id: "mistral-medium-3-5", cacheRead: 0.07 }, + { id: "custom-mistral-model", cacheRead: 0 }, + ]); + expect(res.changes).toEqual([ + "Normalized models.providers.mistral.models[0].cost.cacheRead (0 → 0.03) for Mistral prompt-cache billing.", ]); }); }); diff --git a/src/commands/doctor/shared/legacy-config-compatibility-base.ts b/src/commands/doctor/shared/legacy-config-compatibility-base.ts index a43b3327f96f..9afee24e0d24 100644 --- a/src/commands/doctor/shared/legacy-config-compatibility-base.ts +++ b/src/commands/doctor/shared/legacy-config-compatibility-base.ts @@ -3,7 +3,7 @@ import { normalizeLegacyBrowserConfig, normalizeLegacyCrossContextMessageConfig, normalizeLegacyMediaProviderOptions, - normalizeLegacyMistralModelMaxTokens, + normalizeLegacyMistralModelDefaults, normalizeLegacyOpenAIModelProviderApi, normalizeLegacyOllamaNativeNumCtxParams, normalizeLegacyRuntimeModelRefs, @@ -44,5 +44,5 @@ export function normalizeBaseCompatibilityConfigValues( next = normalizeLegacyCrossContextMessageConfig(next, changes); next = normalizeLegacyMediaProviderOptions(next, changes); next = normalizeLegacyOllamaNativeNumCtxParams(next, changes); - return normalizeLegacyMistralModelMaxTokens(next, changes); + return normalizeLegacyMistralModelDefaults(next, changes); } diff --git a/src/commands/doctor/shared/legacy-config-core-normalizers.ts b/src/commands/doctor/shared/legacy-config-core-normalizers.ts index 307f8a5ec4e6..1e165107be3b 100644 --- a/src/commands/doctor/shared/legacy-config-core-normalizers.ts +++ b/src/commands/doctor/shared/legacy-config-core-normalizers.ts @@ -1356,7 +1356,47 @@ export function normalizeLegacyOllamaNativeNumCtxParams( }; } -export function normalizeLegacyMistralModelMaxTokens( +const MISTRAL_MODEL_CACHE_READ_COST_BY_ID: Record = { + "codestral-latest": 0.03, + "devstral-medium-latest": 0.04, + "magistral-small": 0.05, + "mistral-large-latest": 0.05, + "mistral-medium-2508": 0.04, + "mistral-medium-3-5": 0.15, + "mistral-small-latest": 0.01, + "pixtral-large-latest": 0.2, +}; + +function normalizeLegacyMistralModelCost>(params: { + providerId: string; + model: T; + modelId: string; + index: number; + changes: string[]; +}): { model: T; changed: boolean } { + const cost = params.model.cost; + if (!isRecord(cost) || cost.cacheRead !== 0) { + return { model: params.model, changed: false }; + } + + const normalizedCacheRead = MISTRAL_MODEL_CACHE_READ_COST_BY_ID[params.modelId.toLowerCase()]; + if (normalizedCacheRead === undefined) { + return { model: params.model, changed: false }; + } + + params.changes.push( + `Normalized models.providers.${sanitizeForLog(params.providerId)}.models[${params.index}].cost.cacheRead (0 → ${normalizedCacheRead}) for Mistral prompt-cache billing.`, + ); + return { + model: { + ...params.model, + cost: { ...cost, cacheRead: normalizedCacheRead }, + }, + changed: true, + }; +} + +export function normalizeLegacyMistralModelDefaults( cfg: OpenClawConfig, changes: string[], ): OpenClawConfig { @@ -1382,6 +1422,12 @@ export function normalizeLegacyMistralModelMaxTokens( return model; } const modelId = normalizeOptionalString(model.id) ?? ""; + if (!modelId) { + return model; + } + + let nextModel = model; + let modelChanged = false; const contextWindow = typeof model.contextWindow === "number" && Number.isFinite(model.contextWindow) ? model.contextWindow @@ -1390,25 +1436,39 @@ export function normalizeLegacyMistralModelMaxTokens( typeof model.maxTokens === "number" && Number.isFinite(model.maxTokens) ? model.maxTokens : null; - if (!modelId || contextWindow === null || maxTokens === null) { - return model; + + if (contextWindow !== null && maxTokens !== null) { + const normalizedMaxTokens = resolveNormalizedProviderModelMaxTokens({ + providerId, + modelId, + contextWindow, + rawMaxTokens: maxTokens, + }); + if (normalizedMaxTokens !== maxTokens) { + nextModel = Object.assign({}, nextModel, { maxTokens: normalizedMaxTokens }); + modelChanged = true; + changes.push( + `Normalized models.providers.${providerId}.models[${index}].maxTokens (${maxTokens} → ${normalizedMaxTokens}) to avoid Mistral context-window rejects.`, + ); + } } - const normalizedMaxTokens = resolveNormalizedProviderModelMaxTokens({ + const costNormalization = normalizeLegacyMistralModelCost({ providerId, + model: nextModel, modelId, - contextWindow, - rawMaxTokens: maxTokens, + index, + changes, }); - if (normalizedMaxTokens === maxTokens) { - return model; + if (costNormalization.changed) { + nextModel = costNormalization.model; + modelChanged = true; } - modelsChanged = true; - changes.push( - `Normalized models.providers.${providerId}.models[${index}].maxTokens (${maxTokens} → ${normalizedMaxTokens}) to avoid Mistral context-window rejects.`, - ); - return Object.assign({}, model, { maxTokens: normalizedMaxTokens }); + if (modelChanged) { + modelsChanged = true; + } + return modelChanged ? nextModel : model; }); if (!modelsChanged) { diff --git a/src/llm/providers/openai-completions.test.ts b/src/llm/providers/openai-completions.test.ts index 83d58d596f7a..ec4e8d75f01d 100644 --- a/src/llm/providers/openai-completions.test.ts +++ b/src/llm/providers/openai-completions.test.ts @@ -149,6 +149,81 @@ describe("OpenAI-compatible completions params", () => { expect(result.stopReason).toBe("error"); expect(capturedStop).toEqual(["STOP"]); }); + + it("keeps prompt cache keys when long retention is disabled", async () => { + let capturedCacheKey: unknown; + let capturedRetention: unknown; + const stream = streamOpenAICompletions( + { + ...createModel(32_000), + compat: { + supportsPromptCacheKey: true, + supportsLongCacheRetention: false, + }, + }, + context, + { + apiKey: "sk-test", + sessionId: "session-123", + cacheRetention: "long", + onPayload(payload) { + capturedCacheKey = (payload as { prompt_cache_key?: unknown }).prompt_cache_key; + capturedRetention = (payload as { prompt_cache_retention?: unknown }) + .prompt_cache_retention; + throw new Error("stop before network"); + }, + }, + ); + + const result = await stream.result(); + + expect(result.stopReason).toBe("error"); + expect(capturedCacheKey).toBe("session-123"); + expect(capturedRetention).toBeUndefined(); + }); + + it("omits prompt cache retention when third-party models have not opted into cache keys", async () => { + let capturedCacheKey: unknown; + let capturedRetention: unknown; + const stream = streamOpenAICompletions(createModel(32_000), context, { + apiKey: "sk-test", + sessionId: "session-123", + cacheRetention: "long", + onPayload(payload) { + capturedCacheKey = (payload as { prompt_cache_key?: unknown }).prompt_cache_key; + capturedRetention = (payload as { prompt_cache_retention?: unknown }) + .prompt_cache_retention; + throw new Error("stop before network"); + }, + }); + + const result = await stream.result(); + + expect(result.stopReason).toBe("error"); + expect(capturedCacheKey).toBeUndefined(); + expect(capturedRetention).toBeUndefined(); + }); + + it("keeps OpenAI long retention even when no cache key is available", async () => { + let capturedCacheKey: unknown; + let capturedRetention: unknown; + const stream = streamOpenAICompletions(model, context, { + apiKey: "sk-test", + cacheRetention: "long", + onPayload(payload) { + capturedCacheKey = (payload as { prompt_cache_key?: unknown }).prompt_cache_key; + capturedRetention = (payload as { prompt_cache_retention?: unknown }) + .prompt_cache_retention; + throw new Error("stop before network"); + }, + }); + + const result = await stream.result(); + + expect(result.stopReason).toBe("error"); + expect(capturedCacheKey).toBeUndefined(); + expect(capturedRetention).toBe("24h"); + }); }); describe("openai-completions stop-reason tool-call guard", () => { diff --git a/src/llm/providers/openai-completions.ts b/src/llm/providers/openai-completions.ts index 64966c1432d6..a2e77c2f2f14 100644 --- a/src/llm/providers/openai-completions.ts +++ b/src/llm/providers/openai-completions.ts @@ -594,6 +594,8 @@ function buildParams( reasoning_effort?: string; stream_options?: { include_usage: boolean }; max_tokens?: number; + prompt_cache_key?: string; + prompt_cache_retention?: "24h"; tool_stream?: boolean; enable_thinking?: boolean; chat_template_kwargs?: { enable_thinking: boolean; preserve_thinking: boolean }; @@ -602,17 +604,21 @@ function buildParams( providerOptions?: unknown; }; + const supportsPromptCacheKey = + model.baseUrl.includes("api.openai.com") || compat.supportsPromptCacheKey; + const promptCacheKey = + supportsPromptCacheKey && cacheRetention !== "none" + ? clampOpenAIPromptCacheKey(options?.promptCacheKey ?? options?.sessionId) + : undefined; const params: ChatCompletionRequestParams = { model: model.id, messages, stream: true, - prompt_cache_key: - (model.baseUrl.includes("api.openai.com") && cacheRetention !== "none") || - (cacheRetention === "long" && compat.supportsLongCacheRetention) - ? clampOpenAIPromptCacheKey(options?.promptCacheKey ?? options?.sessionId) - : undefined, + prompt_cache_key: promptCacheKey, prompt_cache_retention: - cacheRetention === "long" && compat.supportsLongCacheRetention ? "24h" : undefined, + supportsPromptCacheKey && cacheRetention === "long" && compat.supportsLongCacheRetention + ? "24h" + : undefined, }; if (compat.supportsUsageInStreaming) { @@ -1266,6 +1272,7 @@ function detectCompat(model: Model<"openai-completions">): ResolvedOpenAIComplet supportsStrictMode: !isMoonshot && !isTogether && !isCloudflareAiGateway, cacheControlFormat, sendSessionAffinityHeaders: false, + supportsPromptCacheKey: false, supportsLongCacheRetention: !(isTogether || isCloudflareWorkersAI || isCloudflareAiGateway), }; } @@ -1303,6 +1310,7 @@ function getCompat(model: Model<"openai-completions">): ResolvedOpenAICompletion cacheControlFormat: model.compat.cacheControlFormat ?? detected.cacheControlFormat, sendSessionAffinityHeaders: model.compat.sendSessionAffinityHeaders ?? detected.sendSessionAffinityHeaders, + supportsPromptCacheKey: model.compat.supportsPromptCacheKey ?? detected.supportsPromptCacheKey, supportsLongCacheRetention: model.compat.supportsLongCacheRetention ?? detected.supportsLongCacheRetention, };