mirror of
https://github.com/openclaw/openclaw.git
synced 2026-06-06 05:51:15 +08:00
fix(mistral): enable prompt cache keys
Enable Mistral prompt cache keys without long-retention forwarding. Update cached-read pricing and doctor migration for existing Mistral provider config. Fixes #83709.
This commit is contained in:
@@ -105,7 +105,7 @@ Per-agent heartbeat is supported at `agents.list[].heartbeat`.
|
||||
|
||||
- Prompt caching is automatic on supported recent models. OpenClaw does not need to inject block-level cache markers.
|
||||
- OpenClaw uses `prompt_cache_key` to keep cache routing stable across turns. Direct OpenAI hosts use `prompt_cache_retention: "24h"` when `cacheRetention: "long"` is selected.
|
||||
- OpenAI-compatible Completions providers receive `prompt_cache_key` only when their model config explicitly sets `compat.supportsPromptCacheKey: true`; with that same opt-in, explicit `cacheRetention: "long"` also forwards `prompt_cache_retention: "24h"`, and `cacheRetention: "none"` suppresses both fields.
|
||||
- OpenAI-compatible Completions providers receive `prompt_cache_key` only when their model config explicitly sets `compat.supportsPromptCacheKey: true`. Long-retention forwarding is a separate capability: explicit `cacheRetention: "long"` sends `prompt_cache_retention: "24h"` only when that compat entry also supports long cache retention. Providers such as Mistral can opt into cache keys while setting `compat.supportsLongCacheRetention: false` to suppress the long-retention field. `cacheRetention: "none"` suppresses both fields.
|
||||
- OpenAI responses expose cached prompt tokens via `usage.prompt_tokens_details.cached_tokens` (or `input_tokens_details.cached_tokens` on Responses API events). OpenClaw maps that to `cacheRead`.
|
||||
- OpenAI does not expose a separate cache-write token counter, so `cacheWrite` stays `0` on OpenAI paths even when the provider is warming a cache.
|
||||
- OpenAI returns useful tracing and rate-limit headers such as `x-request-id`, `openai-processing-ms`, and `x-ratelimit-*`, but cache-hit accounting should come from the usage payload, not from headers.
|
||||
|
||||
@@ -12,6 +12,8 @@ import mistralPlugin from "./index.js";
|
||||
/**
 * Subset of a model's `compat` object that these tests inspect.
 * Every field is optional because a model may carry no compat data at all.
 */
type MistralCompatShape = Partial<{
  /** Name of the request field used to cap output tokens. */
  maxTokensField: "max_tokens" | "max_completion_tokens";
  /** Mapping applied to reasoning-effort values before they are sent. */
  reasoningEffortMap: { [effort: string]: string };
  /** Whether long prompt-cache retention may be forwarded. */
  supportsLongCacheRetention: boolean;
  /** Whether `prompt_cache_key` may be sent. */
  supportsPromptCacheKey: boolean;
  /** Whether reasoning effort may be sent. */
  supportsReasoningEffort: boolean;
  /** Whether the `store` request field is supported. */
  supportsStore: boolean;
}>;
|
||||
@@ -24,6 +26,14 @@ function supportsStore(model: unknown): boolean | undefined {
|
||||
return readCompat(model)?.supportsStore;
|
||||
}
|
||||
|
||||
/**
 * Reads the `supportsPromptCacheKey` compat flag from a model.
 *
 * @param model - Value handed to `readCompat` to extract its compat data.
 * @returns The flag, or `undefined` when `readCompat` yields nothing or the flag is unset.
 */
function supportsPromptCacheKey(model: unknown): boolean | undefined {
  const compat = readCompat(model);
  return compat?.supportsPromptCacheKey;
}
|
||||
|
||||
/**
 * Reads the `supportsLongCacheRetention` compat flag from a model.
 *
 * @param model - Value handed to `readCompat` to extract its compat data.
 * @returns The flag, or `undefined` when `readCompat` yields nothing or the flag is unset.
 */
function supportsLongCacheRetention(model: unknown): boolean | undefined {
  const compat = readCompat(model);
  return compat?.supportsLongCacheRetention;
}
|
||||
|
||||
/**
 * Reads the `supportsReasoningEffort` compat flag from a model.
 *
 * @param model - Value handed to `readCompat` to extract its compat data.
 * @returns The flag, or `undefined` when `readCompat` yields nothing or the flag is unset.
 */
function supportsReasoningEffort(model: unknown): boolean | undefined {
  const compat = readCompat(model);
  return compat?.supportsReasoningEffort;
}
|
||||
@@ -51,6 +61,8 @@ describe("resolveMistralCompatPatch", () => {
|
||||
it("enables reasoning_effort mapping for mistral-small-latest", () => {
|
||||
expect(resolveMistralCompatPatch({ id: MISTRAL_SMALL_LATEST_ID })).toEqual({
|
||||
supportsStore: false,
|
||||
supportsPromptCacheKey: true,
|
||||
supportsLongCacheRetention: false,
|
||||
supportsReasoningEffort: true,
|
||||
maxTokensField: "max_tokens",
|
||||
reasoningEffortMap: MISTRAL_REASONING_EFFORT_MAP,
|
||||
@@ -60,6 +72,8 @@ describe("resolveMistralCompatPatch", () => {
|
||||
it("enables reasoning_effort mapping for mistral-medium-3-5", () => {
|
||||
expect(resolveMistralCompatPatch({ id: MISTRAL_MEDIUM_3_5_ID })).toEqual({
|
||||
supportsStore: false,
|
||||
supportsPromptCacheKey: true,
|
||||
supportsLongCacheRetention: false,
|
||||
supportsReasoningEffort: true,
|
||||
maxTokensField: "max_tokens",
|
||||
reasoningEffortMap: MISTRAL_REASONING_EFFORT_MAP,
|
||||
@@ -78,6 +92,8 @@ describe("applyMistralModelCompat", () => {
|
||||
it("applies the Mistral request-shape compat flags", () => {
|
||||
const normalized = applyMistralModelCompat({});
|
||||
expect(supportsStore(normalized)).toBe(false);
|
||||
expect(supportsPromptCacheKey(normalized)).toBe(true);
|
||||
expect(supportsLongCacheRetention(normalized)).toBe(false);
|
||||
expect(supportsReasoningEffort(normalized)).toBe(false);
|
||||
expect(maxTokensField(normalized)).toBe("max_tokens");
|
||||
expect(reasoningEffortMap(normalized)).toBeUndefined();
|
||||
@@ -128,6 +144,8 @@ describe("applyMistralModelCompat", () => {
|
||||
const model = {
|
||||
compat: {
|
||||
supportsStore: false,
|
||||
supportsPromptCacheKey: true,
|
||||
supportsLongCacheRetention: false,
|
||||
supportsReasoningEffort: false,
|
||||
maxTokensField: "max_tokens" as const,
|
||||
},
|
||||
|
||||
@@ -14,9 +14,13 @@ const MISTRAL_MAX_TOKENS_FIELD = "max_tokens";
|
||||
|
||||
/**
 * Request-shape compat flags shared by every Mistral model: no `store`
 * support, prompt cache keys enabled, long cache retention disabled, and the
 * output cap sent through `max_tokens`.
 */
export const MISTRAL_MODEL_TRANSPORT_PATCH = {
  supportsStore: false,
  supportsPromptCacheKey: true,
  supportsLongCacheRetention: false,
  maxTokensField: MISTRAL_MAX_TOKENS_FIELD,
} as const satisfies Record<
  "supportsStore" | "supportsPromptCacheKey" | "supportsLongCacheRetention",
  boolean
> & { maxTokensField: "max_tokens" };
|
||||
|
||||
@@ -36,6 +40,8 @@ export const MISTRAL_MEDIUM_3_5_ID = "mistral-medium-3-5";
|
||||
|
||||
export function resolveMistralCompatPatch(model: { id?: string }): {
|
||||
supportsStore: boolean;
|
||||
supportsPromptCacheKey: boolean;
|
||||
supportsLongCacheRetention: boolean;
|
||||
supportsReasoningEffort: boolean;
|
||||
maxTokensField: "max_tokens";
|
||||
reasoningEffortMap?: Record<string, string>;
|
||||
@@ -56,6 +62,8 @@ function compatMatchesResolved(
|
||||
const expected = resolveMistralCompatPatch({ id: modelId });
|
||||
return (
|
||||
compat?.supportsStore === expected.supportsStore &&
|
||||
compat?.supportsPromptCacheKey === expected.supportsPromptCacheKey &&
|
||||
compat?.supportsLongCacheRetention === expected.supportsLongCacheRetention &&
|
||||
compat?.supportsReasoningEffort === expected.supportsReasoningEffort &&
|
||||
compat?.maxTokensField === expected.maxTokensField &&
|
||||
compat?.reasoningEffortMap === expected.reasoningEffortMap
|
||||
|
||||
@@ -27,11 +27,29 @@ describe("mistral model definitions", () => {
|
||||
expect(MISTRAL_DEFAULT_COST).toEqual({
|
||||
input: 0.5,
|
||||
output: 1.5,
|
||||
cacheRead: 0,
|
||||
cacheRead: 0.05,
|
||||
cacheWrite: 0,
|
||||
});
|
||||
});
|
||||
|
||||
it("prices cached Mistral input tokens at ten percent of standard input tokens", () => {
|
||||
const models = buildMistralCatalogModels();
|
||||
|
||||
for (const model of models) {
|
||||
expect(model.cost.cacheRead).toBeCloseTo(model.cost.input * 0.1, 10);
|
||||
expect(model.cost.cacheWrite).toBe(0);
|
||||
}
|
||||
});
|
||||
|
||||
it("charges nonzero cost for cached-token usage on the default model", () => {
|
||||
const model = buildMistralModelDefinition();
|
||||
const cacheReadTokens = 20_000;
|
||||
const cacheReadCost = (model.cost.cacheRead / 1_000_000) * cacheReadTokens;
|
||||
|
||||
expect(cacheReadCost).toBeCloseTo(0.001, 10);
|
||||
expect(cacheReadCost).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
it("publishes a curated set of current Mistral catalog models", () => {
|
||||
const models = buildMistralCatalogModels();
|
||||
const codestral = catalogModelById(models, "codestral-latest");
|
||||
|
||||
@@ -33,7 +33,7 @@
|
||||
"cost": {
|
||||
"input": 0.3,
|
||||
"output": 0.9,
|
||||
"cacheRead": 0,
|
||||
"cacheRead": 0.03,
|
||||
"cacheWrite": 0
|
||||
}
|
||||
},
|
||||
@@ -46,7 +46,7 @@
|
||||
"cost": {
|
||||
"input": 0.4,
|
||||
"output": 2,
|
||||
"cacheRead": 0,
|
||||
"cacheRead": 0.04,
|
||||
"cacheWrite": 0
|
||||
}
|
||||
},
|
||||
@@ -60,7 +60,7 @@
|
||||
"cost": {
|
||||
"input": 0.5,
|
||||
"output": 1.5,
|
||||
"cacheRead": 0,
|
||||
"cacheRead": 0.05,
|
||||
"cacheWrite": 0
|
||||
}
|
||||
},
|
||||
@@ -73,7 +73,7 @@
|
||||
"cost": {
|
||||
"input": 0.5,
|
||||
"output": 1.5,
|
||||
"cacheRead": 0,
|
||||
"cacheRead": 0.05,
|
||||
"cacheWrite": 0
|
||||
}
|
||||
},
|
||||
@@ -86,7 +86,7 @@
|
||||
"cost": {
|
||||
"input": 0.4,
|
||||
"output": 2,
|
||||
"cacheRead": 0,
|
||||
"cacheRead": 0.04,
|
||||
"cacheWrite": 0
|
||||
}
|
||||
},
|
||||
@@ -100,7 +100,7 @@
|
||||
"cost": {
|
||||
"input": 1.5,
|
||||
"output": 7.5,
|
||||
"cacheRead": 0,
|
||||
"cacheRead": 0.15,
|
||||
"cacheWrite": 0
|
||||
}
|
||||
},
|
||||
@@ -114,7 +114,7 @@
|
||||
"cost": {
|
||||
"input": 0.1,
|
||||
"output": 0.3,
|
||||
"cacheRead": 0,
|
||||
"cacheRead": 0.01,
|
||||
"cacheWrite": 0
|
||||
}
|
||||
},
|
||||
@@ -127,7 +127,7 @@
|
||||
"cost": {
|
||||
"input": 2,
|
||||
"output": 6,
|
||||
"cacheRead": 0,
|
||||
"cacheRead": 0.2,
|
||||
"cacheWrite": 0
|
||||
}
|
||||
}
|
||||
|
||||
@@ -402,6 +402,8 @@ export interface OpenAICompletionsCompat {
|
||||
cacheControlFormat?: "anthropic";
|
||||
/** Whether to send known session-affinity headers (`session_id`, `x-client-request-id`, `x-session-affinity`) from `options.sessionId` when caching is enabled. Default: false. */
|
||||
sendSessionAffinityHeaders?: boolean;
|
||||
/** Whether the provider supports OpenAI-style `prompt_cache_key`. Default: false for third-party completions providers. */
|
||||
supportsPromptCacheKey?: boolean;
|
||||
/** Whether the provider supports long prompt cache retention (`prompt_cache_retention: "24h"` or Anthropic-style `cache_control.ttl: "1h"`, depending on format). Default: true. */
|
||||
supportsLongCacheRetention?: boolean;
|
||||
}
|
||||
|
||||
@@ -5700,6 +5700,41 @@ describe("openai transport stream", () => {
|
||||
expect(defaultRetention).not.toHaveProperty("prompt_cache_retention");
|
||||
});
|
||||
|
||||
it("keeps Mistral prompt cache keys without unsupported long retention", () => {
|
||||
const model = {
|
||||
id: "mistral-large-latest",
|
||||
name: "Mistral Large",
|
||||
api: "openai-completions",
|
||||
provider: "mistral",
|
||||
baseUrl: "https://api.mistral.ai/v1",
|
||||
compat: {
|
||||
supportsPromptCacheKey: true,
|
||||
supportsLongCacheRetention: false,
|
||||
supportsStore: false,
|
||||
supportsReasoningEffort: false,
|
||||
maxTokensField: "max_tokens",
|
||||
},
|
||||
reasoning: false,
|
||||
input: ["text"],
|
||||
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
||||
contextWindow: 32768,
|
||||
maxTokens: 8192,
|
||||
} as unknown as Model<"openai-completions">;
|
||||
const context = {
|
||||
systemPrompt: "system",
|
||||
messages: [],
|
||||
tools: [],
|
||||
} as never;
|
||||
|
||||
const params = buildOpenAICompletionsParams(model, context, {
|
||||
sessionId: "session-123",
|
||||
cacheRetention: "long",
|
||||
}) as { prompt_cache_key?: string; prompt_cache_retention?: string };
|
||||
|
||||
expect(params.prompt_cache_key).toBe("session-123");
|
||||
expect(params).not.toHaveProperty("prompt_cache_retention");
|
||||
});
|
||||
|
||||
it("sorts Chat Completions tools by function name for stable prompt-cache payloads", () => {
|
||||
const model = {
|
||||
id: "custom-model",
|
||||
|
||||
@@ -3386,6 +3386,7 @@ function getCompat(model: OpenAIModeModel): {
|
||||
vercelGatewayRouting: Record<string, unknown>;
|
||||
supportsStrictMode: boolean;
|
||||
supportsPromptCacheKey: boolean;
|
||||
supportsLongCacheRetention: boolean;
|
||||
requiresStringContent: boolean;
|
||||
strictMessageKeys: boolean;
|
||||
visibleReasoningDetailTypes: string[];
|
||||
@@ -3418,6 +3419,7 @@ function getCompat(model: OpenAIModeModel): {
|
||||
detected.vercelGatewayRouting,
|
||||
supportsStrictMode: compat.supportsStrictMode ?? detected.supportsStrictMode,
|
||||
supportsPromptCacheKey: compat.supportsPromptCacheKey === true,
|
||||
supportsLongCacheRetention: compat.supportsLongCacheRetention !== false,
|
||||
requiresStringContent: compat.requiresStringContent ?? false,
|
||||
strictMessageKeys: compat.strictMessageKeys === true,
|
||||
visibleReasoningDetailTypes:
|
||||
@@ -4072,7 +4074,7 @@ export function buildOpenAICompletionsParams(
|
||||
// OpenAI, etc.) can honor the 24h prefix-cache lifetime. Without this
|
||||
// the key reaches the wire but the retention preference is silently
|
||||
// dropped (issue #81281).
|
||||
if (cacheRetention === "long") {
|
||||
if (cacheRetention === "long" && compat.supportsLongCacheRetention) {
|
||||
params.prompt_cache_retention = "24h";
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1648,14 +1648,73 @@ describe("normalizeCompatibilityConfigValues", () => {
|
||||
res.config.models?.providers?.mistral?.models?.map((model) => ({
|
||||
id: model.id,
|
||||
maxTokens: model.maxTokens,
|
||||
cacheRead: model.cost.cacheRead,
|
||||
})),
|
||||
).toEqual([
|
||||
{ id: "mistral-large-latest", maxTokens: 16384 },
|
||||
{ id: "magistral-small", maxTokens: 40000 },
|
||||
{ id: "mistral-large-latest", maxTokens: 16384, cacheRead: 0.05 },
|
||||
{ id: "magistral-small", maxTokens: 40000, cacheRead: 0.05 },
|
||||
]);
|
||||
expect(res.changes).toEqual([
|
||||
"Normalized models.providers.mistral.models[0].maxTokens (262144 → 16384) to avoid Mistral context-window rejects.",
|
||||
"Normalized models.providers.mistral.models[0].cost.cacheRead (0 → 0.05) for Mistral prompt-cache billing.",
|
||||
"Normalized models.providers.mistral.models[1].maxTokens (128000 → 40000) to avoid Mistral context-window rejects.",
|
||||
"Normalized models.providers.mistral.models[1].cost.cacheRead (0 → 0.05) for Mistral prompt-cache billing.",
|
||||
]);
|
||||
});
|
||||
|
||||
it("normalizes old zero Mistral cacheRead costs while preserving custom costs", () => {
|
||||
const res = normalizeCompatibilityConfigValues({
|
||||
models: {
|
||||
providers: {
|
||||
mistral: {
|
||||
baseUrl: "https://api.mistral.ai/v1",
|
||||
api: "openai-completions",
|
||||
models: [
|
||||
{
|
||||
id: "codestral-latest",
|
||||
name: "Codestral",
|
||||
reasoning: false,
|
||||
input: ["text"],
|
||||
cost: { input: 0.3, output: 0.9, cacheRead: 0, cacheWrite: 0 },
|
||||
contextWindow: 256000,
|
||||
maxTokens: 32000,
|
||||
},
|
||||
{
|
||||
id: "mistral-medium-3-5",
|
||||
name: "Mistral Medium 3.5 Custom",
|
||||
reasoning: false,
|
||||
input: ["text"],
|
||||
cost: { input: 1.5, output: 7.5, cacheRead: 0.07, cacheWrite: 0 },
|
||||
contextWindow: 128000,
|
||||
maxTokens: 32000,
|
||||
},
|
||||
{
|
||||
id: "custom-mistral-model",
|
||||
name: "Custom Mistral",
|
||||
reasoning: false,
|
||||
input: ["text"],
|
||||
cost: { input: 1, output: 2, cacheRead: 0, cacheWrite: 0 },
|
||||
contextWindow: 128000,
|
||||
maxTokens: 32000,
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
expect(
|
||||
res.config.models?.providers?.mistral?.models?.map((model) => ({
|
||||
id: model.id,
|
||||
cacheRead: model.cost.cacheRead,
|
||||
})),
|
||||
).toEqual([
|
||||
{ id: "codestral-latest", cacheRead: 0.03 },
|
||||
{ id: "mistral-medium-3-5", cacheRead: 0.07 },
|
||||
{ id: "custom-mistral-model", cacheRead: 0 },
|
||||
]);
|
||||
expect(res.changes).toEqual([
|
||||
"Normalized models.providers.mistral.models[0].cost.cacheRead (0 → 0.03) for Mistral prompt-cache billing.",
|
||||
]);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -3,7 +3,7 @@ import {
|
||||
normalizeLegacyBrowserConfig,
|
||||
normalizeLegacyCrossContextMessageConfig,
|
||||
normalizeLegacyMediaProviderOptions,
|
||||
normalizeLegacyMistralModelMaxTokens,
|
||||
normalizeLegacyMistralModelDefaults,
|
||||
normalizeLegacyOpenAIModelProviderApi,
|
||||
normalizeLegacyOllamaNativeNumCtxParams,
|
||||
normalizeLegacyRuntimeModelRefs,
|
||||
@@ -44,5 +44,5 @@ export function normalizeBaseCompatibilityConfigValues(
|
||||
next = normalizeLegacyCrossContextMessageConfig(next, changes);
|
||||
next = normalizeLegacyMediaProviderOptions(next, changes);
|
||||
next = normalizeLegacyOllamaNativeNumCtxParams(next, changes);
|
||||
return normalizeLegacyMistralModelMaxTokens(next, changes);
|
||||
return normalizeLegacyMistralModelDefaults(next, changes);
|
||||
}
|
||||
|
||||
@@ -1356,7 +1356,47 @@ export function normalizeLegacyOllamaNativeNumCtxParams(
|
||||
};
|
||||
}
|
||||
|
||||
export function normalizeLegacyMistralModelMaxTokens(
|
||||
/**
 * Cached-read price per known Mistral model id (lower-case), in the same
 * per-million-token unit as the other `cost` fields.
 * NOTE(review): values appear to be 10% of each model's catalog input price —
 * keep in sync with the bundled Mistral catalog.
 */
const MISTRAL_MODEL_CACHE_READ_COST_BY_ID: Record<string, number> = {
  "codestral-latest": 0.03,
  "devstral-medium-latest": 0.04,
  "magistral-small": 0.05,
  "mistral-large-latest": 0.05,
  "mistral-medium-2508": 0.04,
  "mistral-medium-3-5": 0.15,
  "mistral-small-latest": 0.01,
  "pixtral-large-latest": 0.2,
};

/**
 * Upgrades a legacy `cost.cacheRead` of exactly `0` on a known Mistral model
 * to the price listed in `MISTRAL_MODEL_CACHE_READ_COST_BY_ID`.
 *
 * The model is returned untouched when it has no `cost` record, when
 * `cacheRead` is anything other than `0` (custom prices are preserved), or
 * when the model id is not in the price table.
 *
 * @param params.providerId - Provider key, used only in the change-log message.
 * @param params.model - Model config entry to inspect; never mutated.
 * @param params.modelId - Model id; matched case-insensitively against the table.
 * @param params.index - Position of the model in the provider's list, for the log message.
 * @param params.changes - Change log; one line is appended when a price is normalized.
 * @returns The original model with `changed: false`, or a shallow copy carrying
 *   the new `cost.cacheRead` with `changed: true`.
 */
function normalizeLegacyMistralModelCost<T extends Record<string, unknown>>(params: {
  providerId: string;
  model: T;
  modelId: string;
  index: number;
  changes: string[];
}): { model: T; changed: boolean } {
  const cost = params.model.cost;
  if (!isRecord(cost) || cost.cacheRead !== 0) {
    return { model: params.model, changed: false };
  }

  // The table is a plain object literal, so an id such as "constructor" or
  // "__proto__" would otherwise resolve to an inherited Object.prototype
  // member and be written into the config as a "price". Only own entries count.
  const tableKey = params.modelId.toLowerCase();
  const normalizedCacheRead = Object.prototype.hasOwnProperty.call(
    MISTRAL_MODEL_CACHE_READ_COST_BY_ID,
    tableKey,
  )
    ? MISTRAL_MODEL_CACHE_READ_COST_BY_ID[tableKey]
    : undefined;
  if (normalizedCacheRead === undefined) {
    return { model: params.model, changed: false };
  }

  params.changes.push(
    `Normalized models.providers.${sanitizeForLog(params.providerId)}.models[${params.index}].cost.cacheRead (0 → ${normalizedCacheRead}) for Mistral prompt-cache billing.`,
  );
  return {
    model: {
      ...params.model,
      cost: { ...cost, cacheRead: normalizedCacheRead },
    },
    changed: true,
  };
}
|
||||
|
||||
export function normalizeLegacyMistralModelDefaults(
|
||||
cfg: OpenClawConfig,
|
||||
changes: string[],
|
||||
): OpenClawConfig {
|
||||
@@ -1382,6 +1422,12 @@ export function normalizeLegacyMistralModelMaxTokens(
|
||||
return model;
|
||||
}
|
||||
const modelId = normalizeOptionalString(model.id) ?? "";
|
||||
if (!modelId) {
|
||||
return model;
|
||||
}
|
||||
|
||||
let nextModel = model;
|
||||
let modelChanged = false;
|
||||
const contextWindow =
|
||||
typeof model.contextWindow === "number" && Number.isFinite(model.contextWindow)
|
||||
? model.contextWindow
|
||||
@@ -1390,25 +1436,39 @@ export function normalizeLegacyMistralModelMaxTokens(
|
||||
typeof model.maxTokens === "number" && Number.isFinite(model.maxTokens)
|
||||
? model.maxTokens
|
||||
: null;
|
||||
if (!modelId || contextWindow === null || maxTokens === null) {
|
||||
return model;
|
||||
|
||||
if (contextWindow !== null && maxTokens !== null) {
|
||||
const normalizedMaxTokens = resolveNormalizedProviderModelMaxTokens({
|
||||
providerId,
|
||||
modelId,
|
||||
contextWindow,
|
||||
rawMaxTokens: maxTokens,
|
||||
});
|
||||
if (normalizedMaxTokens !== maxTokens) {
|
||||
nextModel = Object.assign({}, nextModel, { maxTokens: normalizedMaxTokens });
|
||||
modelChanged = true;
|
||||
changes.push(
|
||||
`Normalized models.providers.${providerId}.models[${index}].maxTokens (${maxTokens} → ${normalizedMaxTokens}) to avoid Mistral context-window rejects.`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
const normalizedMaxTokens = resolveNormalizedProviderModelMaxTokens({
|
||||
const costNormalization = normalizeLegacyMistralModelCost({
|
||||
providerId,
|
||||
model: nextModel,
|
||||
modelId,
|
||||
contextWindow,
|
||||
rawMaxTokens: maxTokens,
|
||||
index,
|
||||
changes,
|
||||
});
|
||||
if (normalizedMaxTokens === maxTokens) {
|
||||
return model;
|
||||
if (costNormalization.changed) {
|
||||
nextModel = costNormalization.model;
|
||||
modelChanged = true;
|
||||
}
|
||||
|
||||
modelsChanged = true;
|
||||
changes.push(
|
||||
`Normalized models.providers.${providerId}.models[${index}].maxTokens (${maxTokens} → ${normalizedMaxTokens}) to avoid Mistral context-window rejects.`,
|
||||
);
|
||||
return Object.assign({}, model, { maxTokens: normalizedMaxTokens });
|
||||
if (modelChanged) {
|
||||
modelsChanged = true;
|
||||
}
|
||||
return modelChanged ? nextModel : model;
|
||||
});
|
||||
|
||||
if (!modelsChanged) {
|
||||
|
||||
@@ -149,6 +149,81 @@ describe("OpenAI-compatible completions params", () => {
|
||||
expect(result.stopReason).toBe("error");
|
||||
expect(capturedStop).toEqual(["STOP"]);
|
||||
});
|
||||
|
||||
it("keeps prompt cache keys when long retention is disabled", async () => {
|
||||
let capturedCacheKey: unknown;
|
||||
let capturedRetention: unknown;
|
||||
const stream = streamOpenAICompletions(
|
||||
{
|
||||
...createModel(32_000),
|
||||
compat: {
|
||||
supportsPromptCacheKey: true,
|
||||
supportsLongCacheRetention: false,
|
||||
},
|
||||
},
|
||||
context,
|
||||
{
|
||||
apiKey: "sk-test",
|
||||
sessionId: "session-123",
|
||||
cacheRetention: "long",
|
||||
onPayload(payload) {
|
||||
capturedCacheKey = (payload as { prompt_cache_key?: unknown }).prompt_cache_key;
|
||||
capturedRetention = (payload as { prompt_cache_retention?: unknown })
|
||||
.prompt_cache_retention;
|
||||
throw new Error("stop before network");
|
||||
},
|
||||
},
|
||||
);
|
||||
|
||||
const result = await stream.result();
|
||||
|
||||
expect(result.stopReason).toBe("error");
|
||||
expect(capturedCacheKey).toBe("session-123");
|
||||
expect(capturedRetention).toBeUndefined();
|
||||
});
|
||||
|
||||
it("omits prompt cache retention when third-party models have not opted into cache keys", async () => {
|
||||
let capturedCacheKey: unknown;
|
||||
let capturedRetention: unknown;
|
||||
const stream = streamOpenAICompletions(createModel(32_000), context, {
|
||||
apiKey: "sk-test",
|
||||
sessionId: "session-123",
|
||||
cacheRetention: "long",
|
||||
onPayload(payload) {
|
||||
capturedCacheKey = (payload as { prompt_cache_key?: unknown }).prompt_cache_key;
|
||||
capturedRetention = (payload as { prompt_cache_retention?: unknown })
|
||||
.prompt_cache_retention;
|
||||
throw new Error("stop before network");
|
||||
},
|
||||
});
|
||||
|
||||
const result = await stream.result();
|
||||
|
||||
expect(result.stopReason).toBe("error");
|
||||
expect(capturedCacheKey).toBeUndefined();
|
||||
expect(capturedRetention).toBeUndefined();
|
||||
});
|
||||
|
||||
it("keeps OpenAI long retention even when no cache key is available", async () => {
|
||||
let capturedCacheKey: unknown;
|
||||
let capturedRetention: unknown;
|
||||
const stream = streamOpenAICompletions(model, context, {
|
||||
apiKey: "sk-test",
|
||||
cacheRetention: "long",
|
||||
onPayload(payload) {
|
||||
capturedCacheKey = (payload as { prompt_cache_key?: unknown }).prompt_cache_key;
|
||||
capturedRetention = (payload as { prompt_cache_retention?: unknown })
|
||||
.prompt_cache_retention;
|
||||
throw new Error("stop before network");
|
||||
},
|
||||
});
|
||||
|
||||
const result = await stream.result();
|
||||
|
||||
expect(result.stopReason).toBe("error");
|
||||
expect(capturedCacheKey).toBeUndefined();
|
||||
expect(capturedRetention).toBe("24h");
|
||||
});
|
||||
});
|
||||
|
||||
describe("openai-completions stop-reason tool-call guard", () => {
|
||||
|
||||
@@ -594,6 +594,8 @@ function buildParams(
|
||||
reasoning_effort?: string;
|
||||
stream_options?: { include_usage: boolean };
|
||||
max_tokens?: number;
|
||||
prompt_cache_key?: string;
|
||||
prompt_cache_retention?: "24h";
|
||||
tool_stream?: boolean;
|
||||
enable_thinking?: boolean;
|
||||
chat_template_kwargs?: { enable_thinking: boolean; preserve_thinking: boolean };
|
||||
@@ -602,17 +604,21 @@ function buildParams(
|
||||
providerOptions?: unknown;
|
||||
};
|
||||
|
||||
const supportsPromptCacheKey =
|
||||
model.baseUrl.includes("api.openai.com") || compat.supportsPromptCacheKey;
|
||||
const promptCacheKey =
|
||||
supportsPromptCacheKey && cacheRetention !== "none"
|
||||
? clampOpenAIPromptCacheKey(options?.promptCacheKey ?? options?.sessionId)
|
||||
: undefined;
|
||||
const params: ChatCompletionRequestParams = {
|
||||
model: model.id,
|
||||
messages,
|
||||
stream: true,
|
||||
prompt_cache_key:
|
||||
(model.baseUrl.includes("api.openai.com") && cacheRetention !== "none") ||
|
||||
(cacheRetention === "long" && compat.supportsLongCacheRetention)
|
||||
? clampOpenAIPromptCacheKey(options?.promptCacheKey ?? options?.sessionId)
|
||||
: undefined,
|
||||
prompt_cache_key: promptCacheKey,
|
||||
prompt_cache_retention:
|
||||
cacheRetention === "long" && compat.supportsLongCacheRetention ? "24h" : undefined,
|
||||
supportsPromptCacheKey && cacheRetention === "long" && compat.supportsLongCacheRetention
|
||||
? "24h"
|
||||
: undefined,
|
||||
};
|
||||
|
||||
if (compat.supportsUsageInStreaming) {
|
||||
@@ -1266,6 +1272,7 @@ function detectCompat(model: Model<"openai-completions">): ResolvedOpenAIComplet
|
||||
supportsStrictMode: !isMoonshot && !isTogether && !isCloudflareAiGateway,
|
||||
cacheControlFormat,
|
||||
sendSessionAffinityHeaders: false,
|
||||
supportsPromptCacheKey: false,
|
||||
supportsLongCacheRetention: !(isTogether || isCloudflareWorkersAI || isCloudflareAiGateway),
|
||||
};
|
||||
}
|
||||
@@ -1303,6 +1310,7 @@ function getCompat(model: Model<"openai-completions">): ResolvedOpenAICompletion
|
||||
cacheControlFormat: model.compat.cacheControlFormat ?? detected.cacheControlFormat,
|
||||
sendSessionAffinityHeaders:
|
||||
model.compat.sendSessionAffinityHeaders ?? detected.sendSessionAffinityHeaders,
|
||||
supportsPromptCacheKey: model.compat.supportsPromptCacheKey ?? detected.supportsPromptCacheKey,
|
||||
supportsLongCacheRetention:
|
||||
model.compat.supportsLongCacheRetention ?? detected.supportsLongCacheRetention,
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user