Adapt image compression quality by model (#85742)

* feat: adapt image compression quality

* refactor: move image limits into model metadata

* test: cover adaptive image downscaling

* test: cover image tool live providers

* fix: apply media metadata to all image paths

* fix: align providerless image compression

* fix: add chutes runtime image limits

* fix: optimize image data urls with model limits

* fix: type media metadata merge

* fix: optimize data url byte limits after decode

* fix: preserve data url optimizer fallback

* fix: keep low-side image compression fallbacks

* fix: enforce data url image compression policy

* fix: preserve gif data url media policy

* fix: satisfy adaptive image type checks

* test: keep cron provider-runtime mock current
This commit is contained in:
Peter Steinberger
2026-05-23 21:45:55 +01:00
committed by GitHub
parent 00388134c4
commit 4c210e22fa
32 changed files with 1750 additions and 53 deletions

View File

@@ -8,6 +8,7 @@ Docs: https://docs.openclaw.ai
- Gateway/perf: lazy-load startup-idle plugin work, core gateway method handlers, and the embedded ACPX runtime so Gateway health and ready signals no longer wait on unused handler trees or ACPX probes.
- Gateway/perf: cache plugin SDK public-surface alias maps and skip irrelevant macOS Linuxbrew PATH probes so Gateway startup avoids repeated filesystem walks and slow missing-directory stats.
- Image tool: add adaptive model-aware image compression with an `agents.defaults.imageQuality` preference for choosing token-efficient, balanced, or high-detail media handling.
- Meeting Notes: add a source-only external meeting-notes plugin and SDK source-provider contract outside the core npm package, with auto-start capture config, manual transcript imports, read-only `openclaw meeting-notes` CLI access, and Discord voice as the first live source.
- Docs/channels/config: add Signal `configPath`, Telegram wildcard topic defaults, local-time backup archive names, Termux home fallback, include-path validation, secret-scanner-safe placeholder guidance, Gemini CLI/Antigravity media guidance, and macOS VM auto-login guidance. Thanks @NorseGaud, @yudistiraashadi, @huangqian8, @VibhorGautam, @maweibin, @tianxingleo, @IgnacioPro, and @xzcxzcyy-claw.
- Docs: clarify model-usage portability, Codex migration prerequisites, status bootstrap wording, thread-bound subagent limits, hook ownership, and config-preserving safety guidance. Thanks @aniruddhaadak80, @leno23, @TomDjerry, @matthewxmurphy, @vincentkoc, and @stablegenius49.

View File

@@ -327,6 +327,26 @@ Higher values preserve more visual detail.
}
```
### `agents.defaults.imageQuality`
Image-tool compression/detail preference for images loaded from file paths, URLs, data URLs, and media references.
Default: `auto`.
OpenClaw adapts the resize ladder to the selected image model. For example, Claude Opus 4.7, OpenAI GPT-5.5, Qwen VL, and hosted Llama 4 vision models can use larger images than older/default high-detail vision paths, while multi-image turns are compressed more aggressively in `auto` mode to control token and latency cost.
Values:
- `auto`: adapt to model limits and image count.
- `efficient`: prefer smaller images for lower token and byte usage.
- `balanced`: use the standard middle-ground ladder.
- `high`: preserve more detail for screenshots, diagrams, and document images.
```json5
{
agents: { defaults: { imageQuality: "auto" } },
}
```
### `agents.defaults.userTimezone`
Timezone for system prompt context (not message timestamps). Falls back to host timezone.

View File

@@ -10,6 +10,17 @@ const CLAUDE_CLI_MODEL_LABELS: Record<string, string> = {
"claude-sonnet-4-6": "Claude Sonnet 4.6 (Claude CLI)",
};
/**
 * Builds the image media-input metadata for a Claude CLI catalog entry.
 *
 * The model id `claude-opus-4-7` gets a 2576px side limit; every other id
 * gets 1568px. The preferred side always equals the maximum side, and token
 * accounting is marked as `"provider"`.
 *
 * @param id Claude CLI model id.
 * @returns Media-input metadata containing only an `image` section.
 */
function resolveClaudeCliImageMediaInput(id: string): ModelCatalogEntry["mediaInput"] {
  let sideLimitPx = 1568;
  if (id === "claude-opus-4-7") {
    sideLimitPx = 2576;
  }
  return {
    image: { maxSidePx: sideLimitPx, preferredSidePx: sideLimitPx, tokenMode: "provider" },
  };
}
function extractClaudeCliModelIds(): string[] {
const ids: string[] = [];
const seen = new Set<string>();
@@ -34,6 +45,7 @@ export function buildClaudeCliCatalogEntries(): ModelCatalogEntry[] {
provider: CLAUDE_CLI_BACKEND_ID,
reasoning: true,
input: ["text", "image"],
mediaInput: resolveClaudeCliImageMediaInput(id),
contextWindow: CLAUDE_CLI_DEFAULT_CONTEXT_WINDOW,
}));
}

View File

@@ -541,6 +541,34 @@ describe("anthropic provider replay hooks", () => {
} as never);
expect(normalized?.input).toEqual(["text", "image"]);
expect(normalized?.mediaInput).toEqual({
image: { maxSidePx: 1568, preferredSidePx: 1568, tokenMode: "provider" },
});
});
it("merges partial Claude image media metadata with provider limits", async () => {
const provider = await registerSingleProviderPlugin(anthropicPlugin);
const normalized = provider.normalizeResolvedModel?.({
provider: "anthropic",
modelId: "claude-opus-4-7",
model: {
id: "claude-opus-4-7",
name: "Claude Opus 4.7",
provider: "anthropic",
api: "anthropic-messages",
reasoning: true,
input: ["text", "image"],
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
contextWindow: 200_000,
maxTokens: 64_000,
mediaInput: { image: { maxBytes: 1 } },
},
} as never);
expect(normalized?.mediaInput).toEqual({
image: { maxBytes: 1, maxSidePx: 2576, preferredSidePx: 2576, tokenMode: "provider" },
});
});
it("normalizes GA 1M Claude variants to 1M context", async () => {
@@ -577,6 +605,29 @@ describe("anthropic provider replay hooks", () => {
}
});
it("does not normalize legacy Claude 4.5 models to 1M context", async () => {
const provider = await registerSingleProviderPlugin(anthropicPlugin);
const normalized = provider.normalizeResolvedModel?.({
provider: "anthropic",
modelId: "claude-sonnet-4-5",
model: {
id: "claude-sonnet-4-5",
name: "Claude Sonnet 4.5",
provider: "anthropic",
api: "anthropic-messages",
reasoning: true,
input: ["text", "image"],
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
contextWindow: 200_000,
contextTokens: 200_000,
maxTokens: 32_000,
},
} as never);
expect(normalized).toBeUndefined();
});
it("resolves claude-cli synthetic oauth auth", async () => {
readClaudeCliCredentialsForRuntimeMock.mockReset();
readClaudeCliCredentialsForRuntimeMock.mockReturnValue({

View File

@@ -381,6 +381,25 @@ function supportsAnthropicImageInput(modelId: string, modelName?: string): boole
.some((candidate) => matchesAnthropicModernModel(candidate));
}
/**
 * Resolves image media-input limits for an Anthropic model.
 *
 * Returns `undefined` when `supportsAnthropicImageInput` rejects the model.
 * Otherwise the side limit is 2576px when the lowercased model id or name
 * starts with one of the Opus 4.7 id prefixes, and 1568px for everything else.
 *
 * @param modelId Resolved model id.
 * @param modelName Optional display name, checked alongside the id.
 * @returns Image limits with `tokenMode: "provider"`, or `undefined`.
 */
function resolveAnthropicImageMediaInput(modelId: string, modelName?: string) {
  if (!supportsAnthropicImageInput(modelId, modelName)) {
    return undefined;
  }
  const opus47Prefixes = [ANTHROPIC_OPUS_47_MODEL_ID, ANTHROPIC_OPUS_47_DOT_MODEL_ID];
  const normalizedRefs = [modelId, modelName]
    .filter((value): value is string => typeof value === "string")
    .map((value) => normalizeLowercaseStringOrEmpty(value));
  const isOpus47 = normalizedRefs.some((ref) =>
    opus47Prefixes.some((prefix) => ref.startsWith(prefix)),
  );
  const sideLimitPx = isOpus47 ? 2576 : 1568;
  return {
    image: {
      maxSidePx: sideLimitPx,
      preferredSidePx: sideLimitPx,
      tokenMode: "provider" as const,
    },
  };
}
function applyAnthropicImageInputCapability(params: {
modelId: string;
model: ProviderRuntimeModel;
@@ -401,13 +420,27 @@ function normalizeAnthropicResolvedModel(
ctx: ProviderNormalizeResolvedModelContext,
): ProviderRuntimeModel | undefined {
const imageCapableModel = applyAnthropicImageInputCapability(ctx) ?? ctx.model;
const mediaInput = resolveAnthropicImageMediaInput(ctx.modelId, imageCapableModel.name);
const mediaInputModel = mediaInput
? {
...imageCapableModel,
mediaInput: {
...mediaInput,
...imageCapableModel.mediaInput,
image: {
...mediaInput.image,
...imageCapableModel.mediaInput?.image,
},
},
}
: imageCapableModel;
const contextWindowModel =
applyAnthropicGa1MContextWindow({
config: ctx.config,
provider: ctx.provider,
modelId: ctx.modelId,
model: imageCapableModel,
}) ?? imageCapableModel;
model: mediaInputModel,
}) ?? mediaInputModel;
return contextWindowModel === ctx.model ? undefined : contextWindowModel;
}

View File

@@ -85,6 +85,20 @@ describe("chutes-models", () => {
expect(def.compat.supportsUsageInStreaming).toBe(false);
});
it("keeps Qwen VL image limits in the runtime catalog", () => {
const visionModelIds = ["Qwen/Qwen2.5-VL-32B-Instruct", "Qwen/Qwen3-VL-235B-A22B-Instruct"];
for (const id of visionModelIds) {
const model = CHUTES_MODEL_CATALOG.find((candidate) => candidate.id === id);
expect(model).toBeDefined();
if (!model) {
throw new Error(`expected ${id}`);
}
expect(buildChutesModelDefinition(model).mediaInput).toEqual({
image: { maxPixels: 12845056, preferredSidePx: 2048, tokenMode: "provider" },
});
}
});
it("discoverChutesModels returns static catalog when accessToken is empty", async () => {
const models = await discoverChutesModels("");
expect(models).toHaveLength(CHUTES_MODEL_CATALOG.length);

View File

@@ -349,6 +349,9 @@ export const CHUTES_MODEL_CATALOG: ModelDefinitionConfig[] = [
name: "Qwen/Qwen2.5-VL-32B-Instruct",
reasoning: false,
input: ["text", "image"],
mediaInput: {
image: { maxPixels: 12845056, preferredSidePx: 2048, tokenMode: "provider" },
},
contextWindow: 16384,
maxTokens: 16384,
cost: { input: 0.05, output: 0.22, cacheRead: 0, cacheWrite: 0 },
@@ -358,6 +361,9 @@ export const CHUTES_MODEL_CATALOG: ModelDefinitionConfig[] = [
name: "Qwen/Qwen3-VL-235B-A22B-Instruct",
reasoning: false,
input: ["text", "image"],
mediaInput: {
image: { maxPixels: 12845056, preferredSidePx: 2048, tokenMode: "provider" },
},
contextWindow: 262144,
maxTokens: 262144,
cost: { input: 0.3, output: 1.2, cacheRead: 0, cacheWrite: 0 },

View File

@@ -567,6 +567,9 @@
"name": "Qwen/Qwen2.5-VL-32B-Instruct",
"reasoning": false,
"input": ["text", "image"],
"mediaInput": {
"image": { "maxPixels": 12845056, "preferredSidePx": 2048, "tokenMode": "provider" }
},
"contextWindow": 16384,
"maxTokens": 16384,
"cost": {
@@ -581,6 +584,9 @@
"name": "Qwen/Qwen3-VL-235B-A22B-Instruct",
"reasoning": false,
"input": ["text", "image"],
"mediaInput": {
"image": { "maxPixels": 12845056, "preferredSidePx": 2048, "tokenMode": "provider" }
},
"contextWindow": 262144,
"maxTokens": 262144,
"cost": {

View File

@@ -94,6 +94,9 @@
"name": "Llama 4 Scout 17B",
"reasoning": false,
"input": ["text", "image"],
"mediaInput": {
"image": { "maxPixels": 33177600, "preferredSidePx": 2048, "tokenMode": "provider" }
},
"contextWindow": 131072,
"maxTokens": 8192,
"cost": {

View File

@@ -73,6 +73,9 @@
"name": "GPT-5.4",
"reasoning": true,
"input": ["text", "image"],
"mediaInput": {
"image": { "maxSidePx": 2048, "preferredSidePx": 2048, "tokenMode": "detail" }
},
"contextWindow": 272000,
"maxTokens": 128000,
"cost": { "input": 2.5, "output": 15, "cacheRead": 0.25, "cacheWrite": 0 }
@@ -82,6 +85,9 @@
"name": "GPT-5.4 mini",
"reasoning": true,
"input": ["text", "image"],
"mediaInput": {
"image": { "maxSidePx": 2048, "preferredSidePx": 2048, "tokenMode": "detail" }
},
"contextWindow": 400000,
"maxTokens": 128000,
"cost": { "input": 0.75, "output": 4.5, "cacheRead": 0.075, "cacheWrite": 0 }
@@ -91,6 +97,9 @@
"name": "GPT-5.4 nano",
"reasoning": true,
"input": ["text", "image"],
"mediaInput": {
"image": { "maxSidePx": 2048, "preferredSidePx": 2048, "tokenMode": "detail" }
},
"contextWindow": 400000,
"maxTokens": 128000,
"cost": { "input": 0.2, "output": 1.25, "cacheRead": 0.02, "cacheWrite": 0 }
@@ -100,6 +109,9 @@
"name": "GPT-5.4 Pro",
"reasoning": true,
"input": ["text", "image"],
"mediaInput": {
"image": { "maxSidePx": 2048, "preferredSidePx": 2048, "tokenMode": "detail" }
},
"contextWindow": 1050000,
"maxTokens": 128000,
"cost": { "input": 30, "output": 180, "cacheRead": 0, "cacheWrite": 0 }
@@ -109,6 +121,9 @@
"name": "GPT-5.5",
"reasoning": true,
"input": ["text", "image"],
"mediaInput": {
"image": { "maxSidePx": 6000, "preferredSidePx": 2048, "tokenMode": "detail" }
},
"contextWindow": 272000,
"maxTokens": 128000,
"cost": { "input": 5, "output": 30, "cacheRead": 0.5, "cacheWrite": 0 }
@@ -190,6 +205,9 @@
"name": "gpt-5.5-pro",
"reasoning": true,
"input": ["text", "image"],
"mediaInput": {
"image": { "maxSidePx": 6000, "preferredSidePx": 2048, "tokenMode": "detail" }
},
"contextWindow": 1000000,
"maxTokens": 128000,
"cost": { "input": 30, "output": 180, "cacheRead": 0, "cacheWrite": 0 }
@@ -205,6 +223,9 @@
"name": "gpt-5.5",
"reasoning": true,
"input": ["text", "image"],
"mediaInput": {
"image": { "maxSidePx": 6000, "preferredSidePx": 2048, "tokenMode": "detail" }
},
"contextWindow": 400000,
"contextTokens": 272000,
"maxTokens": 128000,
@@ -215,6 +236,9 @@
"name": "gpt-5.4",
"reasoning": true,
"input": ["text", "image"],
"mediaInput": {
"image": { "maxSidePx": 2048, "preferredSidePx": 2048, "tokenMode": "detail" }
},
"contextWindow": 1050000,
"contextTokens": 272000,
"maxTokens": 128000,
@@ -225,6 +249,9 @@
"name": "gpt-5.4-pro",
"reasoning": true,
"input": ["text", "image"],
"mediaInput": {
"image": { "maxSidePx": 2048, "preferredSidePx": 2048, "tokenMode": "detail" }
},
"contextWindow": 1050000,
"contextTokens": 272000,
"maxTokens": 128000,
@@ -235,6 +262,9 @@
"name": "gpt-5.4-mini",
"reasoning": true,
"input": ["text", "image"],
"mediaInput": {
"image": { "maxSidePx": 2048, "preferredSidePx": 2048, "tokenMode": "detail" }
},
"contextWindow": 400000,
"contextTokens": 272000,
"maxTokens": 128000,
@@ -245,6 +275,9 @@
"name": "gpt-5.5-pro",
"reasoning": true,
"input": ["text", "image"],
"mediaInput": {
"image": { "maxSidePx": 6000, "preferredSidePx": 2048, "tokenMode": "detail" }
},
"contextWindow": 1000000,
"contextTokens": 272000,
"maxTokens": 128000,

View File

@@ -1,4 +1,4 @@
import type { ModelCompatConfig } from "../config/types.models.js";
import type { ModelCompatConfig, ModelMediaInputConfig } from "../config/types.models.js";
export type ModelInputType = "text" | "image" | "audio" | "video" | "document";
@@ -12,4 +12,5 @@ export type ModelCatalogEntry = {
reasoning?: boolean;
input?: ModelInputType[];
compat?: ModelCompatConfig;
mediaInput?: ModelMediaInputConfig;
};

View File

@@ -548,7 +548,7 @@ function resolveFallbackSoonestCooldownExpiry(params: {
return soonest;
}
function resolveImageFallbackCandidates(
export function resolveImageFallbackCandidates(
params: {
cfg: OpenClawConfig | undefined;
defaultProvider: string;
@@ -605,7 +605,7 @@ function resolveImageFallbackCandidates(
return candidates;
}
function resolveImageFallbackDefaultProvider(cfg: OpenClawConfig | undefined): string {
export function resolveImageFallbackDefaultProvider(cfg: OpenClawConfig | undefined): string {
const configuredPrimary = resolveAgentModelPrimaryValue(cfg?.agents?.defaults?.imageModel);
if (configuredPrimary?.trim()) {
const aliasIndex = buildModelAliasIndex({

View File

@@ -41,6 +41,9 @@ function createMistralManifestPlugin(overrides?: {
contextWindow: 262144,
maxTokens: 8192,
cost: { input: 1.5, output: 7.5, cacheRead: 0, cacheWrite: 0 },
mediaInput: {
image: { maxSidePx: 2048, preferredSidePx: 1536, tokenMode: "provider" },
},
},
],
},
@@ -80,6 +83,9 @@ describe("resolveBundledStaticCatalogModel", () => {
id: "mistral-medium-3-5",
input: ["text", "image"],
maxTokens: 8192,
mediaInput: {
image: { maxSidePx: 2048, preferredSidePx: 1536, tokenMode: "provider" },
},
name: "Mistral Medium 3.5",
provider: "mistral",
reasoning: true,

View File

@@ -39,6 +39,7 @@ function modelFromStaticCatalogRow(row: NormalizedModelCatalogRow): Model<Api> {
maxTokens: row.maxTokens,
headers: row.headers,
compat: row.compat,
mediaInput: row.mediaInput,
} as Model<Api>;
}

View File

@@ -498,6 +498,9 @@ describe("resolveModel", () => {
input: ["text", "image"],
contextWindow: 262144,
maxTokens: 8192,
mediaInput: {
image: { maxSidePx: 2048, preferredSidePx: 1536, tokenMode: "provider" },
},
});
const cfg = {
models: {
@@ -539,6 +542,101 @@ describe("resolveModel", () => {
expect(discoverModels).not.toHaveBeenCalled();
});
it("merges bundled static media input into resolved models when opted in", async () => {
mockDiscoveredModel(discoverModels, {
provider: "openai",
modelId: "gpt-5.5-pro",
templateModel: {
id: "gpt-5.5-pro",
name: "GPT-5.5 Pro",
provider: "openai",
api: "openai-responses",
baseUrl: "https://api.openai.com/v1",
reasoning: true,
input: ["text", "image"],
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
contextWindow: 272_000,
maxTokens: 128_000,
},
});
resolveBundledStaticCatalogModelMock.mockReturnValueOnce({
provider: "openai",
id: "gpt-5.5-pro",
name: "GPT-5.5 Pro",
api: "openai-responses",
baseUrl: "https://api.openai.com/v1",
reasoning: true,
input: ["text", "image"],
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
contextWindow: 272_000,
maxTokens: 128_000,
mediaInput: {
image: { maxSidePx: 6000, preferredSidePx: 2048, tokenMode: "detail" },
},
});
const result = await resolveModelAsync("openai", "gpt-5.5-pro", "/tmp/agent", undefined, {
allowBundledStaticCatalogFallback: true,
authStorage: { mocked: true } as never,
modelRegistry: discoverModels({ mocked: true } as never, "/tmp/agent"),
runtimeHooks: createRuntimeHooks(),
skipPiDiscovery: true,
});
expect((expectResolvedModel(result) as { mediaInput?: unknown }).mediaInput).toEqual({
image: { maxSidePx: 6000, preferredSidePx: 2048, tokenMode: "detail" },
});
expect(resolveBundledStaticCatalogModelMock).toHaveBeenCalledWith({
provider: "openai",
modelId: "gpt-5.5-pro",
cfg: undefined,
workspaceDir: undefined,
});
});
it("merges configured media input with discovered model metadata", () => {
mockDiscoveredModel(discoverModels, {
provider: "custom",
modelId: "vision-model",
templateModel: {
id: "vision-model",
name: "Vision Model",
provider: "custom",
api: "openai-responses",
baseUrl: "https://models.example.com/v1",
reasoning: false,
input: ["text", "image"],
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
contextWindow: 8192,
maxTokens: 1024,
mediaInput: {
image: { maxSidePx: 2048, preferredSidePx: 1536, tokenMode: "provider" },
},
},
});
const result = resolveModelForTest("custom", "vision-model", "/tmp/agent", {
models: {
providers: {
custom: {
baseUrl: "https://models.example.com/v1",
models: [
{
id: "vision-model",
name: "Vision Model",
mediaInput: { image: { maxBytes: 1 } },
},
],
},
},
},
} as unknown as OpenClawConfig);
expect((expectResolvedModel(result) as { mediaInput?: unknown }).mediaInput).toEqual({
image: { maxBytes: 1, maxSidePx: 2048, preferredSidePx: 1536, tokenMode: "provider" },
});
});
it("does not use bundled static catalog rows unless the caller opts in", async () => {
const result = await resolveModelAsync(
"mistral",

View File

@@ -5,6 +5,7 @@ import {
type AuthStorage,
type ModelRegistry,
} from "@earendil-works/pi-coding-agent";
import type { ModelMediaInputConfig } from "../../config/types.models.js";
import type { OpenClawConfig } from "../../config/types.openclaw.js";
import type { ProviderRuntimeModel } from "../../plugins/provider-runtime-model.types.js";
import {
@@ -363,6 +364,29 @@ function resolveProviderRequestTimeoutMs(timeoutSeconds: unknown): number | unde
return Math.floor(timeoutSeconds) * 1000;
}
/**
 * Merges two media-input metadata objects, with `override` winning per field.
 *
 * When only one side is present it is returned as-is (same reference). When
 * both are present, top-level keys are shallow-merged and the `image` section
 * is merged field by field, so a partial override (for example only
 * `maxBytes`) keeps the remaining limits from `base`. If neither side has an
 * `image` section, the result carries `image: undefined`.
 *
 * @param base Lower-priority metadata (for example discovered or catalog values).
 * @param override Higher-priority metadata (for example user configuration).
 * @returns The merged metadata, or `undefined` when both inputs are missing.
 */
function mergeModelMediaInput(
  base: ModelMediaInputConfig | undefined,
  override: ModelMediaInputConfig | undefined,
): ModelMediaInputConfig | undefined {
  if (!base || !override) {
    // At most one side is usable; hand it back untouched.
    return base || override;
  }
  const image =
    base.image || override.image ? { ...base.image, ...override.image } : undefined;
  return { ...base, ...override, image };
}
function matchesProviderScopedModelId(params: {
candidateId?: string;
provider: string;
@@ -702,6 +726,10 @@ function applyConfiguredProviderOverrides(params: {
...(requestTimeoutMs !== undefined ? { requestTimeoutMs } : {}),
headers: requestConfig.headers,
compat: metadataOverrideModel?.compat ?? discoveredModel.compat,
mediaInput: mergeModelMediaInput(
discoveredModel.mediaInput,
metadataOverrideModel?.mediaInput,
),
},
providerRequest,
),
@@ -913,7 +941,10 @@ function resolveConfiguredFallbackModel(params: {
}
const fallbackTransport = resolveProviderTransport({
provider,
api: normalizeResolvedTransportApi(configuredModel?.api) ?? resolveConfiguredProviderDefaultApi(providerConfig) ?? "openai-responses",
api:
normalizeResolvedTransportApi(configuredModel?.api) ??
resolveConfiguredProviderDefaultApi(providerConfig) ??
"openai-responses",
baseUrl: configuredModel?.baseUrl ?? providerConfig?.baseUrl,
cfg,
workspaceDir,
@@ -968,6 +999,7 @@ function resolveConfiguredFallbackModel(params: {
...(resolvedParams ? { params: resolvedParams } : {}),
...(requestTimeoutMs !== undefined ? { requestTimeoutMs } : {}),
headers: requestConfig.headers,
mediaInput: configuredModel?.mediaInput,
} as Model<Api>,
providerRequest,
),
@@ -1270,6 +1302,20 @@ export async function resolveModelAsync(
});
}
}
if (model && options?.allowBundledStaticCatalogFallback) {
const staticCatalogModel = resolveBundledStaticCatalogModel({
provider: normalizedRef.provider,
modelId: normalizedRef.model,
cfg,
workspaceDir,
});
const staticMediaInput = (staticCatalogModel as ProviderRuntimeModel | undefined)?.mediaInput;
const resolvedMediaInput = (model as ProviderRuntimeModel).mediaInput;
const mediaInput = mergeModelMediaInput(staticMediaInput, resolvedMediaInput);
if (mediaInput) {
model = { ...(model as ProviderRuntimeModel), mediaInput } as typeof model;
}
}
if (model) {
return { model, authStorage, modelRegistry };
}

View File

@@ -0,0 +1,253 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { afterEach, describe, expect, it } from "vitest";
import type { ModelApi } from "../../config/types.models.js";
import type { OpenClawConfig } from "../../config/types.openclaw.js";
import { resizeToJpeg } from "../../media/media-services.js";
import { encodePngRgba, fillPixel } from "../../media/png-encode.js";
import {
describeImageWithModel,
type ImageDescriptionRequest,
type MediaUnderstandingProvider,
} from "../../plugin-sdk/media-understanding.js";
import { isOverloadedErrorMessage, isServerErrorMessage } from "../../plugin-sdk/test-env.js";
import { isLiveTestEnabled } from "../live-test-helpers.js";
import { createImageTool, testing } from "./image-tool.js";
// Provider API keys read from the environment; an empty string means "not configured".
const OPENAI_API_KEY = process.env.OPENAI_API_KEY?.trim() ?? "";
const ANTHROPIC_API_KEY = process.env.ANTHROPIC_API_KEY?.trim() ?? "";
// Live cases are gated on the OPENCLAW_LIVE_IMAGE_TOOL_TEST switch passed to isLiveTestEnabled.
const LIVE_IMAGE_TOOL_ENABLED = isLiveTestEnabled(["OPENCLAW_LIVE_IMAGE_TOOL_TEST"]);
// OpenAI model override order: provider-specific env var, then the generic one, then the default.
const LIVE_OPENAI_MODEL =
process.env.OPENCLAW_LIVE_IMAGE_TOOL_OPENAI_MODEL?.trim() ||
process.env.OPENCLAW_LIVE_IMAGE_TOOL_MODEL?.trim() ||
"gpt-4.1-mini";
// Anthropic model override: provider-specific env var, then the default.
const LIVE_ANTHROPIC_MODEL =
process.env.OPENCLAW_LIVE_IMAGE_TOOL_ANTHROPIC_MODEL?.trim() || "claude-sonnet-4-6";
// Side limit (px) declared in the test model metadata and asserted on the image sent to the provider.
const MODEL_SIDE_LIMIT = 512;
/** Everything needed to build a config for, and decide whether to run, one live provider case. */
type LiveProviderCase = {
provider: "openai" | "anthropic";
model: string;
apiKey: string;
api: ModelApi;
baseUrl: string;
contextWindow: number;
maxTokens: number;
reasoning: boolean;
// True only when live image-tool tests are enabled and the provider API key is non-empty.
live: boolean;
};
/** Live case for the OpenAI Responses API. */
const OPENAI_LIVE_CASE: LiveProviderCase = {
provider: "openai",
model: LIVE_OPENAI_MODEL,
apiKey: OPENAI_API_KEY,
api: "openai-responses",
baseUrl: "https://api.openai.com/v1",
contextWindow: 1_047_576,
maxTokens: 32_768,
reasoning: false,
live: LIVE_IMAGE_TOOL_ENABLED && OPENAI_API_KEY.length > 0,
};
/** Live case for the Anthropic Messages API. */
const ANTHROPIC_LIVE_CASE: LiveProviderCase = {
provider: "anthropic",
model: LIVE_ANTHROPIC_MODEL,
apiKey: ANTHROPIC_API_KEY,
api: "anthropic-messages",
baseUrl: "https://api.anthropic.com/v1",
contextWindow: 200_000,
maxTokens: 8192,
reasoning: true,
live: LIVE_IMAGE_TOOL_ENABLED && ANTHROPIC_API_KEY.length > 0,
};
/**
 * Encodes a square PNG whose central block is painted (230, 40, 35) and whose
 * border is painted (30, 110, 220).
 *
 * The central block covers coordinates from `floor(size * 0.25)` (inclusive)
 * to `floor(size * 0.75)` (exclusive) on both axes.
 *
 * @param size Width and height of the image in pixels.
 * @returns The PNG produced by `encodePngRgba`.
 */
function createLargeCenterRedPng(size: number): Buffer {
  // Every byte starts at 255 before fillPixel writes the colour channels.
  const rgba = Buffer.alloc(size * size * 4, 255);
  const bandStart = Math.floor(size * 0.25);
  const bandEnd = Math.floor(size * 0.75);
  const withinBand = (value: number): boolean => value >= bandStart && value < bandEnd;
  for (let row = 0; row < size; row += 1) {
    for (let col = 0; col < size; col += 1) {
      if (withinBand(col) && withinBand(row)) {
        fillPixel(rgba, col, row, size, 230, 40, 35);
      } else {
        fillPixel(rgba, col, row, size, 30, 110, 220);
      }
    }
  }
  return encodePngRgba(rgba, size, size);
}
/**
 * Reads the pixel dimensions from the first start-of-frame (SOF) segment of a
 * JPEG byte stream.
 *
 * The scan starts after the two-byte SOI marker and walks marker segments:
 * - runs of 0xFF fill bytes before a marker code are skipped;
 * - markers without a length field (stuffed 0x00, TEM 0x01, SOI, EOI,
 *   RST0-RST7) are stepped over without reading a segment length;
 * - any SOFn marker (0xC0-0xCF except DHT 0xC4, JPG 0xC8, DAC 0xCC) yields
 *   the frame height and width.
 *
 * @param buffer JPEG-encoded bytes.
 * @returns Frame width and height in pixels.
 * @throws Error with message "JPEG dimensions not found" when no SOF segment
 *   is found within the readable range.
 */
function readJpegDimensions(buffer: Buffer): { width: number; height: number } {
  let offset = 2;
  // Keep enough bytes in range to read a full SOF header after the marker.
  while (offset + 9 < buffer.length) {
    if (buffer[offset] !== 0xff) {
      offset += 1;
      continue;
    }
    const marker = buffer[offset + 1];
    if (marker === 0xff) {
      // Fill byte: any number of 0xFF bytes may precede the real marker code,
      // so advance one byte and re-evaluate instead of treating it as a marker.
      offset += 1;
      continue;
    }
    offset += 2;
    const isStandalone =
      marker === 0x00 || // byte-stuffed 0xFF inside entropy-coded data
      marker === 0x01 || // TEM
      marker === 0xd8 || // SOI
      marker === 0xd9 || // EOI
      (marker >= 0xd0 && marker <= 0xd7); // RST0-RST7
    if (isStandalone) {
      continue;
    }
    const segmentLength = buffer.readUInt16BE(offset);
    const isStartOfFrame =
      marker >= 0xc0 && marker <= 0xcf && ![0xc4, 0xc8, 0xcc].includes(marker);
    if (isStartOfFrame) {
      // SOF payload layout: length(2) precision(1) height(2) width(2) ...
      return {
        height: buffer.readUInt16BE(offset + 3),
        width: buffer.readUInt16BE(offset + 5),
      };
    }
    // The length field counts itself, so this lands on the next marker.
    offset += segmentLength;
  }
  throw new Error("JPEG dimensions not found");
}
/**
 * Turns an arbitrary thrown value into a printable message.
 *
 * @param error Any thrown value.
 * @returns `error.message` for `Error` instances, otherwise `String(error)`.
 */
function formatLiveError(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  return String(error);
}
/**
 * Decides whether a live-call failure should skip the case instead of failing it.
 *
 * An error is skippable when its formatted message is classified as an
 * overloaded or server error, or mentions "timed out" / "operation was aborted"
 * (case-insensitive).
 *
 * @param error Any thrown value.
 * @returns `true` when the failure should be treated as a skip.
 */
function isSkippableLiveError(error: unknown): boolean {
  const text = formatLiveError(error);
  if (isOverloadedErrorMessage(text)) {
    return true;
  }
  if (isServerErrorMessage(text)) {
    return true;
  }
  return /timed out|operation was aborted/i.test(text);
}
/**
 * Builds the OpenClaw config used by one live downscale case.
 *
 * The image model is pinned to `<provider>/<model>` with `imageQuality: "high"`.
 * The provider gets a single model entry whose `mediaInput.image` declares
 * MODEL_SIDE_LIMIT as both the maximum and preferred side, so the test can
 * assert that the image sent to the provider respects that limit.
 *
 * @param testCase Provider, model, and credentials for the live case.
 * @returns A config with agent defaults, one provider entry, and image media tool settings.
 */
function createLiveConfig(testCase: LiveProviderCase): OpenClawConfig {
return {
agents: {
defaults: {
imageModel: { primary: `${testCase.provider}/${testCase.model}` },
imageQuality: "high",
},
},
models: {
providers: {
[testCase.provider]: {
apiKey: testCase.apiKey,
baseUrl: testCase.baseUrl,
api: testCase.api,
models: [
{
id: testCase.model,
name: testCase.model,
reasoning: testCase.reasoning,
input: ["text", "image"],
contextWindow: testCase.contextWindow,
maxTokens: testCase.maxTokens,
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
// Deliberately small side limit, well below the 2200px source image.
mediaInput: {
image: { maxSidePx: MODEL_SIDE_LIMIT, preferredSidePx: MODEL_SIDE_LIMIT },
},
},
],
},
},
},
tools: {
media: {
image: {
// 90 second timeout at both the tool level and the per-model entry.
timeoutSeconds: 90,
models: [{ provider: testCase.provider, model: testCase.model, timeoutSeconds: 90 }],
},
},
},
};
}
/**
 * Runs `run` against a throwaway agent/workspace directory pair containing one
 * large JPEG fixture, and removes the temp directory afterwards.
 *
 * The fixture is a 2200px colour-block PNG re-encoded as JPEG and written to
 * `<workspace>/large-center-red.jpg`. Before it is written, the helper asserts
 * that its longest side exceeds MODEL_SIDE_LIMIT, so a later "was downscaled"
 * assertion is meaningful.
 *
 * @param run Callback receiving the agent dir, workspace dir, and fixture path.
 * @returns Whatever `run` resolves to.
 */
async function withLiveWorkspace<T>(
run: (ctx: { agentDir: string; workspaceDir: string; imagePath: string }) => Promise<T>,
) {
const root = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-image-tool-live-"));
try {
const agentDir = path.join(root, "agent");
const workspaceDir = path.join(root, "workspace");
await fs.mkdir(agentDir, { recursive: true });
await fs.mkdir(workspaceDir, { recursive: true });
const sourcePng = createLargeCenterRedPng(2200);
// maxSide matches the source size, so this call is only asked to re-encode, not shrink.
const sourceJpeg = await resizeToJpeg({
buffer: sourcePng,
maxSide: 2200,
quality: 92,
withoutEnlargement: true,
});
// Sanity check: the fixture must start out larger than the model's side limit.
const sourceDimensions = readJpegDimensions(sourceJpeg);
expect(Math.max(sourceDimensions.width, sourceDimensions.height)).toBeGreaterThan(
MODEL_SIDE_LIMIT,
);
const imagePath = path.join(workspaceDir, "large-center-red.jpg");
await fs.writeFile(imagePath, sourceJpeg);
return await run({ agentDir, workspaceDir, imagePath });
} finally {
// Always remove the temp tree, including when the callback or an assertion throws.
await fs.rm(root, { recursive: true, force: true });
}
}
// Calls setProviderDepsForTest with no overrides after every test — presumably this
// restores the default provider dependencies replaced in runLiveDownscaleCase; confirm
// against image-tool.js.
afterEach(() => {
testing.setProviderDepsForTest();
});
/**
 * Executes the image tool against a live provider and verifies that the large
 * fixture image is downscaled before it reaches the model.
 *
 * Provider dependencies are overridden so that:
 * - `getMediaUnderstandingProvider` always returns `undefined`;
 * - `describeImageWithModel` first asserts the provider, model, and JPEG mime
 *   type, records the JPEG dimensions, and asserts the longest side is at
 *   most MODEL_SIDE_LIMIT, then delegates to the real `describeImageWithModel`.
 *
 * Overloaded, server, or timeout errors from the live call are logged and
 * treated as a skip instead of a failure.
 *
 * @param testCase Provider, model, and credentials for the live case.
 */
async function runLiveDownscaleCase(testCase: LiveProviderCase) {
let observedDimensions: { width: number; height: number } | undefined;
testing.setProviderDepsForTest({
getMediaUnderstandingProvider: (
_id: string,
_registry: Map<string, MediaUnderstandingProvider>,
) => undefined,
describeImageWithModel: async (params: ImageDescriptionRequest) => {
expect(params.provider).toBe(testCase.provider);
expect(params.model).toBe(testCase.model);
expect(params.mime).toBe("image/jpeg");
// Capture what is actually about to be sent so the outer test can confirm this hook ran.
observedDimensions = readJpegDimensions(params.buffer);
expect(Math.max(observedDimensions.width, observedDimensions.height)).toBeLessThanOrEqual(
MODEL_SIDE_LIMIT,
);
return await describeImageWithModel(params);
},
});
await withLiveWorkspace(async ({ agentDir, workspaceDir, imagePath }) => {
const tool = createImageTool({
config: createLiveConfig(testCase),
agentDir,
workspaceDir,
});
if (!tool) {
throw new Error("expected image tool");
}
let result: unknown;
try {
result = await tool.execute(`live-${testCase.provider}-large-image`, {
prompt:
"Look at the center of the image. Reply with one lowercase word naming that center color.",
image: imagePath,
});
} catch (err) {
// Transient provider trouble should not fail the suite; log it and stop this case early.
if (isSkippableLiveError(err)) {
console.warn(`[live:image-tool:${testCase.provider}] skipped: ${formatLiveError(err)}`);
return;
}
throw err;
}
// Join all text blocks in lowercase so the colour check is case-insensitive.
const content = (result as { content?: Array<{ type?: string; text?: string }> }).content;
const text = content
?.filter((block) => block.type === "text")
.map((block) => block.text?.toLowerCase() ?? "")
.join(" ");
// Several colour names are accepted for the fixture's (230, 40, 35) centre block.
expect(text).toMatch(/red|crimson|orange/);
expect(observedDimensions).toBeDefined();
});
}
// Each suite is skipped unless its case is marked live (live tests enabled and API key present).
// Each test is given a 180 second timeout.
describe.skipIf(!OPENAI_LIVE_CASE.live)("image tool OpenAI live", () => {
it("downscales a large local image before sending it to the live vision model", async () => {
await runLiveDownscaleCase(OPENAI_LIVE_CASE);
}, 180_000);
});
describe.skipIf(!ANTHROPIC_LIVE_CASE.live)("image tool Anthropic live", () => {
it("downscales a large local image before sending it to the live vision model", async () => {
await runLiveDownscaleCase(ANTHROPIC_LIVE_CASE);
}, 180_000);
});

View File

@@ -5,6 +5,7 @@ import path from "node:path";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import type { OpenClawConfig } from "../../config/config.js";
import type { ModelDefinitionConfig } from "../../config/types.models.js";
import { encodePngRgba, fillPixel } from "../../media/png-encode.js";
import type {
ImageDescriptionRequest,
ImagesDescriptionRequest,
@@ -223,7 +224,53 @@ async function withTempAgentDir<T>(run: (agentDir: string) => Promise<T>): Promi
const ONE_PIXEL_PNG_B64 =
"iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAIGNIUk0AAHomAACAhAAA+gAAAIDoAAB1MAAA6mAAADqYAAAXcJy6UTwAAAAGYktHRAD/AP8A/6C9p5MAAAAHdElNRQfqBBsGAQr00ED3AAAAJXRFWHRkYXRlOmNyZWF0ZQAyMDI2LTA0LTI3VDA2OjAxOjEwKzAwOjAwPU3tXwAAACV0RVh0ZGF0ZTptb2RpZnkAMjAyNi0wNC0yN1QwNjowMToxMCswMDowMEwQVeMAAAAodEVYdGRhdGU6dGltZXN0YW1wADIwMjYtMDQtMjdUMDY6MDE6MTArMDA6MDAbBXQ8AAAAeElEQVRo3u3awQnDQBAEwT2Q8w/YAikIP5rF1RFMca+FO8/s7rrnqjcA1BsA6g0A9QaAesOfA77zqTf8Blj/AgAAAAAAAJsDqAOoA6gDqAOoc9TXAdQB1AHUAdQB1AHUAdQB1AHU7Qc46gEAAAAANrcecGZ2f8B/ASYSQPlKoEJ/AAAAAElFTkSuQmCC";
const ONE_PIXEL_GIF_B64 = "R0lGODlhAQABAIABAP///wAAACwAAAAAAQABAAACAkQBADs=";
const ONE_PIXEL_JPEG_B64 = "QUJDRA==";
/**
 * Encodes a square PNG with a (230, 40, 35) block in the centre and a
 * (30, 110, 220) border.
 *
 * The centre block spans `floor(size * 0.25)` (inclusive) to
 * `floor(size * 0.75)` (exclusive) on both axes.
 *
 * @param size Width and height of the image in pixels.
 * @returns The PNG produced by `encodePngRgba`.
 */
function createLargeColorBlockPng(size: number): Buffer {
  // Every byte starts at 255 before fillPixel writes the colour channels.
  const pixels = Buffer.alloc(size * size * 4, 255);
  const innerStart = Math.floor(size * 0.25);
  const innerEnd = Math.floor(size * 0.75);
  const isInner = (value: number): boolean => value >= innerStart && value < innerEnd;
  for (let row = 0; row < size; row += 1) {
    for (let col = 0; col < size; col += 1) {
      if (isInner(col) && isInner(row)) {
        fillPixel(pixels, col, row, size, 230, 40, 35);
      } else {
        fillPixel(pixels, col, row, size, 30, 110, 220);
      }
    }
  }
  return encodePngRgba(pixels, size, size);
}
/**
 * Reads the pixel dimensions from the first start-of-frame (SOF) segment of a
 * JPEG byte stream.
 *
 * The scan starts after the two-byte SOI marker and walks marker segments:
 * - runs of 0xFF fill bytes before a marker code are skipped;
 * - markers without a length field (stuffed 0x00, TEM 0x01, SOI, EOI,
 *   RST0-RST7) are stepped over without reading a segment length;
 * - any SOFn marker (0xC0-0xCF except DHT 0xC4, JPG 0xC8, DAC 0xCC) yields
 *   the frame height and width.
 *
 * @param buffer JPEG-encoded bytes.
 * @returns Frame width and height in pixels.
 * @throws Error with message "JPEG dimensions not found" when no SOF segment
 *   is found within the readable range.
 */
function readJpegDimensions(buffer: Buffer): { width: number; height: number } {
  let offset = 2;
  // Keep enough bytes in range to read a full SOF header after the marker.
  while (offset + 9 < buffer.length) {
    if (buffer[offset] !== 0xff) {
      offset += 1;
      continue;
    }
    const marker = buffer[offset + 1];
    if (marker === 0xff) {
      // Fill byte: any number of 0xFF bytes may precede the real marker code,
      // so advance one byte and re-evaluate instead of treating it as a marker.
      offset += 1;
      continue;
    }
    offset += 2;
    const isStandalone =
      marker === 0x00 || // byte-stuffed 0xFF inside entropy-coded data
      marker === 0x01 || // TEM
      marker === 0xd8 || // SOI
      marker === 0xd9 || // EOI
      (marker >= 0xd0 && marker <= 0xd7); // RST0-RST7
    if (isStandalone) {
      continue;
    }
    const segmentLength = buffer.readUInt16BE(offset);
    const isStartOfFrame =
      marker >= 0xc0 && marker <= 0xcf && ![0xc4, 0xc8, 0xcc].includes(marker);
    if (isStartOfFrame) {
      // SOF payload layout: length(2) precision(1) height(2) width(2) ...
      return {
        height: buffer.readUInt16BE(offset + 3),
        width: buffer.readUInt16BE(offset + 5),
      };
    }
    // The length field counts itself, so this lands on the next marker.
    offset += segmentLength;
  }
  throw new Error("JPEG dimensions not found");
}
/**
 * Reads the width and height from a PNG's IHDR chunk.
 *
 * The buffer must be at least 24 bytes long and carry the ASCII chunk type
 * "IHDR" at bytes 12-15; width and height are the big-endian 32-bit values at
 * offsets 16 and 20.
 *
 * @param buffer PNG-encoded bytes.
 * @returns Image width and height in pixels.
 * @throws Error with message "PNG dimensions not found" when the IHDR header
 *   is missing or the buffer is too short.
 */
function readPngDimensions(buffer: Buffer): { width: number; height: number } {
  const hasIhdr = buffer.length >= 24 && buffer.toString("ascii", 12, 16) === "IHDR";
  if (!hasIhdr) {
    throw new Error("PNG dimensions not found");
  }
  const width = buffer.readUInt32BE(16);
  const height = buffer.readUInt32BE(20);
  return { width, height };
}
async function withTempWorkspacePng(
cb: (args: { workspaceDir: string; imagePath: string }) => Promise<void>,
@@ -1370,7 +1417,7 @@ describe("image tool implicit imageModel config", () => {
).toBe(true);
expect(userContent.some((block) => block.type === "image_url")).toBe(true);
expect(userContent.find((block) => block.type === "image_url")?.image_url?.url).toContain(
"data:image/png;base64,",
"data:image/",
);
expect(bodyRaw).not.toContain('"role":"developer"');
expectToolText(result, "ok moonshot");
@@ -1783,6 +1830,136 @@ describe("image tool data URL support", () => {
bufferFromSpy.mockRestore();
}
});
it("applies model image maxBytes to data URLs", async () => {
  await withTempAgentDir(async (agentDir) => {
    installImageUnderstandingProviderStubs();
    // The model advertises a one-byte image cap; the test expects the tool to
    // reject the data URL instead of sending it.
    const oneByteModel = {
      ...makeModelDefinition("tiny-vision", ["text", "image"]),
      mediaInput: { image: { maxBytes: 1 } },
    } satisfies ModelDefinitionConfig;
    const cfg: OpenClawConfig = {
      agents: { defaults: { imageModel: { primary: "openai/tiny-vision" } } },
      models: {
        providers: {
          openai: {
            api: "openai-responses",
            baseUrl: "https://api.openai.com/v1",
            models: [oneByteModel],
          },
        },
      },
    };
    const tool = createRequiredImageTool({ config: cfg, agentDir });
    const dataUrl = `data:image/png;base64,${ONE_PIXEL_PNG_B64}`;
    await expect(
      tool.execute("t1", { prompt: "Describe this image.", image: dataUrl }),
    ).rejects.toThrow(/could not be reduced below/i);
  });
});
it("downscales data URL images to the resolved model side limit", async () => {
  await withTempAgentDir(async (agentDir) => {
    // Dimensions of the buffer that actually reaches the provider stub.
    let seen: { width: number; height: number } | undefined;
    installImageUnderstandingProviderStubs({
      id: "openai",
      capabilities: ["image"],
      describeImage: async (params) => {
        if (params.mime === "image/png") {
          seen = readPngDimensions(params.buffer);
        } else {
          seen = readJpegDimensions(params.buffer);
        }
        return { text: "ok", model: params.model };
      },
    });
    const sideLimitedModel = {
      ...makeModelDefinition("tiny-vision", ["text", "image"]),
      mediaInput: { image: { maxSidePx: 512, preferredSidePx: 512 } },
    } satisfies ModelDefinitionConfig;
    const cfg: OpenClawConfig = {
      agents: {
        defaults: {
          imageModel: { primary: "openai/tiny-vision" },
          imageQuality: "high",
        },
      },
      models: {
        providers: {
          openai: {
            api: "openai-responses",
            apiKey: "test-key",
            baseUrl: "https://api.openai.com/v1",
            models: [sideLimitedModel],
          },
        },
      },
    };
    const tool = createRequiredImageTool({ config: cfg, agentDir });
    // 1600px source, well above the 512px model limit.
    const sourcePng = createLargeColorBlockPng(1600);
    const sourceDataUrl = `data:image/png;base64,${sourcePng.toString("base64")}`;
    await expectImageToolExecOk(tool, sourceDataUrl);
    expect(seen).toBeDefined();
    if (!seen) {
      throw new Error("expected observed data URL dimensions");
    }
    const longestSide = Math.max(seen.width, seen.height);
    expect(longestSide).toBeLessThanOrEqual(512);
  });
});
it("applies configured image quality to data URLs without model media metadata", async () => {
  await withTempAgentDir(async (agentDir) => {
    // Dimensions of the buffer that actually reaches the provider stub.
    let seen: { width: number; height: number } | undefined;
    installImageUnderstandingProviderStubs({
      id: "openai",
      capabilities: ["image"],
      describeImage: async (params) => {
        if (params.mime === "image/png") {
          seen = readPngDimensions(params.buffer);
        } else {
          seen = readJpegDimensions(params.buffer);
        }
        return { text: "ok", model: params.model };
      },
    });
    // The model definition carries no mediaInput block; only imageQuality is configured.
    const plainModel = makeModelDefinition("plain-vision", ["text", "image"]);
    const cfg: OpenClawConfig = {
      agents: {
        defaults: {
          imageModel: { primary: "openai/plain-vision" },
          imageQuality: "efficient",
        },
      },
      models: {
        providers: {
          openai: {
            api: "openai-responses",
            apiKey: "test-key",
            baseUrl: "https://api.openai.com/v1",
            models: [plainModel],
          },
        },
      },
    };
    const tool = createRequiredImageTool({ config: cfg, agentDir });
    const sourcePng = createLargeColorBlockPng(1600);
    const sourceDataUrl = `data:image/png;base64,${sourcePng.toString("base64")}`;
    await expectImageToolExecOk(tool, sourceDataUrl);
    expect(seen).toBeDefined();
    if (!seen) {
      throw new Error("expected observed data URL dimensions");
    }
    const longestSide = Math.max(seen.width, seen.height);
    expect(longestSide).toBeLessThanOrEqual(1280);
  });
});
});
describe("image tool MiniMax VLM routing", () => {
@@ -1832,7 +2009,7 @@ describe("image tool MiniMax VLM routing", () => {
expect(init?.method).toBe("POST");
expect((init?.headers as Record<string, string>)?.Authorization).toBe("Bearer minimax-test");
expect(String(init?.body)).toContain('"prompt":"Describe the image."');
expect(String(init?.body)).toContain('"image_url":"data:image/png;base64,');
expect(String(init?.body)).toContain('"image_url":"data:image/');
const text = res.content?.find((b) => b.type === "text")?.text ?? "";
expect(text).toBe("ok");
@@ -1840,10 +2017,11 @@ describe("image tool MiniMax VLM routing", () => {
it("accepts images[] for multi-image requests", async () => {
const { fetch, tool } = await createMinimaxVlmFixture({ status_code: 0, status_msg: "" });
const secondPngB64 = createLargeColorBlockPng(2).toString("base64");
const res = await tool.execute("t1", {
prompt: "Compare these images.",
images: [`data:image/png;base64,${pngB64}`, `data:image/jpeg;base64,${ONE_PIXEL_JPEG_B64}`],
images: [`data:image/png;base64,${pngB64}`, `data:image/png;base64,${secondPngB64}`],
});
expect(fetch).toHaveBeenCalledTimes(2);
@@ -1857,14 +2035,15 @@ describe("image tool MiniMax VLM routing", () => {
it("combines image + images with dedupe and enforces maxImages", async () => {
const { fetch, tool } = await createMinimaxVlmFixture({ status_code: 0, status_msg: "" });
const secondPngB64 = createLargeColorBlockPng(2).toString("base64");
const deduped = await tool.execute("t1", {
prompt: "Compare these images.",
image: `data:image/png;base64,${pngB64}`,
images: [
`data:image/png;base64,${pngB64}`,
`data:image/jpeg;base64,${ONE_PIXEL_JPEG_B64}`,
`data:image/jpeg;base64,${ONE_PIXEL_JPEG_B64}`,
`data:image/png;base64,${secondPngB64}`,
`data:image/png;base64,${secondPngB64}`,
],
});
@@ -2166,3 +2345,135 @@ describe("image tool response validation", () => {
expect(testing.hasImageReasoningOnlyResponse(message as never)).toBe(false);
});
});
describe("image compression policy", () => {
// Shared fixture for the compression-policy tests: imageQuality "high" plus three
// image-capable models whose mediaInput.image metadata differs, so each test can
// tell which model's limits ended up in the resolved policy.
const cfgWithImageModelMetadata = {
agents: {
defaults: {
imageQuality: "high",
},
},
models: {
providers: {
anthropic: {
baseUrl: "https://api.anthropic.com",
api: "anthropic-messages",
models: [
{
id: "claude-opus-4-7",
name: "Claude Opus 4.7",
reasoning: true,
input: ["text", "image"],
contextWindow: 1_000_000,
maxTokens: 64_000,
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
mediaInput: {
image: { maxSidePx: 2576, preferredSidePx: 2576, tokenMode: "provider" },
},
},
{
id: "claude-opus-4-6",
name: "Claude Opus 4.6",
reasoning: true,
input: ["text", "image"],
contextWindow: 1_000_000,
maxTokens: 64_000,
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
mediaInput: {
image: { maxSidePx: 1568, preferredSidePx: 1568, tokenMode: "provider" },
},
},
],
},
openai: {
baseUrl: "https://api.openai.com/v1",
api: "openai-responses",
models: [
{
id: "gpt-5.5",
name: "GPT-5.5",
reasoning: true,
input: ["text", "image"],
contextWindow: 272_000,
maxTokens: 128_000,
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
// Unlike the anthropic entries, the preferred side is below the maximum side.
mediaInput: {
image: { maxSidePx: 6000, preferredSidePx: 2048, tokenMode: "detail" },
},
},
],
},
},
},
} satisfies OpenClawConfig;
it("derives model metadata, quality preference, and image count from config", async () => {
  const policy = testing.resolveImageCompressionPolicy({
    cfg: cfgWithImageModelMetadata,
    imageModelConfig: { primary: "anthropic/claude-opus-4-7" },
    imageCount: 2,
  });
  // Quality comes from agents.defaults, the model entry from the primary's mediaInput.image.
  await expect(policy).resolves.toEqual({
    quality: "high",
    imageCount: 2,
    models: [{ maxSidePx: 2576, preferredSidePx: 2576, tokenMode: "provider" }],
  });
});
it("keeps unset image quality as adaptive auto behavior and includes fallback models", async () => {
  // Drop the agents block so no imageQuality is configured.
  const { agents: _agents, ...cfg } = cfgWithImageModelMetadata;
  const policy = testing.resolveImageCompressionPolicy({
    cfg,
    imageModelConfig: {
      primary: "openai/gpt-5.5",
      fallbacks: ["anthropic/claude-opus-4-6", "unknown/custom-image"],
    },
    imageCount: 1,
  });
  // One entry per candidate in order; the unknown model contributes an empty entry.
  const expectedModels = [
    { maxSidePx: 6000, preferredSidePx: 2048, tokenMode: "detail" },
    { maxSidePx: 1568, preferredSidePx: 1568, tokenMode: "provider" },
    {},
  ];
  await expect(policy).resolves.toEqual({
    imageCount: 1,
    models: expectedModels,
  });
});
it("uses a model override as the compression candidate", async () => {
  const policy = testing.resolveImageCompressionPolicy({
    cfg: cfgWithImageModelMetadata,
    imageModelConfig: {
      primary: "openai/gpt-5.5",
      fallbacks: ["anthropic/claude-opus-4-6"],
    },
    modelOverride: "anthropic/claude-opus-4-6",
    imageCount: 1,
  });
  // Only the override's metadata is expected, not the configured primary's.
  await expect(policy).resolves.toMatchObject({
    models: [{ maxSidePx: 1568, preferredSidePx: 1568, tokenMode: "provider" }],
  });
});
it("resolves providerless overrides before reading compression metadata", async () => {
  const policy = testing.resolveImageCompressionPolicy({
    cfg: cfgWithImageModelMetadata,
    imageModelConfig: {
      primary: "anthropic/claude-opus-4-6",
    },
    // No "provider/" prefix: the expected metadata is the openai gpt-5.5 entry.
    modelOverride: "gpt-5.5",
    imageCount: 1,
  });
  await expect(policy).resolves.toMatchObject({
    models: [{ maxSidePx: 6000, preferredSidePx: 2048, tokenMode: "detail" }],
  });
});
});

View File

@@ -16,15 +16,26 @@ import {
classifyMediaReferenceSource,
normalizeMediaReferenceSource,
} from "../../media/media-reference.js";
import { loadWebMedia } from "../../media/web-media.js";
import {
loadWebMedia,
optimizeImageBufferForWebMedia,
type ImageCompressionModelPolicy,
type ImageCompressionPolicy,
} from "../../media/web-media.js";
import {
describeImageWithModel,
describeImagesWithModel,
type MediaUnderstandingProvider,
} from "../../plugin-sdk/media-understanding.js";
import type { ProviderRuntimeModel } from "../../plugins/provider-runtime-model.types.js";
import { resolveUserPath } from "../../utils.js";
import type { AuthProfileStore } from "../auth-profiles/types.js";
import { isMinimaxVlmProvider } from "../minimax-vlm.js";
import {
resolveImageFallbackCandidates,
resolveImageFallbackDefaultProvider,
} from "../model-fallback.js";
import { resolveModelAsync } from "../pi-embedded-runner/model.js";
import {
coerceImageAssistantText,
coerceImageModelConfig,
@@ -117,6 +128,7 @@ export const testing = {
coerceImageAssistantText,
hasImageReasoningOnlyResponse,
resolveImageToolMaxTokens,
resolveImageCompressionPolicy,
setProviderDepsForTest(overrides?: {
buildProviderRegistry?: typeof buildProviderRegistry;
getMediaUnderstandingProvider?: typeof getMediaUnderstandingProvider;
@@ -272,6 +284,68 @@ function pickMaxBytes(cfg?: OpenClawConfig, maxBytesMb?: number): number | undef
return undefined;
}
/**
 * Lists the provider/model pairs whose image limits should shape compression.
 *
 * An image model config derived from `modelOverride` wins over the configured
 * `imageModelConfig`. Whichever is present is applied to the config before the
 * image fallback candidates are resolved; with neither, the config is used as-is.
 */
function resolveCompressionModelCandidates(params: {
  cfg?: OpenClawConfig;
  imageModelConfig?: ImageModelConfig | null;
  modelOverride?: string;
}): Array<{ provider: string; model: string }> {
  const { cfg, imageModelConfig, modelOverride } = params;
  const fromOverride = resolveImageModelConfigForOverride({ cfg, modelOverride });
  const fromConfig = imageModelConfig
    ? resolveConfiguredImageModelRefs({ cfg, imageModelConfig })
    : null;
  const chosen = fromOverride ?? fromConfig;
  let effectiveCfg = cfg;
  if (chosen) {
    effectiveCfg = applyImageModelConfigDefaults(cfg, chosen);
  }
  const defaultProvider = resolveImageFallbackDefaultProvider(effectiveCfg);
  return resolveImageFallbackCandidates({ cfg: effectiveCfg, defaultProvider });
}
/**
 * Builds the image compression policy for one image-tool call.
 *
 * Each candidate model is resolved and its `mediaInput.image` metadata collected.
 * A candidate that fails to resolve, or has no image metadata, contributes an
 * empty entry, so the list stays aligned with the candidates. `quality` is
 * included only when `agents.defaults.imageQuality` is set, and `models` only
 * when at least one candidate exists.
 */
async function resolveImageCompressionPolicy(params: {
  cfg?: OpenClawConfig;
  imageModelConfig?: ImageModelConfig | null;
  modelOverride?: string;
  imageCount: number;
  agentDir?: string;
  workspaceDir?: string;
}): Promise<ImageCompressionPolicy> {
  const candidates = resolveCompressionModelCandidates(params);
  const quality = params.cfg?.agents?.defaults?.imageQuality;

  const readImageLimits = async (candidate: {
    provider: string;
    model: string;
  }): Promise<ImageCompressionModelPolicy> => {
    try {
      const resolved = await resolveModelAsync(
        candidate.provider,
        candidate.model,
        params.agentDir,
        params.cfg,
        {
          allowBundledStaticCatalogFallback: true,
          skipPiDiscovery: true,
          workspaceDir: params.workspaceDir,
        },
      );
      const runtimeModel = resolved.model as ProviderRuntimeModel | undefined;
      return runtimeModel?.mediaInput?.image ?? {};
    } catch {
      // Best effort: a model that cannot be resolved contributes no limits.
      return {};
    }
  };

  const models: ImageCompressionModelPolicy[] = await Promise.all(
    candidates.map((candidate) => readImageLimits(candidate)),
  );
  const modelsPart = models.length > 0 ? { models } : {};
  const qualityPart = quality ? { quality } : {};
  return {
    imageCount: params.imageCount,
    ...modelsPart,
    ...qualityPart,
  };
}
function matchesImageTimeoutEntry(params: {
entry: MediaUnderstandingModelConfig;
source: "capability" | "shared";
@@ -574,6 +648,31 @@ export function createImageTool(options?: {
);
const maxBytesMb = typeof record.maxBytesMb === "number" ? record.maxBytesMb : undefined;
const maxBytes = pickMaxBytes(options?.config, maxBytesMb);
const imageModelConfig =
resolvedImageModelConfig ??
resolveImageModelConfigForOverride({
cfg: options?.config,
modelOverride,
}) ??
resolveImageModelConfigForTool({
cfg: options?.config,
agentDir,
workspaceDir: options?.workspaceDir,
authStore: options?.authProfileStore,
});
if (!imageModelConfig) {
throw new Error(
"No image model is configured. Set agents.defaults.imageModel or configure an image-capable provider.",
);
}
const imageCompression = await resolveImageCompressionPolicy({
cfg: options?.config,
imageModelConfig,
modelOverride,
imageCount: imageInputs.length,
agentDir,
workspaceDir: options?.workspaceDir,
});
const sandboxConfig: SandboxedBridgeMediaPathConfig | null =
options?.sandbox && options?.sandbox.root.trim()
@@ -672,26 +771,39 @@ export function createImageTool(options?: {
);
const media = isDataUrl
? decodeDataUrl(resolvedImage, { maxBytes })
? await (async () => {
const decoded = decodeDataUrl(resolvedImage, { maxBytes });
return await optimizeImageBufferForWebMedia({
buffer: decoded.buffer,
contentType: decoded.mimeType,
maxBytes,
imageCompression,
});
})()
: sandboxConfig
? await loadWebMedia(resolvedPath ?? resolvedImage, {
maxBytes,
sandboxValidated: true,
readFile: createSandboxBridgeReadFile({ sandbox: sandboxConfig }),
imageCompression,
})
: await loadWebMedia(resolvedPath ?? resolvedImage, {
maxBytes,
localRoots: mediaLocalRoots,
ssrfPolicy: remoteMediaSsrfPolicy,
imageCompression,
});
if (media.kind !== "image") {
throw new Error(`Unsupported media type: ${media.kind}`);
}
const mimeType =
("contentType" in media && media.contentType) ||
("mimeType" in media && media.mimeType) ||
"image/png";
const contentType =
"contentType" in media && typeof media.contentType === "string"
? media.contentType
: undefined;
const legacyMimeType =
"mimeType" in media && typeof media.mimeType === "string" ? media.mimeType : undefined;
const mimeType = contentType ?? legacyMimeType ?? "image/png";
loadedImages.push({
buffer: media.buffer,
mimeType,
@@ -703,23 +815,6 @@ export function createImageTool(options?: {
}
// MARK: - Run image prompt with all loaded images
const imageModelConfig =
resolvedImageModelConfig ??
resolveImageModelConfigForOverride({
cfg: options?.config,
modelOverride,
}) ??
resolveImageModelConfigForTool({
cfg: options?.config,
agentDir,
workspaceDir: options?.workspaceDir,
authStore: options?.authProfileStore,
});
if (!imageModelConfig) {
throw new Error(
"No image model is configured. Set agents.defaults.imageModel or configure an image-capable provider.",
);
}
const result = await runImagePrompt({
cfg: options?.config,
agentDir,

View File

@@ -1052,6 +1052,20 @@ export const FIELD_HELP: Record<string, string> = {
"Optional low-level agent runtime policy for this specific model. Model runtime policy overrides the provider runtime policy.",
"models.providers.*.models[].agentRuntime.id":
'Model agent runtime id: "pi", "auto", a registered plugin harness id such as "codex", or a supported CLI backend alias such as "claude-cli".',
"models.providers.*.models[].mediaInput":
"Optional model media capability metadata used by tools to choose conservative image compression defaults.",
"models.providers.*.models[].mediaInput.image":
"Optional image input limits for this model, such as maximum side length, maximum pixels, and preferred compression side.",
"models.providers.*.models[].mediaInput.image.maxBytes":
"Maximum encoded image payload size accepted by the provider for this model.",
"models.providers.*.models[].mediaInput.image.maxPixels":
"Maximum image pixel count accepted by the provider for this model.",
"models.providers.*.models[].mediaInput.image.maxSidePx":
"Maximum image width or height accepted by the provider for this model.",
"models.providers.*.models[].mediaInput.image.preferredSidePx":
"Preferred image resize side for balanced compression. Leave unset to use OpenClaw's conservative default.",
"models.providers.*.models[].mediaInput.image.tokenMode":
'Provider image token accounting style: "tile", "detail", or "provider".',
auth: "Authentication profile root used for multi-profile provider credentials and cooldown-based failover ordering. Keep profiles minimal and explicit so automatic failover behavior stays auditable.",
"channels.googlechat.botLoopProtection":
"Sliding-window guard for accepted Google Chat bot-to-bot loops. Defaults to the shared bot loop protection budget when allowBots lets bot-authored messages reach dispatch.",
@@ -1431,6 +1445,8 @@ export const FIELD_HELP: Record<string, string> = {
"Maximum number of PDF pages to process for the PDF tool (default: 20).",
"agents.defaults.imageMaxDimensionPx":
"Max image side length in pixels when sanitizing transcript/tool-result image payloads (default: 1200).",
"agents.defaults.imageQuality":
'Image-tool media compression preference: "auto" adapts to provider/model limits and image count, "efficient" saves tokens and bytes, "balanced" keeps the current middle ground, and "high" preserves more detail for screenshots and document images.',
"agents.defaults.cliBackends": "Optional CLI backends for text-only fallback (claude-cli, etc.).",
"agents.defaults.compaction":
"Compaction tuning for when context nears token limits, including history share, reserve headroom, and pre-compaction memory flush behavior. Use this when long-running sessions need stable continuity under tight context windows.",

View File

@@ -626,6 +626,13 @@ export const FIELD_LABELS: Record<string, string> = {
"models.providers.*.models": "Model Provider Model List",
"models.providers.*.models[].agentRuntime": "Model Runtime",
"models.providers.*.models[].agentRuntime.id": "Model Runtime ID",
"models.providers.*.models[].mediaInput": "Model Media Input",
"models.providers.*.models[].mediaInput.image": "Model Image Input",
"models.providers.*.models[].mediaInput.image.maxBytes": "Model Image Max Bytes",
"models.providers.*.models[].mediaInput.image.maxPixels": "Model Image Max Pixels",
"models.providers.*.models[].mediaInput.image.maxSidePx": "Model Image Max Side",
"models.providers.*.models[].mediaInput.image.preferredSidePx": "Model Image Preferred Side",
"models.providers.*.models[].mediaInput.image.tokenMode": "Model Image Token Mode",
"auth.cooldowns.billingBackoffHours": "Billing Backoff (hours)",
"auth.cooldowns.billingBackoffHoursByProvider": "Billing Backoff Overrides",
"auth.cooldowns.billingMaxHours": "Billing Backoff Cap (hours)",
@@ -656,6 +663,7 @@ export const FIELD_LABELS: Record<string, string> = {
"agents.defaults.pdfMaxBytesMb": "PDF Max Size (MB)",
"agents.defaults.pdfMaxPages": "PDF Max Pages",
"agents.defaults.imageMaxDimensionPx": "Image Max Dimension (px)",
"agents.defaults.imageQuality": "Image Quality",
"agents.defaults.humanDelay.mode": "Human Delay Mode",
"agents.defaults.humanDelay.minMs": "Human Delay Min (ms)",
"agents.defaults.humanDelay.maxMs": "Human Delay Max (ms)",

View File

@@ -18,6 +18,7 @@ export type AgentContextInjection = "always" | "continuation-skip" | "never";
export type OptionalBootstrapFileName = "SOUL.md" | "USER.md" | "HEARTBEAT.md" | "IDENTITY.md";
export type EmbeddedPiExecutionContract = "default" | "strict-agentic";
export type SubagentDelegationMode = "suggest" | "prefer";
/**
 * Image-tool media compression preference accepted by `agents.defaults.imageQuality`.
 * Per the config help text: "auto" adapts to provider/model limits and image count,
 * "efficient" saves tokens and bytes, "balanced" is the middle ground, and "high"
 * preserves more detail.
 */
export type AgentImageQualityPreference = "auto" | "efficient" | "balanced" | "high";
export type Gpt5PromptOverlayConfig = {
/** Friendly interaction-style layer for GPT-5-family models (default: friendly). */
@@ -365,6 +366,11 @@ export type AgentDefaultsConfig = {
* Default: 1200.
*/
imageMaxDimensionPx?: number;
/**
* Image compression/detail preference for image-tool media loading.
* Default: auto, which adapts to provider/model limits and image count.
*/
imageQuality?: AgentImageQualityPreference;
typingIntervalSeconds?: number;
/** Typing indicator start mode (never|instant|thinking|message). */
typingMode?: TypingMode;

View File

@@ -89,6 +89,23 @@ export type ModelCompatConfig = SupportedOpenAICompatFields &
requiresOpenAiAnthropicToolPayload?: boolean;
};
/**
 * Optional image input limits for a single model. Every field is optional;
 * a model may declare any subset of them.
 */
export type ModelImageInputConfig = {
/** Provider-documented maximum encoded image payload size, in bytes. */
maxBytes?: number;
/** Provider-documented maximum accepted input pixel count. */
maxPixels?: number;
/** Provider-documented maximum accepted width/height in pixels. */
maxSidePx?: number;
/** Preferred resize side, in pixels, for the default balanced compression policy. */
preferredSidePx?: number;
/** Token accounting style, used as documentation for provider-owned policy. */
tokenMode?: "tile" | "detail" | "provider";
};
/** Per-model media input metadata; currently only image limits are described. */
export type ModelMediaInputConfig = {
/** Image input limits for this model. */
image?: ModelImageInputConfig;
};
export type ModelProviderAuthMode = "api-key" | "aws-sdk" | "oauth" | "token";
export type ModelProviderLocalServiceConfig = {
@@ -140,6 +157,7 @@ export type ModelDefinitionConfig = {
agentRuntime?: AgentRuntimePolicyConfig;
headers?: Record<string, string>;
compat?: ModelCompatConfig;
mediaInput?: ModelMediaInputConfig;
metadataSource?: "models-add";
};

View File

@@ -246,6 +246,7 @@ export const AgentDefaultsSchema = z
timeoutSeconds: z.number().int().positive().optional(),
mediaMaxMb: z.number().positive().optional(),
imageMaxDimensionPx: z.number().int().positive().optional(),
imageQuality: z.enum(["auto", "efficient", "balanced", "high"]).optional(),
typingIntervalSeconds: z.number().int().positive().optional(),
typingMode: TypingModeSchema.optional(),
heartbeat: HeartbeatSchema,

View File

@@ -307,6 +307,22 @@ const ModelAgentRuntimePolicySchema = z
.strict()
.optional();
// Validates a model's `mediaInput.image` block: each numeric limit, when present,
// must be a positive integer; `tokenMode` must be one of the three literals; and
// `.strict()` rejects unknown keys.
const ModelImageInputSchema = z
.object({
maxBytes: z.number().int().positive().optional(),
maxPixels: z.number().int().positive().optional(),
maxSidePx: z.number().int().positive().optional(),
preferredSidePx: z.number().int().positive().optional(),
tokenMode: z.union([z.literal("tile"), z.literal("detail"), z.literal("provider")]).optional(),
})
.strict();
// Validates a model's `mediaInput` block: an optional `image` entry and no other
// keys (`.strict()`).
const ModelMediaInputSchema = z
.object({
image: ModelImageInputSchema.optional(),
})
.strict();
const ModelDefinitionSchema = z
.object({
id: z.string().min(1),
@@ -348,6 +364,7 @@ const ModelDefinitionSchema = z
agentRuntime: ModelAgentRuntimePolicySchema,
headers: z.record(z.string(), z.string()).optional(),
compat: ModelCompatSchema,
mediaInput: ModelMediaInputSchema.optional(),
metadataSource: z.literal("models-add").optional(),
})
.strict();

View File

@@ -13,7 +13,8 @@ import {
} from "./isolated-agent.test-harness.js";
import { setupIsolatedAgentTurnMocks } from "./isolated-agent.test-setup.js";
vi.mock("../plugins/provider-runtime.js", () => ({
vi.mock("../plugins/provider-runtime.js", async (importOriginal) => ({
...(await importOriginal<typeof import("../plugins/provider-runtime.js")>()),
resolveExternalAuthProfilesWithPlugins: () => [],
}));

View File

@@ -536,7 +536,7 @@ function readJpegMetadata(buffer: Buffer): ImageMetadata | null {
return null;
}
function readImageMetadataFromHeader(buffer: Buffer): ImageMetadata | null {
export function readImageMetadataFromHeader(buffer: Buffer): ImageMetadata | null {
return (
readPngMetadata(buffer) ??
readGifMetadata(buffer) ??
@@ -1235,6 +1235,7 @@ export async function resizeToPng(params: ResizeToPngParams): Promise<Buffer> {
export async function optimizeImageToPng(
buffer: Buffer,
maxBytes: number,
options?: { sides?: readonly number[] },
): Promise<{
buffer: Buffer;
optimizedSize: number;
@@ -1243,7 +1244,7 @@ export async function optimizeImageToPng(
}> {
// Try a grid of sizes/compression levels until under the limit.
// PNG uses compression levels 0-9 (higher = smaller but slower).
const sides = [2048, 1536, 1280, 1024, 800];
const sides = options?.sides?.length ? [...options.sides] : [2048, 1536, 1280, 1024, 800];
const compressionLevels = [6, 7, 8, 9];
let smallest: {
buffer: Buffer;

View File

@@ -7,11 +7,15 @@ import { resolveStateDir } from "../config/paths.js";
import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
import { createEmptyPluginRegistry } from "../plugins/registry-empty.js";
import { resetPluginRuntimeStateForTest, setActivePluginRegistry } from "../plugins/runtime.js";
import { resizeToJpeg } from "./media-services.js";
import { encodePngRgba, fillPixel } from "./png-encode.js";
let effectiveImageBytesCap: typeof import("./web-media.js").effectiveImageBytesCap;
let LocalMediaAccessError: typeof import("./web-media.js").LocalMediaAccessError;
let loadWebMedia: typeof import("./web-media.js").loadWebMedia;
let loadWebMediaRaw: typeof import("./web-media.js").loadWebMediaRaw;
let optimizeImageToJpeg: typeof import("./web-media.js").optimizeImageToJpeg;
let resolveImageCompressionGrid: typeof import("./web-media.js").resolveImageCompressionGrid;
const TINY_PNG_BASE64 =
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII=";
@@ -40,8 +44,14 @@ function installCanvasMediaResolver() {
}
beforeAll(async () => {
({ LocalMediaAccessError, loadWebMedia, loadWebMediaRaw, optimizeImageToJpeg } =
await import("./web-media.js"));
({
effectiveImageBytesCap,
LocalMediaAccessError,
loadWebMedia,
loadWebMediaRaw,
optimizeImageToJpeg,
resolveImageCompressionGrid,
} = await import("./web-media.js"));
fixtureRoot = await fs.mkdtemp(path.join(resolvePreferredOpenClawTmpDir(), "web-media-core-"));
tinyPngFile = path.join(fixtureRoot, "tiny.png");
await fs.writeFile(tinyPngFile, Buffer.from(TINY_PNG_BASE64, "base64"));
@@ -77,6 +87,83 @@ afterAll(async () => {
});
describe("loadWebMedia", () => {
/**
 * Encodes a size x size PNG with a red-ish square over the central half of
 * each axis and a blue-ish surround.
 */
function createLargeColorBlockPng(size: number): Buffer {
  const rgba = Buffer.alloc(size * size * 4, 255);
  const bandStart = Math.floor(size * 0.25);
  const bandEnd = Math.floor(size * 0.75);
  const withinBand = (value: number): boolean => value >= bandStart && value < bandEnd;
  for (let row = 0; row < size; row += 1) {
    for (let col = 0; col < size; col += 1) {
      if (withinBand(col) && withinBand(row)) {
        fillPixel(rgba, col, row, size, 230, 40, 35);
      } else {
        fillPixel(rgba, col, row, size, 30, 110, 220);
      }
    }
  }
  return encodePngRgba(rgba, size, size);
}
/**
 * Encodes a size x size PNG like the opaque color-block fixture, but with
 * alpha 255 in the central square and alpha 96 in the surround.
 */
function createLargeTransparentColorBlockPng(size: number): Buffer {
  const rgba = Buffer.alloc(size * size * 4, 0);
  const bandStart = Math.floor(size * 0.25);
  const bandEnd = Math.floor(size * 0.75);
  const withinBand = (value: number): boolean => value >= bandStart && value < bandEnd;
  for (let row = 0; row < size; row += 1) {
    for (let col = 0; col < size; col += 1) {
      if (withinBand(col) && withinBand(row)) {
        fillPixel(rgba, col, row, size, 230, 40, 35, 255);
      } else {
        fillPixel(rgba, col, row, size, 30, 110, 220, 96);
      }
    }
  }
  return encodePngRgba(rgba, size, size);
}
/**
 * Reads width and height from a PNG whose first chunk is IHDR.
 *
 * @throws Error("PNG dimensions not found") when the buffer is shorter than 24
 *   bytes or bytes 12-15 are not the ASCII tag "IHDR".
 */
function readPngDimensions(buffer: Buffer): { width: number; height: number } {
  const hasHeaderChunk = buffer.length >= 24 && buffer.toString("ascii", 12, 16) === "IHDR";
  if (!hasHeaderChunk) {
    throw new Error("PNG dimensions not found");
  }
  const width = buffer.readUInt32BE(16);
  const height = buffer.readUInt32BE(20);
  return { width, height };
}
/**
 * Builds a 10-byte buffer: the ASCII signature "GIF89a" followed by the width
 * and height as little-endian 16-bit values.
 */
function createGifHeader(width: number, height: number): Buffer {
  const signature = Buffer.from("GIF89a", "ascii");
  const screenSize = Buffer.alloc(4);
  screenSize.writeUInt16LE(width, 0);
  screenSize.writeUInt16LE(height, 2);
  return Buffer.concat([signature, screenSize]);
}
/**
 * Reads the pixel dimensions from the first start-of-frame (SOF) segment of a JPEG.
 *
 * Walks the marker segments that follow the 2-byte SOI marker. Segments with a
 * length field are skipped by that length; the first SOFn marker (0xC0-0xCF,
 * excluding DHT 0xC4, JPG 0xC8 and DAC 0xCC) yields the frame height and width.
 *
 * @param buffer Encoded JPEG bytes.
 * @returns The frame width and height in pixels.
 * @throws Error("JPEG dimensions not found") when no SOF segment is located.
 */
function readJpegDimensions(buffer: Buffer): { width: number; height: number } {
  // Start right after the SOI marker (0xFFD8).
  let offset = 2;
  // Marker (2) + length (2) + precision (1) + height (2) + width (2) must fit.
  while (offset + 9 < buffer.length) {
    if (buffer[offset] !== 0xff) {
      offset += 1;
      continue;
    }
    const marker = buffer[offset + 1];
    if (marker === 0xff) {
      // Fill byte: a marker may be preceded by extra 0xFF bytes. Advance one
      // byte so the next iteration re-reads the real marker code instead of
      // treating 0xFF as a marker with a bogus segment length.
      offset += 1;
      continue;
    }
    offset += 2;
    // Standalone markers carry no length field: TEM, SOI, EOI and RST0-RST7.
    if (
      marker === 0x01 ||
      marker === 0xd8 ||
      marker === 0xd9 ||
      (marker >= 0xd0 && marker <= 0xd7)
    ) {
      continue;
    }
    const segmentLength = buffer.readUInt16BE(offset);
    if (marker >= 0xc0 && marker <= 0xcf && ![0xc4, 0xc8, 0xcc].includes(marker)) {
      // Frame header layout after the length: precision (1), height (2), width (2).
      return {
        height: buffer.readUInt16BE(offset + 3),
        width: buffer.readUInt16BE(offset + 5),
      };
    }
    offset += segmentLength;
  }
  throw new Error("JPEG dimensions not found");
}
function makeStallingFetch(firstChunk: Uint8Array) {
return vi.fn(
async () =>
@@ -282,9 +369,82 @@ describe("loadWebMedia", () => {
);
});
it("uses model metadata-aware image compression grids", () => {
  // First (largest) side the grid will try for a given policy.
  const largestSide = (policy: Parameters<typeof resolveImageCompressionGrid>[0]) =>
    resolveImageCompressionGrid(policy).sides[0];

  // "high" quality starts at the model's maximum side.
  expect(
    largestSide({ models: [{ maxSidePx: 2576, preferredSidePx: 2576 }], quality: "high" }),
  ).toBe(2576);
  expect(
    largestSide({ models: [{ maxSidePx: 1568, preferredSidePx: 1568 }], quality: "high" }),
  ).toBe(1568);
  expect(
    largestSide({ models: [{ maxSidePx: 6000, preferredSidePx: 2048 }], quality: "high" }),
  ).toBe(6000);
  // "balanced" starts at the preferred side instead.
  expect(
    largestSide({ models: [{ maxSidePx: 6000, preferredSidePx: 2048 }], quality: "balanced" }),
  ).toBe(2048);
  // Pixel budgets cap the side: 3584 * 3584 === 12845056 and 5760 * 5760 === 33177600.
  expect(
    largestSide({
      models: [{ maxSidePx: 6000, maxPixels: 12845056, preferredSidePx: 2048 }],
      quality: "high",
    }),
  ).toBe(3584);
  expect(
    largestSide({ models: [{ maxPixels: 33177600, preferredSidePx: 2048 }], quality: "high" }),
  ).toBe(5760);
  // With several candidate models the smaller maximum side is used.
  expect(
    largestSide({
      models: [
        { maxSidePx: 6000, preferredSidePx: 2048 },
        { maxSidePx: 1568, preferredSidePx: 1568 },
      ],
      quality: "high",
    }),
  ).toBe(1568);

  const smallModelGrid = resolveImageCompressionGrid({
    models: [{ maxSidePx: 512, preferredSidePx: 512, maxBytes: 64 * 1024 }],
    quality: "balanced",
  });
  expect(smallModelGrid.sides).toEqual([512, 384, 256, 192, 128]);
});
it("adapts automatic image compression for many-image turns", () => {
  // Same model limits and "auto" quality; only the image count varies.
  const autoGridFor = (imageCount: number) =>
    resolveImageCompressionGrid({
      models: [{ maxSidePx: 2576, preferredSidePx: 2576 }],
      quality: "auto",
      imageCount,
    });
  const single = autoGridFor(1);
  const many = autoGridFor(8);

  expect(single.sides[0]).toBe(2576);
  expect(single.qualities).toEqual([80, 70, 60, 50, 40]);
  expect(many.sides[0]).toBe(1280);
  expect(many.qualities).toEqual([70, 60, 50, 40]);
});
async function withUnavailableImageOptimizer<T>(fn: () => Promise<T>): Promise<T> {
vi.resetModules();
vi.doMock("./media-services.js", () => ({
vi.doMock("./media-services.js", async (importOriginal) => ({
...(await importOriginal<typeof import("./media-services.js")>()),
convertHeicToJpeg: vi.fn(async (buffer: Buffer) => buffer),
hasAlphaChannel: vi.fn(async () => {
throw new Error(
@@ -335,6 +495,155 @@ describe("loadWebMedia", () => {
});
});
it("sends an in-limit data URL image when optional sharp optimization is unavailable", async () => {
  await withUnavailableImageOptimizer(async () => {
    const { optimizeImageBufferForWebMedia } = await import("./web-media.js");
    const tinyPng = Buffer.from(TINY_PNG_BASE64, "base64");
    const result = await optimizeImageBufferForWebMedia({
      buffer: tinyPng,
      contentType: "image/png",
      maxBytes: 1024,
      imageCompression: { models: [{ maxSidePx: 1024 }] },
    });

    // The input already fits both limits, so it is expected back unchanged.
    expect(result.kind).toBe("image");
    expect(result.contentType).toBe("image/png");
    expect(result.buffer.equals(tinyPng)).toBe(true);
  });
});
it("does not bypass the data URL image cap when optional sharp optimization is unavailable", async () => {
  await withUnavailableImageOptimizer(async () => {
    const { optimizeImageBufferForWebMedia } = await import("./web-media.js");
    // maxBytes of 8 is set below the fixture size, so optimization is required.
    const pending = optimizeImageBufferForWebMedia({
      buffer: Buffer.from(TINY_PNG_BASE64, "base64"),
      contentType: "image/png",
      maxBytes: 8,
      imageCompression: { models: [{ maxSidePx: 1024 }] },
    });
    await expect(pending).rejects.toThrow(/Optional dependency sharp is required/);
  });
});
it("does not bypass model dimensions when optional sharp optimization is unavailable", async () => {
  await withUnavailableImageOptimizer(async () => {
    const { optimizeImageBufferForWebMedia } = await import("./web-media.js");
    // A 1600px source against a 512px model side limit needs a resize.
    const oversized = createLargeColorBlockPng(1600);
    const pending = optimizeImageBufferForWebMedia({
      buffer: oversized,
      contentType: "image/png",
      maxBytes: 16 * 1024 * 1024,
      imageCompression: { models: [{ maxSidePx: 512 }] },
    });
    await expect(pending).rejects.toThrow(/Optional dependency sharp is required/);
  });
});
it("preserves in-limit GIF buffers when optimizing direct image buffers", async () => {
  const { optimizeImageBufferForWebMedia } = await import("./web-media.js");
  // 16x16 header, inside the 64px model side limit.
  const gif = createGifHeader(16, 16);
  const result = await optimizeImageBufferForWebMedia({
    buffer: gif,
    contentType: "image/gif",
    maxBytes: 1024,
    imageCompression: { models: [{ maxSidePx: 64 }] },
  });

  expect(result.kind).toBe("image");
  expect(result.contentType).toBe("image/gif");
  expect(result.buffer.equals(gif)).toBe(true);
});
it("does not bypass model dimensions for GIF buffers", async () => {
  const { optimizeImageBufferForWebMedia } = await import("./web-media.js");
  // 1600x1600 header against a 512px model side limit.
  const oversizedGif = createGifHeader(1600, 1600);
  const pending = optimizeImageBufferForWebMedia({
    buffer: oversizedGif,
    contentType: "image/gif",
    maxBytes: 1024,
    imageCompression: { models: [{ maxSidePx: 512 }] },
  });
  await expect(pending).rejects.toThrow(/dimensions exceed model image limits/i);
});
// The model-level maxBytes (8) is tighter than the caller's maxBytes (1 MiB);
// the effective image cap is the smaller of the two, so loading is expected to
// reject with a size error.
it("applies model image maxBytes to the effective image cap", async () => {
await expect(
loadWebMediaRaw(tinyPngFile, {
maxBytes: 1024 * 1024,
localRoots: [fixtureRoot],
imageCompression: {
models: [{ maxBytes: 8 }],
},
}),
).rejects.toThrow(/exceeds/i);
});
// effectiveImageBytesCap takes the minimum of the base cap and every model's
// maxBytes; models without a maxBytes are ignored, and an undefined base cap
// falls back to the policy cap alone.
it("uses the strictest model image maxBytes across fallback candidates", () => {
expect(
effectiveImageBytesCap(16 * 1024 * 1024, {
models: [{ maxBytes: 8 * 1024 * 1024 }, {}, { maxBytes: 2 * 1024 * 1024 }],
}),
).toBe(2 * 1024 * 1024);
expect(effectiveImageBytesCap(undefined, { models: [{ maxBytes: 1024 }] })).toBe(1024);
});
// End-to-end check for opaque images: a 1600px JPEG written to the fixture root
// must come back as a JPEG whose long side is at most the model's maxSidePx
// (512), even with quality "high" and a byte cap that would not force resizing.
it("downscales oversized JPEGs to the resolved model side limit before returning media", async () => {
const sourcePng = createLargeColorBlockPng(1600);
const sourceJpeg = await resizeToJpeg({
buffer: sourcePng,
maxSide: 1600,
quality: 92,
withoutEnlargement: true,
});
// Sanity check that the fixture really is 1600px before loading it.
expect(Math.max(...Object.values(readJpegDimensions(sourceJpeg)))).toBe(1600);
const largeImage = path.join(fixtureRoot, "large-center-red.jpg");
await fs.writeFile(largeImage, sourceJpeg);
const result = await loadWebMedia(largeImage, {
maxBytes: 16 * 1024 * 1024,
localRoots: [fixtureRoot],
imageCompression: {
quality: "high",
models: [{ maxSidePx: 512, preferredSidePx: 512 }],
},
});
expect(result.kind).toBe("image");
expect(result.contentType).toBe("image/jpeg");
const dimensions = readJpegDimensions(result.buffer);
expect(Math.max(dimensions.width, dimensions.height)).toBeLessThanOrEqual(512);
});
// Same as the JPEG case but for a PNG with transparency: the result must stay
// PNG (so alpha is not flattened) while still being resized to the model's
// 512px side limit.
it("downscales alpha PNGs to the resolved model side limit before returning media", async () => {
const sourcePng = createLargeTransparentColorBlockPng(1600);
// Sanity check that the fixture really is 1600px before loading it.
expect(Math.max(...Object.values(readPngDimensions(sourcePng)))).toBe(1600);
const largeImage = path.join(fixtureRoot, "large-transparent.png");
await fs.writeFile(largeImage, sourcePng);
const result = await loadWebMedia(largeImage, {
maxBytes: 16 * 1024 * 1024,
localRoots: [fixtureRoot],
imageCompression: {
quality: "high",
models: [{ maxSidePx: 512, preferredSidePx: 512 }],
},
});
expect(result.kind).toBe("image");
expect(result.contentType).toBe("image/png");
const dimensions = readPngDimensions(result.buffer);
expect(Math.max(dimensions.width, dimensions.height)).toBeLessThanOrEqual(512);
});
// With an empty model entry there are no hard or preferred side limits, so the
// grid falls back to the default vision max side: even at quality "high" the
// first (largest) ladder step is 2048 rather than one of the larger "high" steps.
// NOTE(review): the callback is async but awaits nothing; the assertion is synchronous.
it("uses low default dimensions when model metadata is unavailable", async () => {
expect(
resolveImageCompressionGrid({
quality: "high",
models: [{}],
}).sides[0],
).toBe(2048);
});
it("does not send original HEIC media when optional sharp conversion is unavailable", async () => {
await withUnavailableImageOptimizer(async () => {
const heicFile = path.join(fixtureRoot, "photo.heic");

View File

@@ -21,6 +21,7 @@ import {
hasAlphaChannel,
isImageProcessorUnavailableError,
optimizeImageToPng,
readImageMetadataFromHeader,
resizeToJpeg,
} from "./media-services.js";
import {
@@ -45,6 +46,7 @@ export type WebMediaResult = {
type WebMediaOptions = {
maxBytes?: number;
optimizeImages?: boolean;
imageCompression?: ImageCompressionPolicy;
ssrfPolicy?: SsrFPolicy;
proxyUrl?: string;
fetchImpl?: (input: RequestInfo | URL, init?: RequestInit) => Promise<Response>;
@@ -61,6 +63,21 @@ type WebMediaOptions = {
hostReadCapability?: boolean;
};
/**
 * Caller-facing quality preference. "auto" is resolved to a concrete level
 * based on how many images are being sent.
 */
export type ImageQualityPreference = "auto" | "efficient" | "balanced" | "high";
/** Image input limits for a single candidate model. All fields are optional. */
export type ImageCompressionModelPolicy = {
// Byte limit for one image; the smallest value across models tightens the effective cap.
maxBytes?: number;
// Hard limit on total pixels (width * height).
maxPixels?: number;
// Hard limit on the longer image side, in pixels.
maxSidePx?: number;
// Preferred longer side; used as the target for non-"high" qualities, never above the hard limit.
preferredSidePx?: number;
};
/** Compression policy applied when loading or optimizing an image. */
export type ImageCompressionPolicy = {
// Requested quality level; missing or unrecognized values behave like "auto".
quality?: ImageQualityPreference;
// Candidate models; the strictest limit across all entries wins.
models?: ImageCompressionModelPolicy[];
// Number of images in the request; with "auto", 6 or more selects "efficient".
imageCount?: number;
};
async function resolveMediaStoreUriToPath(mediaUrl: string): Promise<string | null> {
if (!/^media:\/\//i.test(mediaUrl)) {
return null;
@@ -335,6 +352,200 @@ type OptimizedImage = {
compressionLevel?: number;
};
// Side ladder (px, longest side) tried for the "balanced" quality level.
const DEFAULT_JPEG_SIDES = [2048, 1536, 1280, 1024, 800] as const;
// JPEG quality steps tried for the "balanced" quality level.
const DEFAULT_JPEG_QUALITIES = [80, 70, 60, 50, 40] as const;
// Max side assumed when a model declares no side/pixel limits; also caps the derived preferred side.
const DEFAULT_VISION_MAX_SIDE = 2048;
// Small sides appended to every ladder so compression can keep shrinking below the named steps.
const LOW_IMAGE_SIDE_FALLBACKS = [640, 512, 384, 256, 192, 128] as const;
/**
 * Coerces an arbitrary string into a known quality preference.
 *
 * @param value - Raw preference, possibly missing or unrecognized.
 * @returns The value itself when it is "efficient", "balanced" or "high";
 *   otherwise "auto" (including for an explicit "auto").
 */
function normalizeImageQualityPreference(value?: string): ImageQualityPreference {
  if (value === "efficient" || value === "balanced" || value === "high") {
    return value;
  }
  return "auto";
}
/**
 * Converts a total-pixel budget into the side length of the largest square
 * that fits inside it.
 *
 * @param pixelBudget - Maximum number of pixels allowed.
 * @returns The square root of the budget, rounded down to a whole pixel.
 */
function squareLongSideForPixelBudget(pixelBudget: number): number {
  const exactSide = Math.sqrt(pixelBudget);
  return Math.floor(exactSide);
}
/**
 * Normalizes an optional numeric limit to a strictly positive integer.
 *
 * @param value - Candidate limit; may be missing, fractional, or non-finite.
 * @returns The value rounded down, or `undefined` when the input is not a
 *   finite number or rounds down to less than 1.
 */
function positiveInteger(value: number | undefined): number | undefined {
  if (typeof value !== "number" || !Number.isFinite(value)) {
    return undefined;
  }
  // Floor before the positivity check: values in (0, 1) would otherwise floor
  // to 0 and be treated as a hard limit of zero instead of "not configured".
  const floored = Math.floor(value);
  return floored > 0 ? floored : undefined;
}
/**
 * Resolves the policy's quality preference to a concrete level.
 *
 * An explicit "efficient", "balanced" or "high" is returned as-is. For "auto"
 * (or a missing/unknown value) the level depends on the image count: six or
 * more images select "efficient", anything else "balanced".
 *
 * @param policy - Optional compression policy.
 * @returns A concrete (non-"auto") quality level.
 */
function effectiveImageQualityPreference(
  policy?: ImageCompressionPolicy,
): Exclude<ImageQualityPreference, "auto"> {
  const requested = normalizeImageQualityPreference(policy?.quality);
  if (requested === "auto") {
    // A missing count is treated as a single image.
    const count = Math.max(1, Math.floor(policy?.imageCount ?? 1));
    return count >= 6 ? "efficient" : "balanced";
  }
  return requested;
}
/**
 * Computes the largest allowed long side for one model.
 *
 * Hard limits come from `maxSidePx` and from `maxPixels` (converted to the
 * side of an equivalent square); the smaller one wins. Without any hard limit
 * the model's preferred side is used, and failing that the default vision side.
 *
 * @param model - Model limits, or `undefined` when no metadata is available.
 * @returns The maximum long side in pixels.
 */
function maxSideForModel(model: ImageCompressionModelPolicy | undefined): number {
  const sideLimit = positiveInteger(model?.maxSidePx);
  const pixelLimit = positiveInteger(model?.maxPixels);
  const hardLimits: number[] = [];
  if (sideLimit !== undefined) {
    hardLimits.push(sideLimit);
  }
  if (pixelLimit) {
    hardLimits.push(squareLongSideForPixelBudget(pixelLimit));
  }
  if (hardLimits.length === 0) {
    return positiveInteger(model?.preferredSidePx) ?? DEFAULT_VISION_MAX_SIDE;
  }
  return Math.min(...hardLimits);
}
/**
 * Computes the preferred long side for one model.
 *
 * @param model - Model limits, or `undefined` when no metadata is available.
 * @returns The model's `preferredSidePx` when it is a positive integer;
 *   otherwise the model's maximum side, capped at the default vision side.
 */
function preferredSideForModel(model: ImageCompressionModelPolicy | undefined): number {
  const declaredPreference = positiveInteger(model?.preferredSidePx);
  if (declaredPreference !== undefined) {
    return declaredPreference;
  }
  return Math.min(maxSideForModel(model), DEFAULT_VISION_MAX_SIDE);
}
/**
 * Reduces all candidate models in a policy to a single pair of side limits.
 *
 * The strictest (smallest) maximum and preferred sides across the models are
 * used, and the preferred side never exceeds the maximum. A policy without
 * models is evaluated as one model with no metadata.
 *
 * @param policy - Optional compression policy.
 * @returns The combined `maxSide` and `preferredSide` in pixels.
 */
function policyModelSides(policy: ImageCompressionPolicy | undefined): {
  maxSide: number;
  preferredSide: number;
} {
  const candidates: Array<ImageCompressionModelPolicy | undefined> =
    policy?.models && policy.models.length > 0 ? policy.models : [undefined];
  let maxSide = Number.POSITIVE_INFINITY;
  let preferredSide = Number.POSITIVE_INFINITY;
  for (const candidate of candidates) {
    maxSide = Math.min(maxSide, maxSideForModel(candidate));
    preferredSide = Math.min(preferredSide, preferredSideForModel(candidate));
  }
  return { maxSide, preferredSide: Math.min(preferredSide, maxSide) };
}
/**
 * Picks the starting long side for a resolved quality level.
 *
 * "high" uses the policy's maximum side, "balanced" the preferred side, and
 * "efficient" the preferred side capped at 1280px. The preferred side is
 * always kept within the maximum side.
 *
 * @param preference - Concrete quality level.
 * @param policy - Optional compression policy supplying the model limits.
 * @returns The starting long side in pixels.
 */
function sideForPreference(
  preference: Exclude<ImageQualityPreference, "auto">,
  policy?: ImageCompressionPolicy,
): number {
  const { maxSide, preferredSide } = policyModelSides(policy);
  if (preference === "high") {
    return maxSide;
  }
  const balancedSide = Math.min(preferredSide, maxSide);
  return preference === "efficient" ? Math.min(balancedSide, 1280) : balancedSide;
}
/**
 * Finds the strictest per-image byte limit declared by the policy's models.
 *
 * @param policy - Optional compression policy.
 * @returns The smallest valid `maxBytes` across all models, or `undefined`
 *   when no model declares one.
 */
function imageMaxBytesForPolicy(policy?: ImageCompressionPolicy): number | undefined {
  const declaredCaps: number[] = [];
  for (const model of policy?.models ?? []) {
    const modelCap = positiveInteger(model.maxBytes);
    if (modelCap !== undefined) {
      declaredCaps.push(modelCap);
    }
  }
  return declaredCaps.length > 0 ? Math.min(...declaredCaps) : undefined;
}
/**
 * Checks an encoded image against the hard dimension limits of a policy.
 *
 * Only `maxSidePx` and `maxPixels` count as hard limits; the strictest value
 * of each across all models applies. When no model declares a hard limit the
 * image always passes. When limits exist but the dimensions cannot be read
 * from the image header, the image is treated as not satisfying them.
 *
 * @param buffer - Encoded image bytes.
 * @param policy - Optional compression policy.
 * @returns `true` when the image is within every declared hard limit.
 */
function imageSatisfiesHardDimensionPolicy(
  buffer: Buffer,
  policy?: ImageCompressionPolicy,
): boolean {
  let sideLimit: number | undefined;
  let pixelLimit: number | undefined;
  for (const model of policy?.models ?? []) {
    const modelSide = positiveInteger(model.maxSidePx);
    if (modelSide !== undefined) {
      sideLimit = sideLimit === undefined ? modelSide : Math.min(sideLimit, modelSide);
    }
    const modelPixels = positiveInteger(model.maxPixels);
    if (modelPixels !== undefined) {
      pixelLimit = pixelLimit === undefined ? modelPixels : Math.min(pixelLimit, modelPixels);
    }
  }
  if (sideLimit === undefined && pixelLimit === undefined) {
    return true;
  }
  const header = readImageMetadataFromHeader(buffer);
  if (!header) {
    return false;
  }
  const longSide = Math.max(header.width, header.height);
  const pixelCount = header.width * header.height;
  const sideOk = sideLimit === undefined || longSide <= sideLimit;
  const pixelsOk = pixelLimit === undefined || pixelCount <= pixelLimit;
  return sideOk && pixelsOk;
}
/**
 * Throws when an image violates the policy's hard dimension limits.
 *
 * @param buffer - Encoded image bytes.
 * @param policy - Optional compression policy.
 * @throws Error when the image does not satisfy the hard limits; the message
 *   includes the image's width and height when they can be read from its header.
 */
function assertImageSatisfiesHardDimensionPolicy(
  buffer: Buffer,
  policy?: ImageCompressionPolicy,
): void {
  if (!imageSatisfiesHardDimensionPolicy(buffer, policy)) {
    const header = readImageMetadataFromHeader(buffer);
    const detail = header ? `: ${header.width}x${header.height}` : "";
    throw new Error(`Image dimensions exceed model image limits${detail}`);
  }
}
/**
 * Combines a caller-supplied byte cap with the policy's model byte limits.
 *
 * @param baseCap - Caller's cap, if any.
 * @param policy - Optional compression policy.
 * @returns The smaller of the two caps when both exist, whichever one exists
 *   when only one does, or `undefined` when neither is set.
 */
export function effectiveImageBytesCap(
  baseCap: number | undefined,
  policy?: ImageCompressionPolicy,
): number | undefined {
  const policyCap = imageMaxBytesForPolicy(policy);
  if (policyCap === undefined) {
    return baseCap;
  }
  if (baseCap === undefined) {
    return policyCap;
  }
  return Math.min(baseCap, policyCap);
}
/**
 * Builds the list of long sides to try, largest first.
 *
 * The ladder starts at `maxSide`, adds the given steps and the low-side
 * fallbacks, clamps every entry to `maxSide`, and removes duplicates. If that
 * collapses to a single entry (every step was at or above `maxSide`), 75%,
 * 50% and 25% of `maxSide` are used as the lower steps instead.
 *
 * @param maxSide - Upper bound for every ladder entry; floored and raised to at least 1.
 * @param values - Candidate steps for the chosen quality level.
 * @returns Unique positive sides in descending order.
 */
function buildDescendingLadder(maxSide: number, values: readonly number[]): number[] {
  const ceiling = Math.max(1, Math.floor(maxSide));
  // Dedupe, drop non-positive entries, and order largest first.
  const uniqueDescending = (candidates: readonly number[]): number[] =>
    [...new Set(candidates)].filter((side) => side > 0).toSorted((a, b) => b - a);

  const clamped = [ceiling, ...values, ...LOW_IMAGE_SIDE_FALLBACKS].map((side) =>
    Math.min(ceiling, side),
  );
  const ladder = uniqueDescending(clamped);
  if (ladder.length > 1 || ceiling <= 1) {
    return ladder;
  }
  return uniqueDescending([
    ceiling,
    Math.floor(ceiling * 0.75),
    Math.floor(ceiling * 0.5),
    Math.floor(ceiling * 0.25),
  ]);
}
/**
 * Resolves the resize/quality search grid for a compression policy.
 *
 * The quality level (explicit or derived from the image count) selects both
 * the side ladder and the list of JPEG qualities to try; the ladder is always
 * bounded by the side limits of the policy's models.
 *
 * @param policy - Optional compression policy.
 * @returns `sides` in descending order and the `qualities` to try.
 */
export function resolveImageCompressionGrid(policy?: ImageCompressionPolicy): {
  sides: number[];
  qualities: number[];
} {
  const preference = effectiveImageQualityPreference(policy);
  const side = sideForPreference(preference, policy);
  if (preference === "efficient") {
    return {
      sides: buildDescendingLadder(side, [1024, 800]),
      qualities: [70, 60, 50, 40],
    };
  }
  if (preference === "high") {
    return {
      sides: buildDescendingLadder(side, [3072, 2576, 2048, 1800, 1536, 1280, 1024, 800]),
      qualities: [92, 85, 78, 70, 62, 52, 42],
    };
  }
  // "balanced" and any unexpected level share the default grid.
  return {
    sides: buildDescendingLadder(side, [...DEFAULT_JPEG_SIDES]),
    qualities: [...DEFAULT_JPEG_QUALITIES],
  };
}
function logOptimizedImage(params: { originalSize: number; optimized: OptimizedImage }): void {
if (!shouldLogVerbose()) {
return;
@@ -357,13 +568,15 @@ async function optimizeImageWithFallback(params: {
buffer: Buffer;
cap: number;
meta?: { contentType?: string; fileName?: string };
imageCompression?: ImageCompressionPolicy;
}): Promise<OptimizedImage> {
const { buffer, cap, meta } = params;
const isPng = meta?.contentType === "image/png" || meta?.fileName?.toLowerCase().endsWith(".png");
const hasAlpha = isPng && (await hasAlphaChannel(buffer));
if (hasAlpha) {
const optimized = await optimizeImageToPng(buffer, cap);
const grid = resolveImageCompressionGrid(params.imageCompression);
const optimized = await optimizeImageToPng(buffer, cap, { sides: grid.sides });
if (optimized.buffer.length <= cap) {
return { ...optimized, format: "png" };
}
@@ -374,10 +587,79 @@ async function optimizeImageWithFallback(params: {
}
}
const optimized = await optimizeImageToJpeg(buffer, cap, meta);
const optimized = await optimizeImageToJpeg(buffer, cap, {
...meta,
...(params.imageCompression ? { imageCompression: params.imageCompression } : {}),
});
return { ...optimized, format: "jpeg" };
}
/**
 * Optimizes an in-memory image so it fits the byte cap and model limits.
 *
 * GIFs are never re-encoded: they are returned unchanged once they pass the
 * byte cap and the hard dimension check. Other images go through the
 * optimizer. If the optimizer is unavailable, the original is returned only
 * when it is not HEIC, already fits the cap, and satisfies the hard dimension
 * limits; otherwise the optimizer error is rethrown.
 *
 * @param params - Image bytes, optional content type and file name, optional
 *   byte cap (defaults to the per-kind image cap) and compression policy.
 * @returns The resulting image media with its content type and file name.
 * @throws Error when a GIF exceeds the cap or the hard dimension limits, when
 *   optimization fails without an allowed fallback, or when the optimized
 *   image is still larger than the cap.
 */
export async function optimizeImageBufferForWebMedia(params: {
  buffer: Buffer;
  contentType?: string;
  fileName?: string;
  maxBytes?: number;
  imageCompression?: ImageCompressionPolicy;
}): Promise<WebMediaResult> {
  const { buffer, contentType, fileName, imageCompression } = params;
  const baseCap = params.maxBytes ?? maxBytesForKind("image");
  const cap = effectiveImageBytesCap(baseCap, imageCompression) ?? baseCap;
  // Result used whenever the input is passed through untouched.
  const passthrough = (): WebMediaResult => ({
    buffer,
    contentType,
    kind: "image",
    fileName,
  });

  if (contentType === "image/gif") {
    if (buffer.length > cap) {
      throw new Error(formatCapLimit("GIF", cap, buffer.length));
    }
    assertImageSatisfiesHardDimensionPolicy(buffer, imageCompression);
    return passthrough();
  }

  const meta = { contentType, fileName };
  let optimized: OptimizedImage;
  try {
    optimized = await optimizeImageWithFallback({ buffer, cap, meta, imageCompression });
  } catch (err) {
    const canSendOriginal =
      isImageProcessorUnavailableError(err) &&
      !isHeicSource(meta) &&
      buffer.length <= cap &&
      imageSatisfiesHardDimensionPolicy(buffer, imageCompression);
    if (!canSendOriginal) {
      throw err;
    }
    if (shouldLogVerbose()) {
      logVerbose(
        `Image optimizer unavailable; sending original ${formatMb(buffer.length)}MB media without optimization`,
      );
    }
    return passthrough();
  }

  logOptimizedImage({ originalSize: buffer.length, optimized });
  if (optimized.buffer.length > cap) {
    throw new Error(formatCapReduce("Media", cap, optimized.buffer.length));
  }
  // HEIC sources converted to JPEG get a matching file name.
  const renamedForJpeg = optimized.format === "jpeg" && isHeicSource(params);
  return {
    buffer: optimized.buffer,
    contentType: optimized.format === "png" ? "image/png" : "image/jpeg",
    kind: "image",
    fileName: renamedForJpeg ? toJpegFileName(fileName) : fileName,
  };
}
async function loadWebMediaInternal(
mediaUrl: string,
options: WebMediaOptions = {},
@@ -396,6 +678,7 @@ async function loadWebMediaInternal(
sandboxValidated = false,
readFile: readFileOverride,
hostReadCapability = false,
imageCompression,
} = options;
// Strip MEDIA: prefix used by agent tools (e.g. TTS) to tag media paths.
// Be lenient: LLM output may add extra whitespace (e.g. " MEDIA : /tmp/x.png").
@@ -421,12 +704,18 @@ async function loadWebMediaInternal(
const originalSize = buffer.length;
let optimized: OptimizedImage;
try {
optimized = await optimizeImageWithFallback({ buffer, cap, meta });
optimized = await optimizeImageWithFallback({
buffer,
cap,
meta,
...(imageCompression ? { imageCompression } : {}),
});
} catch (err) {
if (
isImageProcessorUnavailableError(err) &&
!isHeicSource(meta ?? {}) &&
buffer.length <= cap
buffer.length <= cap &&
imageSatisfiesHardDimensionPolicy(buffer, imageCompression)
) {
if (shouldLogVerbose()) {
logVerbose(
@@ -472,11 +761,13 @@ async function loadWebMediaInternal(
// Otherwise fall back to per-kind defaults.
const cap = maxBytes !== undefined ? maxBytes : maxBytesForKind(params.kind ?? "document");
if (params.kind === "image") {
const imageCap = effectiveImageBytesCap(cap, imageCompression) ?? cap;
const isGif = params.contentType === "image/gif";
if (isGif || !optimizeImages) {
if (params.buffer.length > cap) {
throw new Error(formatCapLimit(isGif ? "GIF" : "Media", cap, params.buffer.length));
if (params.buffer.length > imageCap) {
throw new Error(formatCapLimit(isGif ? "GIF" : "Media", imageCap, params.buffer.length));
}
assertImageSatisfiesHardDimensionPolicy(params.buffer, imageCompression);
return {
buffer: params.buffer,
contentType: params.contentType,
@@ -485,7 +776,7 @@ async function loadWebMediaInternal(
};
}
return {
...(await optimizeAndClampImage(params.buffer, cap, {
...(await optimizeAndClampImage(params.buffer, imageCap, {
contentType: params.contentType,
fileName: params.fileName,
})),
@@ -643,7 +934,11 @@ export async function loadWebMediaRaw(
export async function optimizeImageToJpeg(
buffer: Buffer,
maxBytes: number,
opts: { contentType?: string; fileName?: string } = {},
opts: {
contentType?: string;
fileName?: string;
imageCompression?: ImageCompressionPolicy;
} = {},
): Promise<{
buffer: Buffer;
optimizedSize: number;
@@ -659,8 +954,7 @@ export async function optimizeImageToJpeg(
throw new Error(`HEIC image conversion failed: ${String(err)}`, { cause: err });
}
}
const sides = [2048, 1536, 1280, 1024, 800];
const qualities = [80, 70, 60, 50, 40];
const { sides, qualities } = resolveImageCompressionGrid(opts.imageCompression);
let smallest: {
buffer: Buffer;
size: number;

View File

@@ -3,6 +3,8 @@ import {
isModelThinkingFormat,
type ModelApi,
type ModelCompatConfig,
type ModelImageInputConfig,
type ModelMediaInputConfig,
} from "../config/types.models.js";
import { isBlockedObjectKey } from "../infra/prototype-keys.js";
import { normalizeOptionalString } from "../shared/string-coerce.js";
@@ -237,6 +239,33 @@ function normalizeModelCatalogStatus(value: unknown): ModelCatalogStatus | undef
return MODEL_CATALOG_STATUSES.has(status) ? (status as ModelCatalogStatus) : undefined;
}
/**
 * Validates a catalog image token mode.
 *
 * @param value - Raw catalog value.
 * @returns "tile", "detail" or "provider" when the normalized string matches
 *   one of them; otherwise `undefined`.
 */
function normalizeModelCatalogImageTokenMode(value: unknown): ModelImageInputConfig["tokenMode"] {
  const candidate = normalizeOptionalString(value) ?? "";
  switch (candidate) {
    case "tile":
    case "detail":
    case "provider":
      return candidate;
    default:
      return undefined;
  }
}
/**
 * Normalizes the `mediaInput` section of a catalog model entry.
 *
 * Only the `image` sub-object is read. Each numeric limit is kept when it
 * normalizes to a positive integer, and the token mode when it is recognized.
 *
 * @param value - Raw `mediaInput` value from the catalog.
 * @returns `{ image }` containing the valid fields, or `undefined` when the
 *   input is not a record, has no `image` record, or no field is valid.
 */
function normalizeModelCatalogMediaInput(value: unknown): ModelMediaInputConfig | undefined {
  if (!isRecord(value)) {
    return undefined;
  }
  const rawImage = value.image;
  if (!isRecord(rawImage)) {
    return undefined;
  }
  const maxBytes = normalizePositiveInteger(rawImage.maxBytes);
  const maxPixels = normalizePositiveInteger(rawImage.maxPixels);
  const maxSidePx = normalizePositiveInteger(rawImage.maxSidePx);
  const preferredSidePx = normalizePositiveInteger(rawImage.preferredSidePx);
  const tokenMode = normalizeModelCatalogImageTokenMode(rawImage.tokenMode);
  // Spread-in only the fields that survived normalization so absent limits stay absent.
  const image = {
    ...(maxBytes !== undefined ? { maxBytes } : {}),
    ...(maxPixels !== undefined ? { maxPixels } : {}),
    ...(maxSidePx !== undefined ? { maxSidePx } : {}),
    ...(preferredSidePx !== undefined ? { preferredSidePx } : {}),
    ...(tokenMode ? { tokenMode } : {}),
  };
  if (Object.keys(image).length === 0) {
    return undefined;
  }
  return { image };
}
function normalizeModelCatalogModel(value: unknown): ModelCatalogModel | undefined {
if (!isRecord(value)) {
return undefined;
@@ -256,6 +285,7 @@ function normalizeModelCatalogModel(value: unknown): ModelCatalogModel | undefin
const maxTokens = normalizePositiveNumber(value.maxTokens);
const cost = normalizeModelCatalogCost(value.cost);
const compat = normalizeModelCatalogCompat(value.compat);
const mediaInput = normalizeModelCatalogMediaInput(value.mediaInput);
const status = normalizeModelCatalogStatus(value.status);
const statusReason = normalizeOptionalString(value.statusReason) ?? "";
const replaces = normalizeTrimmedStringList(value.replaces);
@@ -274,6 +304,7 @@ function normalizeModelCatalogModel(value: unknown): ModelCatalogModel | undefin
...(maxTokens !== undefined ? { maxTokens } : {}),
...(cost ? { cost } : {}),
...(compat ? { compat } : {}),
...(mediaInput ? { mediaInput } : {}),
...(status ? { status } : {}),
...(statusReason ? { statusReason } : {}),
...(replaces.length > 0 ? { replaces } : {}),
@@ -468,6 +499,7 @@ export function normalizeModelCatalogProviderRows(params: {
const maxTokens = normalizePositiveNumber(model.maxTokens);
const cost = normalizeModelCatalogCost(model.cost);
const compat = normalizeModelCatalogCompat(model.compat);
const mediaInput = normalizeModelCatalogMediaInput(model.mediaInput);
const statusReason = normalizeOptionalString(model.statusReason) ?? "";
const replacedBy = normalizeOptionalString(model.replacedBy) ?? "";
const replaces = normalizeStringList(model.replaces);
@@ -490,6 +522,7 @@ export function normalizeModelCatalogProviderRows(params: {
...(maxTokens !== undefined ? { maxTokens } : {}),
...(cost ? { cost } : {}),
...(compat ? { compat } : {}),
...(mediaInput ? { mediaInput } : {}),
...(statusReason ? { statusReason } : {}),
...(replaces ? { replaces } : {}),
...(replacedBy ? { replacedBy } : {}),

View File

@@ -1,4 +1,4 @@
import type { ModelApi, ModelCompatConfig } from "../config/types.models.js";
import type { ModelApi, ModelCompatConfig, ModelMediaInputConfig } from "../config/types.models.js";
export type ModelCatalogInput = "text" | "image" | "document";
export type ModelCatalogDiscovery = "static" | "refreshable" | "runtime";
@@ -71,6 +71,7 @@ export type ModelCatalogModel = {
maxTokens?: number;
cost?: ModelCatalogCost;
compat?: ModelCompatConfig;
mediaInput?: ModelMediaInputConfig;
status?: ModelCatalogStatus;
statusReason?: string;
replaces?: string[];
@@ -127,6 +128,7 @@ export type NormalizedModelCatalogRow = {
maxTokens?: number;
cost?: ModelCatalogCost;
compat?: ModelCompatConfig;
mediaInput?: ModelMediaInputConfig;
statusReason?: string;
replaces?: string[];
replacedBy?: string;

View File

@@ -1,5 +1,5 @@
import type { Api, Model } from "@earendil-works/pi-ai";
import type { ModelCompatConfig } from "../config/types.models.js";
import type { ModelCompatConfig, ModelMediaInputConfig } from "../config/types.models.js";
/**
* Fully-resolved runtime model shape used after provider/plugin-owned
@@ -10,4 +10,5 @@ export type ProviderRuntimeModel = Omit<Model<Api>, "compat"> & {
contextTokens?: number;
params?: Record<string, unknown>;
requestTimeoutMs?: number;
mediaInput?: ModelMediaInputConfig;
};