Files
openclaw/extensions/vllm/stream.ts
2026-06-04 21:02:07 -04:00

144 lines
4.5 KiB
TypeScript

// Vllm plugin module implements stream behavior.
import type { StreamFn } from "openclaw/plugin-sdk/agent-core";
import type { ProviderWrapStreamFnContext } from "openclaw/plugin-sdk/plugin-entry";
import { normalizeProviderId } from "openclaw/plugin-sdk/provider-model-shared";
import {
createPayloadPatchStreamWrapper,
isOpenAICompatibleThinkingEnabled,
} from "openclaw/plugin-sdk/provider-stream-shared";
import {
resolveVllmQwenThinkingFormatFromCompat,
type VllmQwenThinkingFormat,
} from "./thinking-policy.js";
type VllmThinkingLevel = ProviderWrapStreamFnContext["thinkingLevel"];
function isVllmProviderId(providerId: string): boolean {
return normalizeProviderId(providerId) === "vllm";
}
function resolveVllmQwenThinkingFormat(
ctx: Pick<ProviderWrapStreamFnContext, "model">,
): VllmQwenThinkingFormat | undefined {
return resolveVllmQwenThinkingFormatFromCompat(ctx.model?.compat);
}
function setQwenChatTemplateThinking(payload: Record<string, unknown>, enabled: boolean): void {
const existing = payload.chat_template_kwargs;
if (existing && typeof existing === "object" && !Array.isArray(existing)) {
const next: Record<string, unknown> = {
...(existing as Record<string, unknown>),
enable_thinking: enabled,
};
if (!Object.hasOwn(next, "preserve_thinking")) {
next.preserve_thinking = true;
}
payload.chat_template_kwargs = next;
return;
}
payload.chat_template_kwargs = {
enable_thinking: enabled,
preserve_thinking: true,
};
}
function isVllmNemotronModel(model: { api?: unknown; provider?: unknown; id?: unknown }): boolean {
return (
model.api === "openai-completions" &&
typeof model.provider === "string" &&
normalizeProviderId(model.provider) === "vllm" &&
typeof model.id === "string" &&
/\bnemotron-3(?:[-_](?:nano|super|ultra))?\b/i.test(model.id)
);
}
function setNemotronThinkingOffChatTemplateKwargs(payload: Record<string, unknown>): void {
const defaults = {
enable_thinking: false,
force_nonempty_content: true,
};
const existing = payload.chat_template_kwargs;
payload.chat_template_kwargs =
existing && typeof existing === "object" && !Array.isArray(existing)
? {
...defaults,
...(existing as Record<string, unknown>),
}
: defaults;
}
export function createVllmQwenThinkingWrapper(params: {
baseStreamFn: StreamFn | undefined;
format: VllmQwenThinkingFormat;
thinkingLevel: VllmThinkingLevel;
}): StreamFn {
return createPayloadPatchStreamWrapper(
params.baseStreamFn,
({ payload: payloadObj, options }) => {
const enableThinking = isOpenAICompatibleThinkingEnabled({
thinkingLevel: params.thinkingLevel,
options,
});
if (params.format === "chat-template") {
setQwenChatTemplateThinking(payloadObj, enableThinking);
} else {
payloadObj.enable_thinking = enableThinking;
}
delete payloadObj.reasoning_effort;
delete payloadObj.reasoningEffort;
delete payloadObj.reasoning;
},
{
shouldPatch: ({ model }) => model.api === "openai-completions" && (model.reasoning ?? true),
},
);
}
export function createVllmProviderThinkingWrapper(params: {
baseStreamFn: StreamFn | undefined;
qwenFormat?: VllmQwenThinkingFormat;
thinkingLevel: VllmThinkingLevel;
}): StreamFn {
const qwenWrapped = params.qwenFormat
? createVllmQwenThinkingWrapper({
baseStreamFn: params.baseStreamFn,
format: params.qwenFormat,
thinkingLevel: params.thinkingLevel,
})
: params.baseStreamFn;
return createPayloadPatchStreamWrapper(
qwenWrapped,
({ payload: payloadObj }) => {
setNemotronThinkingOffChatTemplateKwargs(payloadObj);
},
{
shouldPatch: ({ model }) =>
model.api === "openai-completions" &&
params.thinkingLevel === "off" &&
isVllmNemotronModel(model),
},
);
}
export function wrapVllmProviderStream(ctx: ProviderWrapStreamFnContext): StreamFn | undefined {
if (!isVllmProviderId(ctx.provider) || (ctx.model && ctx.model.api !== "openai-completions")) {
return undefined;
}
const qwenFormat = resolveVllmQwenThinkingFormat(ctx);
const shouldHandleNemotron =
ctx.thinkingLevel === "off" &&
isVllmNemotronModel({
api: "openai-completions",
provider: ctx.provider,
id: ctx.modelId,
});
if (!qwenFormat && !shouldHandleNemotron) {
return undefined;
}
return createVllmProviderThinkingWrapper({
baseStreamFn: ctx.streamFn,
qwenFormat,
thinkingLevel: ctx.thinkingLevel,
});
}