fix(xiaomi): support MiMo voicedesign TTS

Adds Xiaomi MiMo voicedesign TTS support by registering the v2.5 voicedesign model and omitting audio.voice for that model's prompt-driven voice design flow.

Also accepts generic TTS aliases modelId, speakerVoice, and speakerVoiceId for Xiaomi provider config and request overrides.

Fixes exec timeout classification so a process that exits after a missed timeout callback is still reported as timed out, using monotonic deadlines to avoid wall-clock skew.

Verification:
- node scripts/run-vitest.mjs extensions/xiaomi/speech-provider.test.ts
- node scripts/run-vitest.mjs src/process/supervisor/supervisor.test.ts
- node scripts/run-vitest.mjs src/agents/bash-tools.exec-foreground-failures.test.ts
- git diff --check
- autoreview --mode local
- live Xiaomi MiMo voicedesign call returned wav RIFF/WAVE output, 169004 bytes
- GitHub CI success on fb3018ef31: CI 26708919072, CodeQL Critical Quality 26708919082, CodeQL 26708919091, OpenGrep PR Diff 26708919089, Workflow Sanity 26708919083, Dependency Guard 26708918574, Real behavior proof 26708921767

Thanks @GimingRao.

Co-authored-by: Raoyu <2425198313@qq.com>
Co-authored-by: giming <53329020+GimingRao@users.noreply.github.com>
This commit is contained in:
giming
2026-05-31 17:34:51 +08:00
committed by GitHub
parent d9d5d97dbc
commit f94512cd7f
6 changed files with 308 additions and 21 deletions

View File

@@ -24,7 +24,7 @@ The same plugin also registers the `xiaomi` speech (TTS) provider.
| API | OpenAI-compatible (`openai-completions`) |
| Base URLs | Pay-as-you-go: `https://api.xiaomimimo.com/v1`; Token Plan presets: `token-plan-{cn,sgp,ams}...` |
| Default models | `xiaomi/mimo-v2-flash`, `xiaomi-token-plan/mimo-v2.5-pro` |
| TTS default | `mimo-v2.5-tts`, voice `mimo_default` |
| TTS default | `mimo-v2.5-tts`, voice `mimo_default`; voicedesign model `mimo-v2.5-tts-voicedesign` |
## Getting started
@@ -126,10 +126,34 @@ an `assistant` message and optional style guidance as a `user` message.
```
Supported built-in voices include `mimo_default`, `default_zh`, `default_en`,
`Mia`, `Chloe`, `Milo`, and `Dean`. `mimo-v2-tts` is supported for older MiMo
TTS accounts; the default uses the current MiMo-V2.5 TTS model. For voice-note
targets such as Feishu and Telegram, OpenClaw transcodes Xiaomi output to 48kHz
Opus with `ffmpeg` before delivery.
`Mia`, `Chloe`, `Milo`, and `Dean`. Preset-voice models select the voice through the
request's `audio.voice` field, so OpenClaw sends the configured `speakerVoice` as
`audio.voice` for `mimo-v2.5-tts` and `mimo-v2-tts`.
Xiaomi's voicedesign model, `mimo-v2.5-tts-voicedesign`, generates the voice
from a natural-language style prompt instead of a preset voice id. Configure
`style` with the desired voice description; OpenClaw sends it as the `user`
message, sends the spoken text as the `assistant` message, and omits
`audio.voice` for this model.
```json5
{
messages: {
tts: {
provider: "xiaomi",
providers: {
xiaomi: {
model: "mimo-v2.5-tts-voicedesign",
format: "wav",
style: "Warm, natural female voice with clear pronunciation.",
},
},
},
},
}
```
For voice-note targets such as Feishu and Telegram, OpenClaw transcodes Xiaomi
output to 48kHz Opus with `ffmpeg` before delivery.
## Config example

View File

@@ -372,6 +372,10 @@ preset and adapt the provider block:
</Tab>
</Tabs>
For Xiaomi `mimo-v2.5-tts-voicedesign`, omit `speakerVoice` and set `style` to
the voice-design prompt. OpenClaw sends that prompt as the TTS `user` message
and does not send `audio.voice` for the voicedesign model.
### Per-agent voice overrides
Use `agents.list[].tts` when one agent should speak with a different provider,
@@ -947,10 +951,10 @@ OpenAI and ElevenLabs output formats are fixed per channel as listed above.
<Accordion title="Xiaomi MiMo">
<ParamField path="apiKey" type="string">Env: `XIAOMI_API_KEY`.</ParamField>
<ParamField path="baseUrl" type="string">Default `https://api.xiaomimimo.com/v1`. Env: `XIAOMI_BASE_URL`.</ParamField>
<ParamField path="model" type="string">Default `mimo-v2.5-tts`. Env: `XIAOMI_TTS_MODEL`. Also supports `mimo-v2-tts`.</ParamField>
<ParamField path="speakerVoice" type="string">Default `mimo_default`. Env: `XIAOMI_TTS_VOICE`. Legacy alias: `voice`.</ParamField>
<ParamField path="model" type="string">Default `mimo-v2.5-tts`. Env: `XIAOMI_TTS_MODEL`. Also supports `mimo-v2-tts` and `mimo-v2.5-tts-voicedesign`.</ParamField>
<ParamField path="speakerVoice" type="string">Default `mimo_default` for preset-voice models. Env: `XIAOMI_TTS_VOICE`. Legacy alias: `voice`. Not sent for `mimo-v2.5-tts-voicedesign`.</ParamField>
<ParamField path="format" type='"mp3" | "wav"'>Default `mp3`. Env: `XIAOMI_TTS_FORMAT`.</ParamField>
<ParamField path="style" type="string">Optional natural-language style instruction sent as the user message; not spoken.</ParamField>
<ParamField path="style" type="string">Optional natural-language style instruction sent as the user message; not spoken. For `mimo-v2.5-tts-voicedesign`, this is the voice-design prompt; OpenClaw supplies a default when omitted.</ParamField>
</Accordion>
</AccordionGroup>

View File

@@ -18,6 +18,7 @@ describe("buildXiaomiSpeechProvider", () => {
expect(provider.aliases).toContain("mimo");
expect(provider.models).toContain("mimo-v2.5-tts");
expect(provider.models).toContain("mimo-v2-tts");
expect(provider.models).toContain("mimo-v2.5-tts-voicedesign");
expect(provider.voices).toContain("mimo_default");
});
});
@@ -81,6 +82,24 @@ describe("buildXiaomiSpeechProvider", () => {
});
expect(config.voice).toBe("default_zh");
});
// Verifies that resolveConfig reads the generic alias keys for the Xiaomi provider.
it("accepts generic model and speaker voice aliases", () => {
const config = provider.resolveConfig!({
rawConfig: {
providers: {
xiaomi: {
// Only the alias keys are supplied; `model` and `voice` are intentionally absent.
modelId: "mimo-v2.5-tts-voicedesign",
speakerVoice: "Chloe",
},
},
},
cfg: {} as never,
timeoutMs: 30000,
});
// The alias values must surface under the resolved `model` / `voice` fields.
expect(config.model).toBe("mimo-v2.5-tts-voicedesign");
expect(config.voice).toBe("Chloe");
});
});
describe("parseDirectiveToken", () => {
@@ -179,6 +198,80 @@ describe("buildXiaomiSpeechProvider", () => {
expect(transcodeAudioBufferToOpusMock).not.toHaveBeenCalled();
});
// Verifies the request shape for the voicedesign model: the configured style becomes the
// `user` message, the spoken text becomes the `assistant` message, and no voice is sent.
it("omits voice and uses configured style for Xiaomi voice design models", async () => {
const audio = Buffer.from("fake-wav-audio").toString("base64");
const mockFetch = vi.mocked(globalThis.fetch);
// Stub a single successful response carrying base64 audio at choices[0].message.audio.data.
mockFetch.mockResolvedValueOnce(
new Response(JSON.stringify({ choices: [{ message: { audio: { data: audio } } }] }), {
status: 200,
headers: { "Content-Type": "application/json" },
}),
);
const result = await provider.synthesize({
text: "Hello from OpenClaw.",
cfg: {} as never,
providerConfig: {
apiKey: "sk-test",
// The model is given through the `modelId` alias rather than `model`.
modelId: "mimo-v2.5-tts-voicedesign",
// A voice is configured on purpose; the assertions below expect it to be left out of the request.
speakerVoice: "Chloe",
format: "wav",
style: "Warm, bright, natural voice.",
},
target: "audio-file",
timeoutMs: 30000,
});
// For an audio-file target the wav payload is returned as-is (decoded from base64).
expect(result.outputFormat).toBe("wav");
expect(result.fileExtension).toBe(".wav");
expect(result.voiceCompatible).toBe(false);
expect(result.audioBuffer.toString()).toBe("fake-wav-audio");
expect(mockFetch).toHaveBeenCalledOnce();
// Inspect the JSON body of the one recorded fetch call.
const [, init] = mockFetch.mock.calls[0] ?? [];
const body = JSON.parse(init!.body as string);
expect(body.model).toBe("mimo-v2.5-tts-voicedesign");
expect(body.messages).toEqual([
{ role: "user", content: "Warm, bright, natural voice." },
{ role: "assistant", content: "Hello from OpenClaw." },
]);
// toEqual with only `format` means the `voice` key must be absent from `audio`.
expect(body.audio).toEqual({ format: "wav" });
});
// Verifies that a voicedesign request without a configured style still sends a `user`
// style message, i.e. the provider fills in a default prompt.
it("uses a default style for Xiaomi voice design models", async () => {
const audio = Buffer.from("fake-mp3-audio").toString("base64");
const mockFetch = vi.mocked(globalThis.fetch);
// Stub a single successful response carrying base64 audio.
mockFetch.mockResolvedValueOnce(
new Response(JSON.stringify({ choices: [{ message: { audio: { data: audio } } }] }), {
status: 200,
headers: { "Content-Type": "application/json" },
}),
);
await provider.synthesize({
text: "Hello from OpenClaw.",
cfg: {} as never,
providerConfig: {
apiKey: "sk-test",
// No `style`, `format`, or voice is configured here.
model: "mimo-v2.5-tts-voicedesign",
},
target: "audio-file",
timeoutMs: 30000,
});
expect(mockFetch).toHaveBeenCalledOnce();
const [, init] = mockFetch.mock.calls[0] ?? [];
const body = JSON.parse(init!.body as string);
// Two messages are expected even though no style was supplied.
expect(body.messages).toHaveLength(2);
expect(body.messages[0]?.role).toBe("user");
// Only a loose check on the default prompt text, so its wording can evolve.
expect(body.messages[0]?.content).toContain("natural");
expect(body.messages[1]).toEqual({
role: "assistant",
content: "Hello from OpenClaw.",
});
// With no format configured the request uses mp3, and still carries no voice.
expect(body.audio).toEqual({ format: "mp3" });
});
it("transcodes Xiaomi output to Opus for voice-note targets", async () => {
const audio = Buffer.from("fake-mp3-audio").toString("base64");
vi.mocked(globalThis.fetch).mockResolvedValueOnce(
@@ -209,6 +302,43 @@ describe("buildXiaomiSpeechProvider", () => {
});
});
// Verifies that voicedesign output goes through the Opus transcode step when the
// target is a voice note, while the upstream request still asks for wav without a voice.
it("transcodes Xiaomi voice design output to Opus for voice-note targets", async () => {
const audio = Buffer.from("fake-wav-audio").toString("base64");
// Stub a single successful response carrying base64 audio.
vi.mocked(globalThis.fetch).mockResolvedValueOnce(
new Response(JSON.stringify({ choices: [{ message: { audio: { data: audio } } }] }), {
status: 200,
headers: { "Content-Type": "application/json" },
}),
);
// The transcoder is mocked, so no real ffmpeg work happens in this test.
transcodeAudioBufferToOpusMock.mockResolvedValueOnce(Buffer.from("fake-opus-audio"));
const result = await provider.synthesize({
text: "Hello from OpenClaw.",
cfg: {} as never,
providerConfig: {
apiKey: "sk-test",
model: "mimo-v2.5-tts-voicedesign",
format: "wav",
},
target: "voice-note",
timeoutMs: 30000,
});
// The caller receives the transcoder's output, flagged as voice-compatible Opus.
expect(result.outputFormat).toBe("opus");
expect(result.fileExtension).toBe(".opus");
expect(result.voiceCompatible).toBe(true);
expect(result.audioBuffer.toString()).toBe("fake-opus-audio");
// The transcoder must receive the decoded wav bytes and the same timeout budget.
expect(transcodeAudioBufferToOpusMock).toHaveBeenCalledWith({
audioBuffer: Buffer.from("fake-wav-audio"),
inputExtension: "wav",
tempPrefix: "tts-xiaomi-",
timeoutMs: 30000,
});
const [, init] = vi.mocked(globalThis.fetch).mock.calls[0] ?? [];
const body = JSON.parse(init!.body as string);
// The outgoing request keeps the configured wav format and omits `voice`.
expect(body.audio).toEqual({ format: "wav" });
});
it("caps oversized TTS request timeouts before scheduling or fetching", async () => {
const audio = Buffer.from("fake-mp3-audio").toString("base64");
const timeoutSpy = vi

View File

@@ -18,8 +18,11 @@ const DEFAULT_XIAOMI_TTS_BASE_URL = "https://api.xiaomimimo.com/v1";
const DEFAULT_XIAOMI_TTS_MODEL = "mimo-v2.5-tts";
const DEFAULT_XIAOMI_TTS_VOICE = "mimo_default";
const DEFAULT_XIAOMI_TTS_FORMAT = "mp3";
const XIAOMI_TTS_VOICE_DESIGN_MODEL = "mimo-v2.5-tts-voicedesign";
const DEFAULT_XIAOMI_TTS_VOICE_DESIGN_STYLE =
"Warm, natural, and friendly voice with clear pronunciation and conversational pacing.";
const XIAOMI_TTS_MODELS = ["mimo-v2.5-tts", "mimo-v2-tts"] as const;
const XIAOMI_TTS_MODELS = ["mimo-v2.5-tts", "mimo-v2-tts", XIAOMI_TTS_VOICE_DESIGN_MODEL] as const;
const XIAOMI_TTS_VOICES = [
"mimo_default",
@@ -83,9 +86,12 @@ function normalizeXiaomiTtsProviderConfig(
),
model:
trimToUndefined(raw?.model) ??
trimToUndefined(raw?.modelId) ??
trimToUndefined(process.env.XIAOMI_TTS_MODEL) ??
DEFAULT_XIAOMI_TTS_MODEL,
voice:
trimToUndefined(raw?.speakerVoice) ??
trimToUndefined(raw?.speakerVoiceId) ??
trimToUndefined(raw?.voice) ??
trimToUndefined(raw?.voiceId) ??
trimToUndefined(process.env.XIAOMI_TTS_VOICE) ??
@@ -107,8 +113,13 @@ function readXiaomiTtsProviderConfig(config: SpeechProviderConfig): XiaomiTtsPro
path: "messages.tts.providers.xiaomi.apiKey",
}) ?? normalized.apiKey,
baseUrl: normalizeXiaomiTtsBaseUrl(trimToUndefined(config.baseUrl) ?? normalized.baseUrl),
model: trimToUndefined(config.model) ?? normalized.model,
voice: trimToUndefined(config.voice) ?? trimToUndefined(config.voiceId) ?? normalized.voice,
model: trimToUndefined(config.model) ?? trimToUndefined(config.modelId) ?? normalized.model,
voice:
trimToUndefined(config.speakerVoice) ??
trimToUndefined(config.speakerVoiceId) ??
trimToUndefined(config.voice) ??
trimToUndefined(config.voiceId) ??
normalized.voice,
format: normalizeXiaomiTtsFormat(config.format) ?? normalized.format,
style: trimToUndefined(config.style) ?? normalized.style,
};
@@ -121,8 +132,12 @@ function readXiaomiTtsOverrides(
return {};
}
return {
model: trimToUndefined(overrides.model),
voice: trimToUndefined(overrides.voice) ?? trimToUndefined(overrides.voiceId),
model: trimToUndefined(overrides.model) ?? trimToUndefined(overrides.modelId),
voice:
trimToUndefined(overrides.speakerVoice) ??
trimToUndefined(overrides.speakerVoiceId) ??
trimToUndefined(overrides.voice) ??
trimToUndefined(overrides.voiceId),
format: normalizeXiaomiTtsFormat(overrides.format),
style: trimToUndefined(overrides.style),
};
@@ -182,6 +197,24 @@ function buildXiaomiTtsMessages(params: { text: string; style?: string }) {
];
}
/**
 * Reports whether `model` is Xiaomi's voicedesign TTS model.
 *
 * The check is an exact, case-sensitive comparison against
 * `XIAOMI_TTS_VOICE_DESIGN_MODEL`; the input is not trimmed or normalized.
 */
function isXiaomiVoiceDesignModel(model: string): boolean {
  const matchesVoiceDesignModel = XIAOMI_TTS_VOICE_DESIGN_MODEL === model;
  return matchesVoiceDesignModel;
}
/**
 * Chooses the style prompt to send for the voicedesign model.
 *
 * @param style - Caller-supplied style prompt, possibly `undefined`.
 * @returns The value produced by `trimToUndefined(style)` when it yields one,
 *   otherwise `DEFAULT_XIAOMI_TTS_VOICE_DESIGN_STYLE`.
 */
function resolveXiaomiVoiceDesignStyle(style: string | undefined): string {
  const configuredStyle = trimToUndefined(style);
  if (configuredStyle == null) {
    // Nothing usable was configured, so fall back to the built-in prompt.
    return DEFAULT_XIAOMI_TTS_VOICE_DESIGN_STYLE;
  }
  return configuredStyle;
}
/**
 * Builds the `audio` section of a Xiaomi TTS request body.
 *
 * @param params.model - Model id the request is for.
 * @param params.voice - Voice id; only included when the model is not the voicedesign model.
 * @param params.format - Requested output format; always included.
 * @returns `{ format }` for the voicedesign model, `{ format, voice }` otherwise.
 */
function buildXiaomiTtsAudio(params: { model: string; voice: string; format: XiaomiTtsFormat }): {
  format: XiaomiTtsFormat;
  voice?: string;
} {
  const { model, voice, format } = params;
  // The voicedesign model gets no `voice` key at all (not even `undefined`).
  return isXiaomiVoiceDesignModel(model) ? { format } : { format, voice };
}
function decodeXiaomiAudioData(body: unknown): Buffer {
const root = asObject(body);
const choices = Array.isArray(root?.choices) ? root.choices : [];
@@ -209,6 +242,9 @@ async function xiaomiTTS(params: {
const requestTimeoutMs = resolveTimerTimeoutMs(timeoutMs, 1);
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), requestTimeoutMs);
const resolvedStyle = isXiaomiVoiceDesignModel(model)
? resolveXiaomiVoiceDesignStyle(style)
: style;
try {
const { response, release } = await fetchWithSsrFGuard({
@@ -221,8 +257,8 @@ async function xiaomiTTS(params: {
},
body: JSON.stringify({
model,
messages: buildXiaomiTtsMessages({ text, style }),
audio: { format, voice },
messages: buildXiaomiTtsMessages({ text, style: resolvedStyle }),
audio: buildXiaomiTtsAudio({ model, voice, format }),
}),
signal: controller.signal,
},

View File

@@ -1,3 +1,4 @@
import { performance } from "node:perf_hooks";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import type { SpawnProcessAdapter } from "./types.js";
@@ -266,6 +267,58 @@ describe("process supervisor", () => {
expect(exit.timedOut).toBe(true);
});
// Verifies that a process which closes on its own after the overall deadline has passed
// is still reported as an overall timeout, even though the timeout timer never fired.
it("classifies a natural close after a missed overall deadline as timed out", async () => {
// Fake timers are installed and never advanced in this test, so the timeout callback cannot run.
vi.useFakeTimers();
// Pin the monotonic clock; with timeoutMs 10 this puts the overall deadline at 1_010.
const nowSpy = vi.spyOn(performance, "now").mockReturnValue(1_000);
const adapter = createStubChildAdapter();
createChildAdapterMock.mockResolvedValue(adapter);
const supervisor = createProcessSupervisor();
const run = await spawnChild(supervisor, {
sessionId: "s-timeout-race",
argv: createSilentIdleArgv(),
timeoutMs: 10,
stdinMode: "pipe-closed",
});
const exitPromise = run.wait();
// Move the clock past the deadline, then let the child exit cleanly with code 0.
nowSpy.mockReturnValue(1_011);
adapter.settle(0);
const exit = await exitPromise;
// No kill was requested: the timeout classification comes from the deadline check alone.
expect(adapter.killMock).not.toHaveBeenCalled();
expect(exit.reason).toBe("overall-timeout");
expect(exit.timedOut).toBe(true);
});
// Verifies that the no-output deadline used at close time is the one refreshed by the
// latest output, and that it is reported when the overall deadline has not elapsed.
it("uses the refreshed no-output deadline when a missed timer races natural close", async () => {
// Fake timers are installed and never advanced in this test, so no timeout callback can run.
vi.useFakeTimers();
// Pin the monotonic clock at spawn time.
const nowSpy = vi.spyOn(performance, "now").mockReturnValue(1_000);
const adapter = createStubChildAdapter();
createChildAdapterMock.mockResolvedValue(adapter);
const supervisor = createProcessSupervisor();
const run = await spawnChild(supervisor, {
sessionId: "s-no-output-race",
argv: createSilentIdleArgv(),
timeoutMs: 100,
noOutputTimeoutMs: 10,
stdinMode: "pipe-closed",
});
const exitPromise = run.wait();
// Output at 1_005 is expected to push the no-output deadline to 1_015
// (presumably via the supervisor's output handler — confirm against the stub adapter).
nowSpy.mockReturnValue(1_005);
adapter.emitStdout("progress");
// 1_016 is past the refreshed no-output deadline but well before the 100 ms overall deadline.
nowSpy.mockReturnValue(1_016);
adapter.settle(0);
const exit = await exitPromise;
// No kill was requested: the classification comes from the deadline check alone.
expect(adapter.killMock).not.toHaveBeenCalled();
expect(exit.reason).toBe("no-output-timeout");
expect(exit.noOutputTimedOut).toBe(true);
expect(exit.timedOut).toBe(true);
});
it("can stream output without retaining it in RunExit payload", async () => {
const adapter = createStubChildAdapter();
createChildAdapterMock.mockResolvedValue(adapter);

View File

@@ -1,4 +1,5 @@
import crypto from "node:crypto";
import { performance } from "node:perf_hooks";
import { normalizeOptionalString } from "@openclaw/normalization-core/string-coerce";
import { getShellConfig } from "../../agents/shell-utils.js";
import { createChildAdapter } from "./adapters/child.js";
@@ -63,6 +64,34 @@ function isTimeoutReason(reason: TerminationReason) {
return reason === "overall-timeout" || reason === "no-output-timeout";
}
/**
 * Determines which timeout, if any, has already elapsed at `nowMs`.
 *
 * A deadline counts as elapsed when it is non-null and `nowMs >= deadline`.
 *
 * @param params.nowMs - Current time, on the same clock as the deadlines.
 * @param params.overallTimeoutDeadlineMs - Overall deadline, or `null` when none is set.
 * @param params.noOutputTimeoutDeadlineMs - No-output deadline, or `null` when none is set.
 * @returns `null` when neither deadline has elapsed; otherwise the reason for the
 *   elapsed deadline with the smaller value. When both have elapsed with equal
 *   values, `"overall-timeout"` wins.
 */
function resolveElapsedTimeoutReason(params: {
  nowMs: number;
  overallTimeoutDeadlineMs: number | null;
  noOutputTimeoutDeadlineMs: number | null;
}): TerminationReason | null {
  const { nowMs, overallTimeoutDeadlineMs, noOutputTimeoutDeadlineMs } = params;

  // Collapse "unset" and "not yet reached" into null so only elapsed deadlines remain.
  const elapsedOverallMs =
    overallTimeoutDeadlineMs !== null && nowMs >= overallTimeoutDeadlineMs
      ? overallTimeoutDeadlineMs
      : null;
  const elapsedNoOutputMs =
    noOutputTimeoutDeadlineMs !== null && nowMs >= noOutputTimeoutDeadlineMs
      ? noOutputTimeoutDeadlineMs
      : null;

  if (elapsedOverallMs === null) {
    return elapsedNoOutputMs === null ? null : "no-output-timeout";
  }
  if (elapsedNoOutputMs === null) {
    return "overall-timeout";
  }
  // Both elapsed: report the one that was due first; a strict `<` keeps ties on the overall timeout.
  return elapsedNoOutputMs < elapsedOverallMs ? "no-output-timeout" : "overall-timeout";
}
export function createProcessSupervisor(): ProcessSupervisor {
const registry = createRunRegistry();
const active = new Map<string, ActiveRun>();
@@ -122,6 +151,8 @@ export function createProcessSupervisor(): ProcessSupervisor {
const overallTimeoutMs = clampTimeout(input.timeoutMs);
const noOutputTimeoutMs = clampTimeout(input.noOutputTimeoutMs);
let overallTimeoutDeadlineMs: number | null = null;
let noOutputTimeoutDeadlineMs: number | null = null;
const setForcedReason = (reason: TerminationReason) => {
if (forcedReason) {
@@ -143,6 +174,7 @@ export function createProcessSupervisor(): ProcessSupervisor {
if (!noOutputTimeoutMs || settled) {
return;
}
noOutputTimeoutDeadlineMs = performance.now() + noOutputTimeoutMs;
if (noOutputTimer) {
clearTimeout(noOutputTimer);
}
@@ -210,11 +242,13 @@ export function createProcessSupervisor(): ProcessSupervisor {
};
if (overallTimeoutMs) {
overallTimeoutDeadlineMs = performance.now() + overallTimeoutMs;
timeoutTimer = setTimeout(() => {
requestCancel("overall-timeout");
}, overallTimeoutMs);
}
if (noOutputTimeoutMs) {
noOutputTimeoutDeadlineMs = performance.now() + noOutputTimeoutMs;
noOutputTimer = setTimeout(() => {
requestCancel("no-output-timeout");
}, noOutputTimeoutMs);
@@ -237,16 +271,22 @@ export function createProcessSupervisor(): ProcessSupervisor {
const waitPromise = (async (): Promise<RunExit> => {
const result = await adapter.wait();
const deadlineReason = resolveElapsedTimeoutReason({
nowMs: performance.now(),
overallTimeoutDeadlineMs,
noOutputTimeoutDeadlineMs,
});
const terminalReason = forcedReason ?? deadlineReason;
if (settled) {
return {
reason: forcedReason ?? "exit",
reason: terminalReason ?? "exit",
exitCode: result.code,
exitSignal: result.signal,
durationMs: Date.now() - startedAtMs,
stdout,
stderr,
timedOut: isTimeoutReason(forcedReason ?? "exit"),
noOutputTimedOut: forcedReason === "no-output-timeout",
timedOut: isTimeoutReason(terminalReason ?? "exit"),
noOutputTimedOut: terminalReason === "no-output-timeout",
};
}
settled = true;
@@ -255,7 +295,7 @@ export function createProcessSupervisor(): ProcessSupervisor {
active.delete(runId);
const reason: TerminationReason =
forcedReason ?? (result.signal != null ? ("signal" as const) : ("exit" as const));
terminalReason ?? (result.signal != null ? ("signal" as const) : ("exit" as const));
const exit: RunExit = {
reason,
exitCode: result.code,
@@ -263,8 +303,8 @@ export function createProcessSupervisor(): ProcessSupervisor {
durationMs: Date.now() - startedAtMs,
stdout,
stderr,
timedOut: isTimeoutReason(forcedReason ?? reason),
noOutputTimedOut: forcedReason === "no-output-timeout",
timedOut: isTimeoutReason(terminalReason ?? reason),
noOutputTimedOut: terminalReason === "no-output-timeout",
};
registry.finalize(runId, {
reason: exit.reason,