diff --git a/docs/providers/xiaomi.md b/docs/providers/xiaomi.md
index 26c41e354dfe..f1d590ee530a 100644
--- a/docs/providers/xiaomi.md
+++ b/docs/providers/xiaomi.md
@@ -24,7 +24,7 @@ The same plugin also registers the `xiaomi` speech (TTS) provider.
| API | OpenAI-compatible (`openai-completions`) |
| Base URLs | Pay-as-you-go: `https://api.xiaomimimo.com/v1`; Token Plan presets: `token-plan-{cn,sgp,ams}...` |
| Default models | `xiaomi/mimo-v2-flash`, `xiaomi-token-plan/mimo-v2.5-pro` |
-| TTS default | `mimo-v2.5-tts`, voice `mimo_default` |
+| TTS default | `mimo-v2.5-tts`, voice `mimo_default`; voicedesign model `mimo-v2.5-tts-voicedesign` |
## Getting started
@@ -126,10 +126,34 @@ an `assistant` message and optional style guidance as a `user` message.
```
Supported built-in voices include `mimo_default`, `default_zh`, `default_en`,
-`Mia`, `Chloe`, `Milo`, and `Dean`. `mimo-v2-tts` is supported for older MiMo
-TTS accounts; the default uses the current MiMo-V2.5 TTS model. For voice-note
-targets such as Feishu and Telegram, OpenClaw transcodes Xiaomi output to 48kHz
-Opus with `ffmpeg` before delivery.
+`Mia`, `Chloe`, `Milo`, and `Dean`. Preset-voice models select the voice through
+`audio.voice`, so OpenClaw sends the configured `speakerVoice` as `audio.voice`
+for `mimo-v2.5-tts` and `mimo-v2-tts`.
+
+Xiaomi's voicedesign model, `mimo-v2.5-tts-voicedesign`, generates the voice
+from a natural-language style prompt instead of a preset voice id. Configure
+`style` with the desired voice description; OpenClaw sends it as the `user`
+message, sends the spoken text as the `assistant` message, and omits
+`audio.voice` for this model. If `style` is omitted, OpenClaw supplies a
+default voice-design prompt.
+
+```json5
+{
+ messages: {
+ tts: {
+ provider: "xiaomi",
+ providers: {
+ xiaomi: {
+ model: "mimo-v2.5-tts-voicedesign",
+ format: "wav",
+ style: "Warm, natural female voice with clear pronunciation.",
+ },
+ },
+ },
+ },
+}
+```
+
+For voice-note targets such as Feishu and Telegram, OpenClaw transcodes Xiaomi
+output to 48kHz Opus with `ffmpeg` before delivery.
## Config example
diff --git a/docs/tools/tts.md b/docs/tools/tts.md
index f7dad3cd8a57..50bd8ce9e4a6 100644
--- a/docs/tools/tts.md
+++ b/docs/tools/tts.md
@@ -372,6 +372,10 @@ preset and adapt the provider block:
+For Xiaomi `mimo-v2.5-tts-voicedesign`, omit `speakerVoice` and set `style` to
+the voice-design prompt. OpenClaw sends that prompt as the TTS `user` message
+and does not send `audio.voice` for the voicedesign model.
+
### Per-agent voice overrides
Use `agents.list[].tts` when one agent should speak with a different provider,
@@ -947,10 +951,10 @@ OpenAI and ElevenLabs output formats are fixed per channel as listed above.
Env: `XIAOMI_API_KEY`.
Default `https://api.xiaomimimo.com/v1`. Env: `XIAOMI_BASE_URL`.
- Default `mimo-v2.5-tts`. Env: `XIAOMI_TTS_MODEL`. Also supports `mimo-v2-tts`.
- Default `mimo_default`. Env: `XIAOMI_TTS_VOICE`. Legacy alias: `voice`.
+ Default `mimo-v2.5-tts`. Env: `XIAOMI_TTS_MODEL`. Also supports `mimo-v2-tts` and `mimo-v2.5-tts-voicedesign`.
+ Default `mimo_default` for preset-voice models. Env: `XIAOMI_TTS_VOICE`. Legacy alias: `voice`. Not sent for `mimo-v2.5-tts-voicedesign`.
Default `mp3`. Env: `XIAOMI_TTS_FORMAT`.
- Optional natural-language style instruction sent as the user message; not spoken.
+ Optional natural-language style instruction sent as the user message; not spoken. For `mimo-v2.5-tts-voicedesign`, this is the voice-design prompt; OpenClaw supplies a default when omitted.
diff --git a/extensions/xiaomi/speech-provider.test.ts b/extensions/xiaomi/speech-provider.test.ts
index cd3bf55cd6b0..6f7a45d43c3d 100644
--- a/extensions/xiaomi/speech-provider.test.ts
+++ b/extensions/xiaomi/speech-provider.test.ts
@@ -18,6 +18,7 @@ describe("buildXiaomiSpeechProvider", () => {
expect(provider.aliases).toContain("mimo");
expect(provider.models).toContain("mimo-v2.5-tts");
expect(provider.models).toContain("mimo-v2-tts");
+ expect(provider.models).toContain("mimo-v2.5-tts-voicedesign");
expect(provider.voices).toContain("mimo_default");
});
});
@@ -81,6 +82,24 @@ describe("buildXiaomiSpeechProvider", () => {
});
expect(config.voice).toBe("default_zh");
});
+
+ it("accepts generic model and speaker voice aliases", () => { // resolveConfig must honor the generic alias keys
+ const config = provider.resolveConfig!({
+ rawConfig: {
+ providers: {
+ xiaomi: {
+ modelId: "mimo-v2.5-tts-voicedesign", // alias for `model`
+ speakerVoice: "Chloe", // alias for the legacy `voice` key
+ },
+ },
+ },
+ cfg: {} as never,
+ timeoutMs: 30000,
+ });
+
+ expect(config.model).toBe("mimo-v2.5-tts-voicedesign");
+ expect(config.voice).toBe("Chloe"); // only config resolution is checked here, not the request body
+ });
});
describe("parseDirectiveToken", () => {
@@ -179,6 +198,80 @@ describe("buildXiaomiSpeechProvider", () => {
expect(transcodeAudioBufferToOpusMock).not.toHaveBeenCalled();
});
+ it("omits voice and uses configured style for Xiaomi voice design models", async () => { // voicedesign: style is the user message, audio.voice is dropped
+ const audio = Buffer.from("fake-wav-audio").toString("base64");
+ const mockFetch = vi.mocked(globalThis.fetch);
+ mockFetch.mockResolvedValueOnce( // queue the single API response before synthesize runs
+ new Response(JSON.stringify({ choices: [{ message: { audio: { data: audio } } }] }), {
+ status: 200,
+ headers: { "Content-Type": "application/json" },
+ }),
+ );
+
+ const result = await provider.synthesize({
+ text: "Hello from OpenClaw.",
+ cfg: {} as never,
+ providerConfig: {
+ apiKey: "sk-test",
+ modelId: "mimo-v2.5-tts-voicedesign",
+ speakerVoice: "Chloe", // set on purpose: must not reach the request body
+ format: "wav",
+ style: "Warm, bright, natural voice.",
+ },
+ target: "audio-file",
+ timeoutMs: 30000,
+ });
+
+ expect(result.outputFormat).toBe("wav");
+ expect(result.fileExtension).toBe(".wav");
+ expect(result.voiceCompatible).toBe(false);
+ expect(result.audioBuffer.toString()).toBe("fake-wav-audio"); // base64 payload decoded back to the original bytes
+
+ expect(mockFetch).toHaveBeenCalledOnce();
+ const [, init] = mockFetch.mock.calls[0] ?? []; // second fetch argument is the request init
+ const body = JSON.parse(init!.body as string);
+ expect(body.model).toBe("mimo-v2.5-tts-voicedesign");
+ expect(body.messages).toEqual([
+ { role: "user", content: "Warm, bright, natural voice." }, // configured style prompt
+ { role: "assistant", content: "Hello from OpenClaw." }, // text to be spoken
+ ]);
+ expect(body.audio).toEqual({ format: "wav" }); // format only; no voice key
+ });
+
+ it("uses a default style for Xiaomi voice design models", async () => { // no style configured: a built-in prompt must be sent
+ const audio = Buffer.from("fake-mp3-audio").toString("base64");
+ const mockFetch = vi.mocked(globalThis.fetch);
+ mockFetch.mockResolvedValueOnce( // queue the single API response before synthesize runs
+ new Response(JSON.stringify({ choices: [{ message: { audio: { data: audio } } }] }), {
+ status: 200,
+ headers: { "Content-Type": "application/json" },
+ }),
+ );
+
+ await provider.synthesize({
+ text: "Hello from OpenClaw.",
+ cfg: {} as never,
+ providerConfig: {
+ apiKey: "sk-test",
+ model: "mimo-v2.5-tts-voicedesign", // neither style nor format is provided
+ },
+ target: "audio-file",
+ timeoutMs: 30000,
+ });
+
+ expect(mockFetch).toHaveBeenCalledOnce();
+ const [, init] = mockFetch.mock.calls[0] ?? []; // second fetch argument is the request init
+ const body = JSON.parse(init!.body as string);
+ expect(body.messages).toHaveLength(2);
+ expect(body.messages[0]?.role).toBe("user");
+ expect(body.messages[0]?.content).toContain("natural"); // loose match on the default prompt instead of pinning the full string
+ expect(body.messages[1]).toEqual({
+ role: "assistant",
+ content: "Hello from OpenClaw.",
+ });
+ expect(body.audio).toEqual({ format: "mp3" }); // expects the mp3 default format; voice still omitted
+ });
+
it("transcodes Xiaomi output to Opus for voice-note targets", async () => {
const audio = Buffer.from("fake-mp3-audio").toString("base64");
vi.mocked(globalThis.fetch).mockResolvedValueOnce(
@@ -209,6 +302,43 @@ describe("buildXiaomiSpeechProvider", () => {
});
});
+ it("transcodes Xiaomi voice design output to Opus for voice-note targets", async () => { // voice-note target: API audio is converted to Opus
+ const audio = Buffer.from("fake-wav-audio").toString("base64");
+ vi.mocked(globalThis.fetch).mockResolvedValueOnce(
+ new Response(JSON.stringify({ choices: [{ message: { audio: { data: audio } } }] }), {
+ status: 200,
+ headers: { "Content-Type": "application/json" },
+ }),
+ );
+ transcodeAudioBufferToOpusMock.mockResolvedValueOnce(Buffer.from("fake-opus-audio")); // stub the transcoder output
+
+ const result = await provider.synthesize({
+ text: "Hello from OpenClaw.",
+ cfg: {} as never,
+ providerConfig: {
+ apiKey: "sk-test",
+ model: "mimo-v2.5-tts-voicedesign",
+ format: "wav",
+ },
+ target: "voice-note",
+ timeoutMs: 30000,
+ });
+
+ expect(result.outputFormat).toBe("opus");
+ expect(result.fileExtension).toBe(".opus");
+ expect(result.voiceCompatible).toBe(true);
+ expect(result.audioBuffer.toString()).toBe("fake-opus-audio"); // result carries the transcoder output, not the raw API audio
+ expect(transcodeAudioBufferToOpusMock).toHaveBeenCalledWith({ // transcoder gets the decoded WAV bytes and source extension
+ audioBuffer: Buffer.from("fake-wav-audio"),
+ inputExtension: "wav",
+ tempPrefix: "tts-xiaomi-",
+ timeoutMs: 30000,
+ });
+ const [, init] = vi.mocked(globalThis.fetch).mock.calls[0] ?? []; // second fetch argument is the request init
+ const body = JSON.parse(init!.body as string);
+ expect(body.audio).toEqual({ format: "wav" }); // upstream request still asks for wav and sends no voice
+ });
+
it("caps oversized TTS request timeouts before scheduling or fetching", async () => {
const audio = Buffer.from("fake-mp3-audio").toString("base64");
const timeoutSpy = vi
diff --git a/extensions/xiaomi/speech-provider.ts b/extensions/xiaomi/speech-provider.ts
index a646be2b914e..f9bf7833bb3b 100644
--- a/extensions/xiaomi/speech-provider.ts
+++ b/extensions/xiaomi/speech-provider.ts
@@ -18,8 +18,11 @@ const DEFAULT_XIAOMI_TTS_BASE_URL = "https://api.xiaomimimo.com/v1";
const DEFAULT_XIAOMI_TTS_MODEL = "mimo-v2.5-tts";
const DEFAULT_XIAOMI_TTS_VOICE = "mimo_default";
const DEFAULT_XIAOMI_TTS_FORMAT = "mp3";
+const XIAOMI_TTS_VOICE_DESIGN_MODEL = "mimo-v2.5-tts-voicedesign"; // model id that is driven by a style prompt; requests for it omit audio.voice
+const DEFAULT_XIAOMI_TTS_VOICE_DESIGN_STYLE = // fallback voice-design prompt used when no style is configured
+ "Warm, natural, and friendly voice with clear pronunciation and conversational pacing.";
-const XIAOMI_TTS_MODELS = ["mimo-v2.5-tts", "mimo-v2-tts"] as const;
+const XIAOMI_TTS_MODELS = ["mimo-v2.5-tts", "mimo-v2-tts", XIAOMI_TTS_VOICE_DESIGN_MODEL] as const;
const XIAOMI_TTS_VOICES = [
"mimo_default",
@@ -83,9 +86,12 @@ function normalizeXiaomiTtsProviderConfig(
),
model:
trimToUndefined(raw?.model) ??
+ trimToUndefined(raw?.modelId) ??
trimToUndefined(process.env.XIAOMI_TTS_MODEL) ??
DEFAULT_XIAOMI_TTS_MODEL,
voice:
+ trimToUndefined(raw?.speakerVoice) ??
+ trimToUndefined(raw?.speakerVoiceId) ??
trimToUndefined(raw?.voice) ??
trimToUndefined(raw?.voiceId) ??
trimToUndefined(process.env.XIAOMI_TTS_VOICE) ??
@@ -107,8 +113,13 @@ function readXiaomiTtsProviderConfig(config: SpeechProviderConfig): XiaomiTtsPro
path: "messages.tts.providers.xiaomi.apiKey",
}) ?? normalized.apiKey,
baseUrl: normalizeXiaomiTtsBaseUrl(trimToUndefined(config.baseUrl) ?? normalized.baseUrl),
- model: trimToUndefined(config.model) ?? normalized.model,
- voice: trimToUndefined(config.voice) ?? trimToUndefined(config.voiceId) ?? normalized.voice,
+ model: trimToUndefined(config.model) ?? trimToUndefined(config.modelId) ?? normalized.model,
+ voice:
+ trimToUndefined(config.speakerVoice) ??
+ trimToUndefined(config.speakerVoiceId) ??
+ trimToUndefined(config.voice) ??
+ trimToUndefined(config.voiceId) ??
+ normalized.voice,
format: normalizeXiaomiTtsFormat(config.format) ?? normalized.format,
style: trimToUndefined(config.style) ?? normalized.style,
};
@@ -121,8 +132,12 @@ function readXiaomiTtsOverrides(
return {};
}
return {
- model: trimToUndefined(overrides.model),
- voice: trimToUndefined(overrides.voice) ?? trimToUndefined(overrides.voiceId),
+ model: trimToUndefined(overrides.model) ?? trimToUndefined(overrides.modelId),
+ voice:
+ trimToUndefined(overrides.speakerVoice) ??
+ trimToUndefined(overrides.speakerVoiceId) ??
+ trimToUndefined(overrides.voice) ??
+ trimToUndefined(overrides.voiceId),
format: normalizeXiaomiTtsFormat(overrides.format),
style: trimToUndefined(overrides.style),
};
@@ -182,6 +197,24 @@ function buildXiaomiTtsMessages(params: { text: string; style?: string }) {
];
}
+/** Reports whether `model` is exactly the Xiaomi voicedesign TTS model id. */
+function isXiaomiVoiceDesignModel(model: string): boolean {
+  return XIAOMI_TTS_VOICE_DESIGN_MODEL === model;
+}
+
+/**
+ * Picks the voice-design prompt for a request.
+ *
+ * @param style - Configured style, possibly undefined.
+ * @returns `style` as normalized by `trimToUndefined`, or the built-in default
+ *   prompt when that yields nothing.
+ */
+function resolveXiaomiVoiceDesignStyle(style: string | undefined): string {
+  const configuredStyle = trimToUndefined(style);
+  return configuredStyle ?? DEFAULT_XIAOMI_TTS_VOICE_DESIGN_STYLE;
+}
+
+/**
+ * Builds the `audio` object of the TTS request body.
+ *
+ * The voicedesign model receives only `format`; every other model also
+ * receives `voice`.
+ *
+ * @param params.model - Resolved TTS model id.
+ * @param params.voice - Resolved preset voice id.
+ * @param params.format - Requested audio format.
+ * @returns The `audio` payload, with `format` first and `voice` only when sent.
+ */
+function buildXiaomiTtsAudio(params: { model: string; voice: string; format: XiaomiTtsFormat }): {
+  format: XiaomiTtsFormat;
+  voice?: string;
+} {
+  const audio: { format: XiaomiTtsFormat; voice?: string } = { format: params.format };
+  if (!isXiaomiVoiceDesignModel(params.model)) {
+    audio.voice = params.voice;
+  }
+  return audio;
+}
+
function decodeXiaomiAudioData(body: unknown): Buffer {
const root = asObject(body);
const choices = Array.isArray(root?.choices) ? root.choices : [];
@@ -209,6 +242,9 @@ async function xiaomiTTS(params: {
const requestTimeoutMs = resolveTimerTimeoutMs(timeoutMs, 1);
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), requestTimeoutMs);
+ const resolvedStyle = isXiaomiVoiceDesignModel(model)
+ ? resolveXiaomiVoiceDesignStyle(style)
+ : style;
try {
const { response, release } = await fetchWithSsrFGuard({
@@ -221,8 +257,8 @@ async function xiaomiTTS(params: {
},
body: JSON.stringify({
model,
- messages: buildXiaomiTtsMessages({ text, style }),
- audio: { format, voice },
+ messages: buildXiaomiTtsMessages({ text, style: resolvedStyle }),
+ audio: buildXiaomiTtsAudio({ model, voice, format }),
}),
signal: controller.signal,
},
diff --git a/src/process/supervisor/supervisor.test.ts b/src/process/supervisor/supervisor.test.ts
index 08bd10ab0f44..af9683e21f81 100644
--- a/src/process/supervisor/supervisor.test.ts
+++ b/src/process/supervisor/supervisor.test.ts
@@ -1,3 +1,4 @@
+import { performance } from "node:perf_hooks";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import type { SpawnProcessAdapter } from "./types.js";
@@ -266,6 +267,58 @@ describe("process supervisor", () => {
expect(exit.timedOut).toBe(true);
});
+ it("classifies a natural close after a missed overall deadline as timed out", async () => { // child closes on its own after the deadline, before any timer callback ran
+ vi.useFakeTimers(); // timers are never advanced below, so the timeout callback cannot fire
+ const nowSpy = vi.spyOn(performance, "now").mockReturnValue(1_000); // pin the clock at spawn; NOTE(review): spy is not restored in this test — confirm afterEach restores mocks
+ const adapter = createStubChildAdapter();
+ createChildAdapterMock.mockResolvedValue(adapter);
+
+ const supervisor = createProcessSupervisor();
+ const run = await spawnChild(supervisor, {
+ sessionId: "s-timeout-race",
+ argv: createSilentIdleArgv(),
+ timeoutMs: 10, // overall deadline is expected at 1_000 + 10 = 1_010
+ stdinMode: "pipe-closed",
+ });
+
+ const exitPromise = run.wait();
+ nowSpy.mockReturnValue(1_011); // 1ms past the overall deadline
+ adapter.settle(0); // presumably resolves the stub's wait() with exit code 0 — verify against createStubChildAdapter
+
+ const exit = await exitPromise;
+ expect(adapter.killMock).not.toHaveBeenCalled(); // the supervisor never had to kill the child
+ expect(exit.reason).toBe("overall-timeout");
+ expect(exit.timedOut).toBe(true);
+ });
+
+ it("uses the refreshed no-output deadline when a missed timer races natural close", async () => { // output resets the no-output deadline; the close is judged against the new one
+ vi.useFakeTimers(); // timers are never advanced below, so no timeout callback can fire
+ const nowSpy = vi.spyOn(performance, "now").mockReturnValue(1_000); // pin the clock at spawn; NOTE(review): spy is not restored in this test — confirm afterEach restores mocks
+ const adapter = createStubChildAdapter();
+ createChildAdapterMock.mockResolvedValue(adapter);
+
+ const supervisor = createProcessSupervisor();
+ const run = await spawnChild(supervisor, {
+ sessionId: "s-no-output-race",
+ argv: createSilentIdleArgv(),
+ timeoutMs: 100, // overall deadline is expected at 1_100, never reached in this test
+ noOutputTimeoutMs: 10, // initial no-output deadline is expected at 1_010
+ stdinMode: "pipe-closed",
+ });
+
+ const exitPromise = run.wait();
+ nowSpy.mockReturnValue(1_005);
+ adapter.emitStdout("progress"); // expected to move the no-output deadline to 1_005 + 10 = 1_015
+ nowSpy.mockReturnValue(1_016); // past the refreshed no-output deadline, before the overall one
+ adapter.settle(0); // presumably resolves the stub's wait() with exit code 0 — verify against createStubChildAdapter
+
+ const exit = await exitPromise;
+ expect(adapter.killMock).not.toHaveBeenCalled(); // the supervisor never had to kill the child
+ expect(exit.reason).toBe("no-output-timeout");
+ expect(exit.noOutputTimedOut).toBe(true);
+ expect(exit.timedOut).toBe(true);
+ });
+
it("can stream output without retaining it in RunExit payload", async () => {
const adapter = createStubChildAdapter();
createChildAdapterMock.mockResolvedValue(adapter);
diff --git a/src/process/supervisor/supervisor.ts b/src/process/supervisor/supervisor.ts
index eb86dfc8843e..10eff6124156 100644
--- a/src/process/supervisor/supervisor.ts
+++ b/src/process/supervisor/supervisor.ts
@@ -1,4 +1,5 @@
import crypto from "node:crypto";
+import { performance } from "node:perf_hooks";
import { normalizeOptionalString } from "@openclaw/normalization-core/string-coerce";
import { getShellConfig } from "../../agents/shell-utils.js";
import { createChildAdapter } from "./adapters/child.js";
@@ -63,6 +64,34 @@ function isTimeoutReason(reason: TerminationReason) {
return reason === "overall-timeout" || reason === "no-output-timeout";
}
+/**
+ * Determines which timeout deadline, if any, has already elapsed at `nowMs`.
+ *
+ * A deadline counts as elapsed when `nowMs` is greater than or equal to it.
+ * When both deadlines have elapsed, the earlier one decides the reason; on an
+ * exact tie the overall timeout wins.
+ *
+ * @param params.nowMs - Current reading of the clock the deadlines were taken from.
+ * @param params.overallTimeoutDeadlineMs - Overall deadline, or `null` when none is set.
+ * @param params.noOutputTimeoutDeadlineMs - No-output deadline, or `null` when none is set.
+ * @returns `"overall-timeout"` or `"no-output-timeout"` for the earliest elapsed
+ *   deadline, or `null` when no deadline has elapsed.
+ */
+function resolveElapsedTimeoutReason(params: {
+  nowMs: number;
+  overallTimeoutDeadlineMs: number | null;
+  noOutputTimeoutDeadlineMs: number | null;
+}): TerminationReason | null {
+  const { nowMs, overallTimeoutDeadlineMs, noOutputTimeoutDeadlineMs } = params;
+  // Keep each deadline only if it has elapsed; `null` means "not set or not yet due".
+  const elapsedOverallMs =
+    overallTimeoutDeadlineMs !== null && nowMs >= overallTimeoutDeadlineMs
+      ? overallTimeoutDeadlineMs
+      : null;
+  const elapsedNoOutputMs =
+    noOutputTimeoutDeadlineMs !== null && nowMs >= noOutputTimeoutDeadlineMs
+      ? noOutputTimeoutDeadlineMs
+      : null;
+  if (elapsedOverallMs === null) {
+    return elapsedNoOutputMs === null ? null : "no-output-timeout";
+  }
+  if (elapsedNoOutputMs === null) {
+    return "overall-timeout";
+  }
+  // Both elapsed: the earlier deadline is the one that would have fired first.
+  // A strict comparison keeps the overall timeout as the winner on a tie.
+  return elapsedNoOutputMs < elapsedOverallMs ? "no-output-timeout" : "overall-timeout";
+}
+
export function createProcessSupervisor(): ProcessSupervisor {
const registry = createRunRegistry();
const active = new Map();
@@ -122,6 +151,8 @@ export function createProcessSupervisor(): ProcessSupervisor {
const overallTimeoutMs = clampTimeout(input.timeoutMs);
const noOutputTimeoutMs = clampTimeout(input.noOutputTimeoutMs);
+ let overallTimeoutDeadlineMs: number | null = null;
+ let noOutputTimeoutDeadlineMs: number | null = null;
const setForcedReason = (reason: TerminationReason) => {
if (forcedReason) {
@@ -143,6 +174,7 @@ export function createProcessSupervisor(): ProcessSupervisor {
if (!noOutputTimeoutMs || settled) {
return;
}
+ noOutputTimeoutDeadlineMs = performance.now() + noOutputTimeoutMs;
if (noOutputTimer) {
clearTimeout(noOutputTimer);
}
@@ -210,11 +242,13 @@ export function createProcessSupervisor(): ProcessSupervisor {
};
if (overallTimeoutMs) {
+ overallTimeoutDeadlineMs = performance.now() + overallTimeoutMs;
timeoutTimer = setTimeout(() => {
requestCancel("overall-timeout");
}, overallTimeoutMs);
}
if (noOutputTimeoutMs) {
+ noOutputTimeoutDeadlineMs = performance.now() + noOutputTimeoutMs;
noOutputTimer = setTimeout(() => {
requestCancel("no-output-timeout");
}, noOutputTimeoutMs);
@@ -237,16 +271,22 @@ export function createProcessSupervisor(): ProcessSupervisor {
const waitPromise = (async (): Promise => {
const result = await adapter.wait();
+ const deadlineReason = resolveElapsedTimeoutReason({
+ nowMs: performance.now(),
+ overallTimeoutDeadlineMs,
+ noOutputTimeoutDeadlineMs,
+ });
+ const terminalReason = forcedReason ?? deadlineReason;
if (settled) {
return {
- reason: forcedReason ?? "exit",
+ reason: terminalReason ?? "exit",
exitCode: result.code,
exitSignal: result.signal,
durationMs: Date.now() - startedAtMs,
stdout,
stderr,
- timedOut: isTimeoutReason(forcedReason ?? "exit"),
- noOutputTimedOut: forcedReason === "no-output-timeout",
+ timedOut: isTimeoutReason(terminalReason ?? "exit"),
+ noOutputTimedOut: terminalReason === "no-output-timeout",
};
}
settled = true;
@@ -255,7 +295,7 @@ export function createProcessSupervisor(): ProcessSupervisor {
active.delete(runId);
const reason: TerminationReason =
- forcedReason ?? (result.signal != null ? ("signal" as const) : ("exit" as const));
+ terminalReason ?? (result.signal != null ? ("signal" as const) : ("exit" as const));
const exit: RunExit = {
reason,
exitCode: result.code,
@@ -263,8 +303,8 @@ export function createProcessSupervisor(): ProcessSupervisor {
durationMs: Date.now() - startedAtMs,
stdout,
stderr,
- timedOut: isTimeoutReason(forcedReason ?? reason),
- noOutputTimedOut: forcedReason === "no-output-timeout",
+ timedOut: isTimeoutReason(terminalReason ?? reason),
+ noOutputTimedOut: terminalReason === "no-output-timeout",
};
registry.finalize(runId, {
reason: exit.reason,