fix: accept leading fuzzy Discord voice wake names (#86484)

This commit is contained in:
Peter Steinberger
2026-05-25 14:01:15 +01:00
committed by GitHub
parent 5d018034f6
commit 8fe4f34af2
3 changed files with 288 additions and 3 deletions

View File

@@ -14,6 +14,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Discord/OpenAI voice: accept leading fuzzy wake-name transcripts such as "Monty" or "Moti" for a Molty agent while keeping ambient speech gated.
- Discord/OpenAI voice: rotate Realtime sessions at provider max duration without logging the expected session-expiry event as an error.
- Memory/local embeddings: run local GGUF embeddings in an isolated worker sidecar and degrade to configured fallback or keyword search on worker failure so native embedding crashes do not take down the Gateway. (#85348) Thanks @osolmaz.
- Gateway: clear the runtime config snapshot before `SIGUSR1` in-process restarts so config changes survive the next gateway loop. (#86388) Thanks @XuZehan-iCenter.

View File

@@ -2966,6 +2966,133 @@ describe("DiscordVoiceManager", () => {
expectUserMessageIncludes("openclaw wake answer");
});
it("accepts leading fuzzy wake names before realtime agent-proxy consults", async () => {
  // Wake-name gating is on and the only configured agent is named "Molty".
  const manager = createManager(
    {
      groupPolicy: "open",
      voice: {
        enabled: true,
        mode: "agent-proxy",
        realtime: { provider: "openai", consultPolicy: "auto", requireWakeName: true },
      },
    },
    undefined,
    {
      agents: {
        list: [{ id: "agent-1", identity: { name: "Molty" } }],
      },
    },
  );
  await manager.join({ guildId: "g1", channelId: "1001" });

  const entry = getSessionEntry(manager) as {
    realtime?: {
      beginSpeakerTurn: (
        context: { extraSystemPrompt?: string; senderIsOwner: boolean; speakerLabel: string },
        userId: string,
      ) => { close: () => void; sendInputAudio: (audio: Buffer) => void };
    };
  };
  const bridgeParams = lastRealtimeBridgeParams() as
    | {
        onTranscript?: (role: "user" | "assistant", text: string, isFinal: boolean) => void;
      }
    | undefined;

  // Simulates one owner turn: open the turn, push a few bytes of audio, deliver the
  // final user transcript, then wait for the session to react.
  // NOTE(review): 260 ms presumably outlasts the 200 ms forced-consult fallback delay — confirm.
  const speakAsOwner = async (transcript: string): Promise<void> => {
    const turn = entry.realtime?.beginSpeakerTurn(
      { extraSystemPrompt: undefined, senderIsOwner: true, speakerLabel: "Owner" },
      "u-owner",
    );
    turn?.sendInputAudio(Buffer.alloc(8));
    bridgeParams?.onTranscript?.("user", transcript, true);
    await new Promise((resolve) => setTimeout(resolve, 260));
  };

  // One-edit mishearing of "Molty".
  await speakAsOwner("Monty, are you with us?");
  expect(agentCommandArgsAt(0).message).toContain("are you with us?");
  expect(agentCommandArgsAt(0).message).not.toContain("Monty");

  // Two-edit mishearing with a different length.
  await speakAsOwner("Moti, what's going on today?");
  expect(agentCommandArgsAt(1).message).toContain("what's going on today?");
  expect(agentCommandArgsAt(1).message).not.toContain("Moti");

  // Two-word spelling of a wake name, with no punctuation after it.
  await speakAsOwner("Open claw can you still hear me?");
  expect(agentCommandArgsAt(2).message).toContain("can you still hear me?");
  expect(agentCommandArgsAt(2).message).not.toContain("Open claw");
});
it("rejects non-wake fuzzy leading phrases before realtime agent-proxy consults", async () => {
  // Same setup as the accepting case: wake-name gating on, single agent named "Molty".
  const manager = createManager(
    {
      groupPolicy: "open",
      voice: {
        enabled: true,
        mode: "agent-proxy",
        realtime: { provider: "openai", consultPolicy: "auto", requireWakeName: true },
      },
    },
    undefined,
    {
      agents: {
        list: [{ id: "agent-1", identity: { name: "Molty" } }],
      },
    },
  );
  await manager.join({ guildId: "g1", channelId: "1001" });

  const entry = getSessionEntry(manager) as {
    realtime?: {
      beginSpeakerTurn: (
        context: { extraSystemPrompt?: string; senderIsOwner: boolean; speakerLabel: string },
        userId: string,
      ) => { close: () => void; sendInputAudio: (audio: Buffer) => void };
    };
  };
  const bridgeParams = lastRealtimeBridgeParams() as
    | {
        onTranscript?: (role: "user" | "assistant", text: string, isFinal: boolean) => void;
      }
    | undefined;

  // Simulates one owner turn and then waits for any consult to be scheduled.
  // NOTE(review): 260 ms presumably outlasts the 200 ms forced-consult fallback delay — confirm.
  const speakAsOwner = async (transcript: string): Promise<void> => {
    const turn = entry.realtime?.beginSpeakerTurn(
      { extraSystemPrompt: undefined, senderIsOwner: true, speakerLabel: "Owner" },
      "u-owner",
    );
    turn?.sendInputAudio(Buffer.alloc(8));
    bridgeParams?.onTranscript?.("user", transcript, true);
    await new Promise((resolve) => setTimeout(resolve, 260));
  };

  // Near miss: same length as "Molty" but two substitutions away.
  await speakAsOwner("Multi, step through the maintainer queue.");
  // Ambient speech that does not start with anything resembling the wake name.
  await speakAsOwner("This is a multi-step maintainer problem.");
  // Close to a wake phrase, but not followed by punctuation or end of text.
  await speakAsOwner("Open law is not the wake phrase.");

  expect(agentCommandMock).not.toHaveBeenCalled();
});
it("leaves non-OpenAI agent-proxy realtime auto-response enabled when wake names are requested", async () => {
resolveConfiguredRealtimeVoiceProviderMock.mockReturnValueOnce({
provider: { id: "google" },

View File

@@ -64,6 +64,7 @@ const DISCORD_REALTIME_FORCED_CONSULT_FALLBACK_DELAY_MS = 200;
const DISCORD_REALTIME_DUPLICATE_ERROR_SUPPRESS_MS = 60_000;
const DISCORD_REALTIME_CONTROL_SPEECH_DEDUPE_MS = 5_000;
const DISCORD_REALTIME_OUTPUT_PLAYBACK_WATCHDOG_MARGIN_MS = 1_500;
// Upper bound on how many leading transcript words are combined into wake-name
// candidates when scanning the start of an utterance (e.g. "open claw" is two words).
const DISCORD_REALTIME_WAKE_NAME_FUZZY_PREFIX_WORDS = 3;
const REALTIME_PCM16_BYTES_PER_SAMPLE = 2;
const DISCORD_RAW_PCM_FRAME_BYTES = 3_840;
const DISCORD_REALTIME_OUTPUT_PREROLL_FRAMES = 25;
@@ -353,6 +354,19 @@ function normalizeWakeName(value: string): string | undefined {
return normalized || undefined;
}
/**
 * Canonicalizes wake-name text for comparison.
 *
 * The value is lowercased, every run of characters outside `a-z0-9` becomes a
 * single space, and surrounding whitespace is removed.
 *
 * @param value Raw text (a heard transcript prefix or a configured wake name).
 * @returns The canonical form, or `undefined` when nothing alphanumeric remains.
 */
function normalizeWakeNameCandidate(value: string): string | undefined {
  const lowered = value.toLowerCase();
  const spaced = lowered.replace(/[^a-z0-9]+/g, " ");
  const collapsed = spaced.replace(/\s+/g, " ").trim();
  if (collapsed.length === 0) {
    return undefined;
  }
  return collapsed;
}
/**
 * Removes every character outside `a-z0-9` so multi-word names compare as one token
 * (for example "open claw" becomes "openclaw").
 *
 * The pattern is case-sensitive, so uppercase letters are removed as well; callers
 * are expected to pass text that has already been lowercased.
 *
 * @param value Already-normalized wake-name text.
 * @returns The text with all non `a-z0-9` characters dropped.
 */
function compactWakeName(value: string): string {
  const nonAlphanumericRuns = /[^a-z0-9]+/g;
  return value.replace(nonAlphanumericRuns, "");
}
/**
 * Escapes regular-expression metacharacters so the value can be embedded in a
 * `RegExp` source and matched literally.
 *
 * @param value Text to embed in a pattern.
 * @returns The text with each of `. * + ? ^ $ { } ( ) | [ ] \` prefixed by a backslash.
 */
function escapeRegExp(value: string): string {
  return value.replace(/[.*+?^${}()|[\]\\]/g, (metaChar) => `\\${metaChar}`);
}
@@ -384,6 +398,136 @@ function stripLeadingWakeName(text: string, wakeName: string): string {
.trim();
}
/** A possible wake name formed from the first one or more words of a transcript. */
type LeadingWakeNameCandidate = {
/** Normalized (lowercased, space-separated) text of the candidate words. */
heardName: string;
/** Index in the original transcript just past the candidate's last word. */
endIndex: number;
/** True when the candidate is followed by punctuation (`,.:;!?-`) or the end of the text. */
strongBoundary: boolean;
};
/**
 * Outcome of the wake-name gate for one transcript.
 * When `allowed` is true, `text` is the transcript to forward, `wakeName` is the
 * configured name that matched, `heardName` is what was heard, and `match` records
 * whether the comparison was exact or fuzzy. When `allowed` is false only `text` is set.
 */
type WakeNameTranscriptResult =
| { allowed: true; text: string; wakeName: string; heardName: string; match: "exact" | "fuzzy" }
| { allowed: false; text: string };
/** The accepted (`allowed: true`) variant of {@link WakeNameTranscriptResult}. */
type AllowedWakeNameTranscriptResult = Extract<WakeNameTranscriptResult, { allowed: true }>;
/**
 * Builds wake-name candidates from the first words of a transcript.
 *
 * An optional opener ("hey", "ok", "okay" plus separator) is skipped first. Then up
 * to `DISCORD_REALTIME_WAKE_NAME_FUZZY_PREFIX_WORDS` alphanumeric words are read, and
 * each successive prefix (one word, two words, ...) becomes one candidate. Words
 * after the first are only joined when separated by nothing but whitespace,
 * apostrophes, or hyphens.
 *
 * @param text Raw transcript text.
 * @returns Candidates ordered from shortest to longest prefix; empty when no word is found.
 */
function leadingWakeNameCandidates(text: string): LeadingWakeNameCandidate[] {
  const openerMatch = /^\s*(?:(?:hey|ok|okay)(?:\s*[-,:;]+\s*|\s+))?/i.exec(text);
  const nameStart = openerMatch ? openerMatch[0].length : 0;

  // Global regex so successive exec() calls walk forward from the end of the opener.
  const wordScanner = /[a-z0-9]+/gi;
  wordScanner.lastIndex = nameStart;

  const found: LeadingWakeNameCandidate[] = [];
  let previousEnd = nameStart;
  while (found.length < DISCORD_REALTIME_WAKE_NAME_FUZZY_PREFIX_WORDS) {
    const word = wordScanner.exec(text);
    if (word === null) {
      break;
    }
    if (found.length > 0) {
      // Stop extending the name once anything other than space/'/- separates the words.
      const gap = text.slice(previousEnd, word.index);
      if (!/^[\s'-]+$/.test(gap)) {
        break;
      }
    }
    const endIndex = word.index + word[0].length;
    const heardName = normalizeWakeNameCandidate(text.slice(nameStart, endIndex));
    if (!heardName) {
      break;
    }
    // Punctuation or end of text right after the candidate marks it as an address.
    const strongBoundary = /^\s*([,.:;!?-]|$)/.test(text.slice(endIndex));
    found.push({ heardName, endIndex, strongBoundary });
    previousEnd = endIndex;
  }
  return found;
}
/**
 * Computes the Levenshtein edit distance between two strings: the minimum number
 * of single-character insertions, deletions, and substitutions that turn one into
 * the other. Only two rows of the dynamic-programming table are kept at a time.
 *
 * @param left First string.
 * @param right Second string.
 * @returns The edit distance (0 when the strings are equal).
 */
function levenshteinDistance(left: string, right: string): number {
  if (left === right) {
    return 0;
  }
  if (left.length === 0) {
    return right.length;
  }
  if (right.length === 0) {
    return left.length;
  }

  const width = right.length + 1;
  // Row 0: distance from the empty prefix of `left` to each prefix of `right`.
  let prevRow: number[] = [];
  for (let col = 0; col < width; col += 1) {
    prevRow.push(col);
  }

  for (let row = 1; row <= left.length; row += 1) {
    const leftChar = left[row - 1];
    const nextRow: number[] = [row];
    for (let col = 1; col < width; col += 1) {
      const insertion = nextRow[col - 1] + 1;
      const deletion = prevRow[col] + 1;
      const substitution = prevRow[col - 1] + (leftChar === right[col - 1] ? 0 : 1);
      nextRow.push(Math.min(insertion, deletion, substitution));
    }
    prevRow = nextRow;
  }
  return prevRow[right.length] ?? Math.max(left.length, right.length);
}
/**
 * Decides whether a heard candidate is close enough to a configured wake name.
 *
 * Both sides are compared in compact form (no separators). A match requires:
 * - the compact wake name to be at least 5 characters long,
 * - the candidate to be followed by a strong boundary, and
 * - an edit distance of at most 1, or exactly 2 when the two compact forms differ
 *   in length.
 *
 * @param candidate Leading transcript candidate to test.
 * @param wakeName Configured wake name.
 * @returns `true` when the candidate should be accepted as the wake name.
 */
function isFuzzyWakeNameMatch(candidate: LeadingWakeNameCandidate, wakeName: string): boolean {
  const canonicalWakeName = normalizeWakeNameCandidate(wakeName);
  if (!canonicalWakeName) {
    return false;
  }

  const heard = compactWakeName(candidate.heardName);
  const target = compactWakeName(canonicalWakeName);
  // Short wake names are excluded: a couple of edits would match too many words.
  const comparable = heard.length > 0 && target.length >= 5;
  if (!comparable || !candidate.strongBoundary) {
    return false;
  }

  const edits = levenshteinDistance(heard, target);
  if (edits <= 1) {
    return true;
  }
  // Two edits are tolerated only when a character was dropped or added, not for
  // same-length look-alikes.
  return edits === 2 && heard.length !== target.length;
}
/**
 * Removes a matched leading candidate from the transcript, together with any
 * punctuation and whitespace that immediately follows it.
 *
 * @param text Original transcript.
 * @param candidate Candidate whose `endIndex` marks where the wake name ends.
 * @returns The trimmed remainder of the transcript (possibly empty).
 */
function stripLeadingWakeNameCandidate(text: string, candidate: LeadingWakeNameCandidate): string {
  const remainder = text.slice(candidate.endIndex);
  const withoutSeparator = remainder.replace(/^\s*(?:[-,:;.!?]+\s*)?/, "");
  return withoutSeparator.trim();
}
/**
 * Looks for a configured wake name at the start of a transcript.
 *
 * Candidates are tried from the shortest leading prefix to the longest, and for
 * each candidate the wake names are tried in the given order. A wake name matches
 * when its compact form equals the candidate's compact form ("exact"), or when
 * `isFuzzyWakeNameMatch` accepts the pair ("fuzzy"). The first match wins.
 *
 * @param text Raw transcript text.
 * @param wakeNames Configured wake names.
 * @returns An accepted result with the wake name stripped from `text`, or
 *   `undefined` when no candidate matches.
 */
function matchLeadingFuzzyWakeName(
  text: string,
  wakeNames: string[],
): AllowedWakeNameTranscriptResult | undefined {
  for (const candidate of leadingWakeNameCandidates(text)) {
    const heardCompact = compactWakeName(candidate.heardName);
    for (const wakeName of wakeNames) {
      const canonicalWakeName = normalizeWakeNameCandidate(wakeName);
      if (!canonicalWakeName) {
        continue;
      }
      const isExact = heardCompact === compactWakeName(canonicalWakeName);
      if (!isExact && !isFuzzyWakeNameMatch(candidate, wakeName)) {
        continue;
      }
      return {
        allowed: true,
        text: stripLeadingWakeNameCandidate(text, candidate),
        wakeName,
        heardName: candidate.heardName,
        match: isExact ? "exact" : "fuzzy",
      };
    }
  }
  return undefined;
}
function resolveDiscordRealtimeWakeNames(params: {
config: DiscordRealtimeVoiceConfig;
cfg: OpenClawConfig;
@@ -1273,13 +1417,26 @@ export class DiscordRealtimeVoiceSession implements VoiceRealtimeSession {
this.talkback.enqueue(acceptedText, this.consumePendingSpeakerContext());
}
private resolveWakeNameTranscript(text: string): { allowed: boolean; text: string } {
private resolveWakeNameTranscript(text: string): WakeNameTranscriptResult {
if (!this.requireWakeName) {
return { allowed: true, text };
return { allowed: true, text, wakeName: "", heardName: "", match: "exact" };
}
const wakeName = this.wakeNames.find((name) => includesWakeName(text, name));
if (wakeName) {
return { allowed: true, text: stripLeadingWakeName(text, wakeName) };
return {
allowed: true,
text: stripLeadingWakeName(text, wakeName),
wakeName,
heardName: wakeName,
match: "exact",
};
}
const fuzzyWakeName = matchLeadingFuzzyWakeName(text, this.wakeNames);
if (fuzzyWakeName) {
logger.info(
`discord voice: realtime wake-name gate matched canonical=${fuzzyWakeName.wakeName} heard=${fuzzyWakeName.heardName} match=${fuzzyWakeName.match} voiceSession=${this.params.entry.voiceSessionKey} agent=${this.params.entry.route.agentId}`,
);
return fuzzyWakeName;
}
return { allowed: false, text };
}