fix(perf): fail startup bench on early gateway exit

This commit is contained in:
Vincent Koc
2026-05-25 08:31:27 +02:00
parent cd627803a0
commit a8fc28c71a
3 changed files with 225 additions and 14 deletions

View File

@@ -12,6 +12,7 @@ Docs: https://docs.openclaw.ai
- Install/update: bypass npm `min-release-age` policies with `--min-release-age=0` instead of `--before` so hosted installers keep working on npm versions that reject the combined config. (#84749) Thanks @TeodoroRodrigo.
- WebChat: keep message-tool replies visible in the chat while still summarizing internal tool results for the model. Fixes #86347. Thanks @shakkernerd.
- Gateway/perf: fail startup benchmark samples when the Gateway process exits before benchmark teardown, including signal deaths after readiness probes.
- Agents/commitments: serialize commitment store load-modify-save writes so concurrent heartbeat and CLI updates no longer lose dismissal, sent, or attempt state. (#81153) Thanks @ai-hpc.
- xAI/LM Studio: promote plain-text tool-call fallbacks into structured tool calls and strip leaked internal tool syntax before user-facing delivery. (#86222) Thanks @fuller-stack-dev.
- CLI: suppress benign self-update version-skew warnings during package post-update finalization.

View File

@@ -34,6 +34,7 @@ type ProbeTransition = {
type GatewaySample = {
cpuCoreRatio: number | null;
cpuMs: number | null;
exitedBeforeTeardown?: boolean;
exitCode: number | null;
firstOutputMs: number | null;
gatewayReadyLogLine: string | null;
@@ -79,6 +80,15 @@ type BenchmarkFailure = {
sampleIndex: number;
};
type ChildExit = {
exitCode: number | null;
signal: string | null;
};
type StopChildResult = ChildExit & {
exitedBeforeTeardown: boolean;
};
type PluginFixtureResult = {
pluginIds: string[];
pluginsDir: string;
@@ -407,6 +417,17 @@ function collectResultFailures(
reason: `missing ${missing.join(", ")}`,
sampleIndex: index + 1,
});
return;
}
if (sample.exitedBeforeTeardown === true) {
failures.push({
id: result.id,
reason:
sample.signal == null
? `child exited ${sample.exitCode ?? "before teardown"}`
: `child exited by ${sample.signal}`,
sampleIndex: index + 1,
});
}
});
}
@@ -687,36 +708,52 @@ function sanitizedEnv(
return env;
}
async function stopChild(child: ChildProcessWithoutNullStreams): Promise<{
exitCode: number | null;
signal: string | null;
}> {
if (child.exitCode != null || child.signalCode != null) {
return { exitCode: child.exitCode, signal: child.signalCode };
async function stopChild(child: ChildProcessWithoutNullStreams): Promise<StopChildResult> {
const currentExit = (): ChildExit | null =>
child.exitCode != null || child.signalCode != null
? { exitCode: child.exitCode, signal: child.signalCode }
: null;
const existingExit = currentExit();
if (existingExit != null) {
return { ...existingExit, exitedBeforeTeardown: true };
}
const exited = new Promise<{ exitCode: number | null; signal: string | null }>((resolve) => {
child.once("exit", (exitCode, signal) => resolve({ exitCode, signal }));
let observedExit: ChildExit | null = null;
const exited = new Promise<ChildExit>((resolve) => {
child.once("exit", (exitCode, signal) => {
observedExit = { exitCode, signal };
resolve(observedExit);
});
});
killProcessTree(child, "SIGTERM");
await new Promise<void>((resolve) => setImmediate(resolve));
const queuedExit = observedExit ?? currentExit();
if (queuedExit != null) {
return { ...queuedExit, exitedBeforeTeardown: true };
}
const sentTeardownSignal = killProcessTree(child, "SIGTERM");
const timeout = delay(2000).then(() => {
if (child.exitCode == null && child.signalCode == null) {
killProcessTree(child, "SIGKILL");
}
return exited;
});
return Promise.race([exited, timeout]);
const exit = await Promise.race([exited, timeout]);
return { ...exit, exitedBeforeTeardown: !sentTeardownSignal };
}
function killProcessTree(child: ChildProcessWithoutNullStreams, signal: NodeJS.Signals): void {
function killProcessTree(child: ChildProcessWithoutNullStreams, signal: NodeJS.Signals): boolean {
if (process.platform !== "win32" && child.pid !== undefined) {
try {
process.kill(-child.pid, signal);
return;
return true;
} catch {
// Fall back to the direct child below.
}
}
child.kill(signal);
return child.kill(signal);
}
function collectStartupTrace(line: string, startupTrace: Record<string, number>): void {
@@ -975,6 +1012,7 @@ async function runGatewaySample(options: {
return {
cpuCoreRatio,
cpuMs,
exitedBeforeTeardown: exit.exitedBeforeTeardown,
exitCode: exit.exitCode,
firstOutputMs,
gatewayReadyLogLine,
@@ -1109,6 +1147,7 @@ export const testing = {
parsePositiveInt,
resolveEntry,
sanitizedEnv,
stopChild,
summarizeCase,
waitForProbe,
writeConfig,

View File

@@ -1,10 +1,11 @@
import { spawnSync } from "node:child_process";
import { EventEmitter } from "node:events";
import fs from "node:fs";
import { createServer } from "node:http";
import os from "node:os";
import path from "node:path";
import { performance } from "node:perf_hooks";
import { describe, expect, it } from "vitest";
import { describe, expect, it, vi } from "vitest";
import { testing } from "../../scripts/bench-gateway-startup.ts";
async function listenOnLoopback(handler: Parameters<typeof createServer>[0]) {
@@ -154,6 +155,176 @@ describe("gateway startup benchmark script", () => {
]);
});
it("flags samples that become ready and then exit nonzero", () => {
const result = testing.summarizeCase({ config: {}, id: "demo", name: "demo" }, [
{
cpuCoreRatio: 0.5,
cpuMs: 100,
exitedBeforeTeardown: true,
exitCode: 1,
firstOutputMs: 1,
gatewayReadyLogLine: "[gateway] ready",
gatewayReadyLogMs: 20,
healthz: {
firstErrorKind: "econnrefused",
firstRecoveryMs: 10,
ms: 10,
status: 200,
transitions: [],
},
httpListenLogLine: "[gateway] http server listening (0 plugins)",
httpListenLogMs: 5,
maxRssMb: 120,
outputTail: "ready\\nError: startup sidecar crashed",
readyz: {
firstErrorKind: "http-503",
firstRecoveryMs: 18,
ms: 18,
status: 200,
transitions: [],
},
signal: null,
startupTrace: {},
},
]);
expect(testing.collectResultFailures([result], { processMetricsRequired: true })).toEqual([
{
id: "demo",
reason: "child exited 1",
sampleIndex: 1,
},
]);
});
it("does not flag nonzero exits from intentional teardown", () => {
const result = testing.summarizeCase({ config: {}, id: "demo", name: "demo" }, [
{
cpuCoreRatio: 0.5,
cpuMs: 100,
exitedBeforeTeardown: false,
exitCode: 1,
firstOutputMs: 1,
gatewayReadyLogLine: "[gateway] ready",
gatewayReadyLogMs: 20,
healthz: {
firstErrorKind: "econnrefused",
firstRecoveryMs: 10,
ms: 10,
status: 200,
transitions: [],
},
httpListenLogLine: "[gateway] http server listening (0 plugins)",
httpListenLogMs: 5,
maxRssMb: 120,
outputTail: "",
readyz: {
firstErrorKind: "http-503",
firstRecoveryMs: 18,
ms: 18,
status: 200,
transitions: [],
},
signal: null,
startupTrace: {},
},
]);
expect(testing.collectResultFailures([result], { processMetricsRequired: true })).toEqual([]);
});
it("flags samples that become ready and then die from a signal", () => {
const result = testing.summarizeCase({ config: {}, id: "demo", name: "demo" }, [
{
cpuCoreRatio: 0.5,
cpuMs: 100,
exitedBeforeTeardown: true,
exitCode: null,
firstOutputMs: 1,
gatewayReadyLogLine: "[gateway] ready",
gatewayReadyLogMs: 20,
healthz: {
firstErrorKind: "econnrefused",
firstRecoveryMs: 10,
ms: 10,
status: 200,
transitions: [],
},
httpListenLogLine: "[gateway] http server listening (0 plugins)",
httpListenLogMs: 5,
maxRssMb: 120,
outputTail: "ready\\nsegmentation fault",
readyz: {
firstErrorKind: "http-503",
firstRecoveryMs: 18,
ms: 18,
status: 200,
transitions: [],
},
signal: "SIGSEGV",
startupTrace: {},
},
]);
expect(testing.collectResultFailures([result], { processMetricsRequired: true })).toEqual([
{
id: "demo",
reason: "child exited by SIGSEGV",
sampleIndex: 1,
},
]);
});
it("classifies queued child exits before sending teardown signals", async () => {
const child = new EventEmitter() as EventEmitter & {
exitCode: number | null;
kill: ReturnType<typeof vi.fn>;
signalCode: NodeJS.Signals | null;
};
child.exitCode = null;
child.signalCode = null;
child.kill = vi.fn(() => true);
const stopped = testing.stopChild(child as unknown as Parameters<typeof testing.stopChild>[0]);
queueMicrotask(() => {
child.exitCode = 7;
child.emit("exit", 7, null);
});
await expect(stopped).resolves.toEqual({
exitedBeforeTeardown: true,
exitCode: 7,
signal: null,
});
expect(child.kill).not.toHaveBeenCalled();
});
it("classifies failed teardown signaling as a pre-teardown child exit", async () => {
const child = new EventEmitter() as EventEmitter & {
exitCode: number | null;
kill: ReturnType<typeof vi.fn>;
signalCode: NodeJS.Signals | null;
};
child.exitCode = null;
child.signalCode = null;
child.kill = vi.fn(() => {
setImmediate(() => {
child.exitCode = 8;
child.emit("exit", 8, null);
});
return false;
});
await expect(
testing.stopChild(child as unknown as Parameters<typeof testing.stopChild>[0]),
).resolves.toEqual({
exitedBeforeTeardown: true,
exitCode: 8,
signal: null,
});
expect(child.kill).toHaveBeenCalledWith("SIGTERM");
});
it("collects Count-suffixed startup trace metrics", () => {
const startupTrace: Record<string, number> = {};