From a8fc28c71a533974d8a47d6794adc6b23908b699 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Mon, 25 May 2026 08:31:27 +0200 Subject: [PATCH] fix(perf): fail startup bench on early gateway exit --- CHANGELOG.md | 1 + scripts/bench-gateway-startup.ts | 65 ++++++-- test/scripts/bench-gateway-startup.test.ts | 173 ++++++++++++++++++++- 3 files changed, 225 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 928e4f69ba0c..bc97840f1b86 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ Docs: https://docs.openclaw.ai - Install/update: bypass npm `min-release-age` policies with `--min-release-age=0` instead of `--before` so hosted installers keep working on npm versions that reject the combined config. (#84749) Thanks @TeodoroRodrigo. - WebChat: keep message-tool replies visible in the chat while still summarizing internal tool results for the model. Fixes #86347. Thanks @shakkernerd. +- Gateway/perf: fail startup benchmark samples when the Gateway process exits before benchmark teardown, including signal deaths after readiness probes. - Agents/commitments: serialize commitment store load-modify-save writes so concurrent heartbeat and CLI updates no longer lose dismissal, sent, or attempt state. (#81153) Thanks @ai-hpc. - xAI/LM Studio: promote plain-text tool-call fallbacks into structured tool calls and strip leaked internal tool syntax before user-facing delivery. (#86222) Thanks @fuller-stack-dev. - CLI: suppress benign self-update version-skew warnings during package post-update finalization. diff --git a/scripts/bench-gateway-startup.ts b/scripts/bench-gateway-startup.ts index 17441b819130..4fe2e17314bc 100644 --- a/scripts/bench-gateway-startup.ts +++ b/scripts/bench-gateway-startup.ts @@ -34,6 +34,7 @@ type ProbeTransition = { type GatewaySample = { cpuCoreRatio: number | null; cpuMs: number | null; + exitedBeforeTeardown?: boolean; exitCode: number | null; firstOutputMs: number | null; gatewayReadyLogLine: string | null; @@ -79,6 +80,15 @@ type BenchmarkFailure = { sampleIndex: number; }; +type ChildExit = { + exitCode: number | null; + signal: string | null; +}; + +type StopChildResult = ChildExit & { + exitedBeforeTeardown: boolean; +}; + type PluginFixtureResult = { pluginIds: string[]; pluginsDir: string; @@ -407,6 +417,17 @@ function collectResultFailures( reason: `missing ${missing.join(", ")}`, sampleIndex: index + 1, }); + return; + } + if (sample.exitedBeforeTeardown === true) { + failures.push({ + id: result.id, + reason: + sample.signal == null + ? `child exited ${sample.exitCode ?? "before teardown"}` + : `child exited by ${sample.signal}`, + sampleIndex: index + 1, + }); } }); } @@ -687,36 +708,52 @@ function sanitizedEnv( return env; } -async function stopChild(child: ChildProcessWithoutNullStreams): Promise<{ - exitCode: number | null; - signal: string | null; -}> { - if (child.exitCode != null || child.signalCode != null) { - return { exitCode: child.exitCode, signal: child.signalCode }; +async function stopChild(child: ChildProcessWithoutNullStreams): Promise { + const currentExit = (): ChildExit | null => + child.exitCode != null || child.signalCode != null + ? { exitCode: child.exitCode, signal: child.signalCode } + : null; + + const existingExit = currentExit(); + if (existingExit != null) { + return { ...existingExit, exitedBeforeTeardown: true }; } - const exited = new Promise<{ exitCode: number | null; signal: string | null }>((resolve) => { - child.once("exit", (exitCode, signal) => resolve({ exitCode, signal })); + + let observedExit: ChildExit | null = null; + const exited = new Promise((resolve) => { + child.once("exit", (exitCode, signal) => { + observedExit = { exitCode, signal }; + resolve(observedExit); + }); }); - killProcessTree(child, "SIGTERM"); + + await new Promise((resolve) => setImmediate(resolve)); + const queuedExit = observedExit ?? currentExit(); + if (queuedExit != null) { + return { ...queuedExit, exitedBeforeTeardown: true }; + } + + const sentTeardownSignal = killProcessTree(child, "SIGTERM"); const timeout = delay(2000).then(() => { if (child.exitCode == null && child.signalCode == null) { killProcessTree(child, "SIGKILL"); } return exited; }); - return Promise.race([exited, timeout]); + const exit = await Promise.race([exited, timeout]); + return { ...exit, exitedBeforeTeardown: !sentTeardownSignal }; } -function killProcessTree(child: ChildProcessWithoutNullStreams, signal: NodeJS.Signals): void { +function killProcessTree(child: ChildProcessWithoutNullStreams, signal: NodeJS.Signals): boolean { if (process.platform !== "win32" && child.pid !== undefined) { try { process.kill(-child.pid, signal); - return; + return true; } catch { // Fall back to the direct child below. } } - child.kill(signal); + return child.kill(signal); } function collectStartupTrace(line: string, startupTrace: Record): void { @@ -975,6 +1012,7 @@ async function runGatewaySample(options: { return { cpuCoreRatio, cpuMs, + exitedBeforeTeardown: exit.exitedBeforeTeardown, exitCode: exit.exitCode, firstOutputMs, gatewayReadyLogLine, @@ -1109,6 +1147,7 @@ export const testing = { parsePositiveInt, resolveEntry, sanitizedEnv, + stopChild, summarizeCase, waitForProbe, writeConfig, diff --git a/test/scripts/bench-gateway-startup.test.ts b/test/scripts/bench-gateway-startup.test.ts index 41fcdf8c4008..1dd3828c8eae 100644 --- a/test/scripts/bench-gateway-startup.test.ts +++ b/test/scripts/bench-gateway-startup.test.ts @@ -1,10 +1,11 @@ import { spawnSync } from "node:child_process"; +import { EventEmitter } from "node:events"; import fs from "node:fs"; import { createServer } from "node:http"; import os from "node:os"; import path from "node:path"; import { performance } from "node:perf_hooks"; -import { describe, expect, it } from "vitest"; +import { describe, expect, it, vi } from "vitest"; import { testing } from "../../scripts/bench-gateway-startup.ts"; async function listenOnLoopback(handler: Parameters[0]) { @@ -154,6 +155,176 @@ describe("gateway startup benchmark script", () => { ]); }); + it("flags samples that become ready and then exit nonzero", () => { + const result = testing.summarizeCase({ config: {}, id: "demo", name: "demo" }, [ + { + cpuCoreRatio: 0.5, + cpuMs: 100, + exitedBeforeTeardown: true, + exitCode: 1, + firstOutputMs: 1, + gatewayReadyLogLine: "[gateway] ready", + gatewayReadyLogMs: 20, + healthz: { + firstErrorKind: "econnrefused", + firstRecoveryMs: 10, + ms: 10, + status: 200, + transitions: [], + }, + httpListenLogLine: "[gateway] http server listening (0 plugins)", + httpListenLogMs: 5, + maxRssMb: 120, + outputTail: "ready\\nError: startup sidecar crashed", + readyz: { + firstErrorKind: "http-503", + firstRecoveryMs: 18, + ms: 18, + status: 200, + transitions: [], + }, + signal: null, + startupTrace: {}, + }, + ]); + + expect(testing.collectResultFailures([result], { processMetricsRequired: true })).toEqual([ + { + id: "demo", + reason: "child exited 1", + sampleIndex: 1, + }, + ]); + }); + + it("does not flag nonzero exits from intentional teardown", () => { + const result = testing.summarizeCase({ config: {}, id: "demo", name: "demo" }, [ + { + cpuCoreRatio: 0.5, + cpuMs: 100, + exitedBeforeTeardown: false, + exitCode: 1, + firstOutputMs: 1, + gatewayReadyLogLine: "[gateway] ready", + gatewayReadyLogMs: 20, + healthz: { + firstErrorKind: "econnrefused", + firstRecoveryMs: 10, + ms: 10, + status: 200, + transitions: [], + }, + httpListenLogLine: "[gateway] http server listening (0 plugins)", + httpListenLogMs: 5, + maxRssMb: 120, + outputTail: "", + readyz: { + firstErrorKind: "http-503", + firstRecoveryMs: 18, + ms: 18, + status: 200, + transitions: [], + }, + signal: null, + startupTrace: {}, + }, + ]); + + expect(testing.collectResultFailures([result], { processMetricsRequired: true })).toEqual([]); + }); + + it("flags samples that become ready and then die from a signal", () => { + const result = testing.summarizeCase({ config: {}, id: "demo", name: "demo" }, [ + { + cpuCoreRatio: 0.5, + cpuMs: 100, + exitedBeforeTeardown: true, + exitCode: null, + firstOutputMs: 1, + gatewayReadyLogLine: "[gateway] ready", + gatewayReadyLogMs: 20, + healthz: { + firstErrorKind: "econnrefused", + firstRecoveryMs: 10, + ms: 10, + status: 200, + transitions: [], + }, + httpListenLogLine: "[gateway] http server listening (0 plugins)", + httpListenLogMs: 5, + maxRssMb: 120, + outputTail: "ready\\nsegmentation fault", + readyz: { + firstErrorKind: "http-503", + firstRecoveryMs: 18, + ms: 18, + status: 200, + transitions: [], + }, + signal: "SIGSEGV", + startupTrace: {}, + }, + ]); + + expect(testing.collectResultFailures([result], { processMetricsRequired: true })).toEqual([ + { + id: "demo", + reason: "child exited by SIGSEGV", + sampleIndex: 1, + }, + ]); + }); + + it("classifies queued child exits before sending teardown signals", async () => { + const child = new EventEmitter() as EventEmitter & { + exitCode: number | null; + kill: ReturnType; + signalCode: NodeJS.Signals | null; + }; + child.exitCode = null; + child.signalCode = null; + child.kill = vi.fn(() => true); + + const stopped = testing.stopChild(child as unknown as Parameters[0]); + queueMicrotask(() => { + child.exitCode = 7; + child.emit("exit", 7, null); + }); + + await expect(stopped).resolves.toEqual({ + exitedBeforeTeardown: true, + exitCode: 7, + signal: null, + }); + expect(child.kill).not.toHaveBeenCalled(); + }); + + it("classifies failed teardown signaling as a pre-teardown child exit", async () => { + const child = new EventEmitter() as EventEmitter & { + exitCode: number | null; + kill: ReturnType; + signalCode: NodeJS.Signals | null; + }; + child.exitCode = null; + child.signalCode = null; + child.kill = vi.fn(() => { + setImmediate(() => { + child.exitCode = 8; + child.emit("exit", 8, null); + }); + return false; + }); + + await expect( + testing.stopChild(child as unknown as Parameters[0]), + ).resolves.toEqual({ + exitedBeforeTeardown: true, + exitCode: 8, + signal: null, + }); + expect(child.kill).toHaveBeenCalledWith("SIGTERM"); + }); + it("collects Count-suffixed startup trace metrics", () => { const startupTrace: Record = {};