fix(e2e): stop interrupted docker builds

This commit is contained in:
Vincent Koc
2026-06-03 13:48:24 +02:00
parent e5e6cf04a2
commit d31f4e2d62
3 changed files with 188 additions and 3 deletions

View File

@@ -65,6 +65,7 @@ Docs: https://docs.openclaw.ai
- Release/CI/E2E: reject oversized ClickClack fixture request bodies before release journey smokes can accumulate unbounded payloads.
- Release/CI/E2E: reject oversized OpenAI image-auth mock request bodies before Docker proof runs can accumulate unbounded payloads.
- Release/CI/E2E: require the Kitchen Sink RPC walk to prove every expected plugin tool is cataloged and effective before invoking tool fixtures.
- Release/CI/E2E: stop tracked Docker build commands when centralized build wrappers receive shutdown signals.
- Release/CI/E2E: fail secret-provider proof runs when temporary state cleanup still fails after retries instead of hiding the cleanup error.
- Release/CI/E2E: fail package-candidate ref proofs when temporary source worktree cleanup fails instead of leaving stale worktrees behind.
- Release/CI/E2E: remove package tarball extract directories when tar extraction fails before validation can continue.

View File

@@ -73,6 +73,15 @@ docker_build_timeout_required() {
return 1
}
# Report whether an exit status is one of the signal-style codes this helper
# treats as "interrupted": 129, 130 or 143.
#
# Arguments:
#   $1 - exit status to classify (compared as a string).
# Returns:
#   0 when $1 is exactly 129, 130 or 143; 1 otherwise.
docker_build_signal_exit_status() {
  local status="$1"
  if [ "$status" = "129" ] || [ "$status" = "130" ] || [ "$status" = "143" ]; then
    return 0
  fi
  return 1
}
docker_build_heartbeat_seconds() {
local configured="${OPENCLAW_DOCKER_BUILD_HEARTBEAT_SECONDS:-30}"
if [[ "$configured" =~ ^[0-9]+$ ]] && [ "$configured" -ge 1 ]; then
@@ -104,11 +113,76 @@ docker_build_run_logged() {
local started_at="$SECONDS"
local next_heartbeat=$heartbeat_seconds
local build_status=0
local build_pid=""
local previous_int_trap
local previous_term_trap
local previous_hup_trap
local heartbeat_sleep_pid=""
previous_int_trap="$(trap -p INT || true)"
previous_term_trap="$(trap -p TERM || true)"
previous_hup_trap="$(trap -p HUP || true)"
# Put the INT, TERM and HUP traps back to what was captured in
# previous_int_trap / previous_term_trap / previous_hup_trap (the `trap -p`
# output saved by the enclosing function). An empty capture means no trap was
# set, so the signal is reset to its default disposition instead.
docker_build_restore_signal_traps() {
  local signal_name
  local saved_trap
  for signal_name in INT TERM HUP; do
    case "$signal_name" in
      INT) saved_trap="$previous_int_trap" ;;
      TERM) saved_trap="$previous_term_trap" ;;
      HUP) saved_trap="$previous_hup_trap" ;;
    esac
    if [ -z "$saved_trap" ]; then
      trap - "$signal_name"
    else
      # The saved text is a complete `trap ...` command; replay it verbatim.
      eval "$saved_trap"
    fi
  done
}
# Send a signal to a process and everything below it.
#
# Arguments:
#   $1 - signal name passed to `kill -s`.
#   $2 - pid at the root of the tree.
#
# Children reported by `pgrep -P` are signalled first (recursively); when pgrep
# is not installed only the root is signalled. For each pid the process group
# "-pid" is tried first, then the single pid. Every failure is ignored, so the
# function always returns 0.
docker_build_signal_process_tree() {
  local signal="$1"
  local process_id="$2"
  local descendant_pid
  if command -v pgrep >/dev/null 2>&1; then
    while IFS= read -r descendant_pid; do
      if [ -z "$descendant_pid" ]; then
        continue
      fi
      docker_build_signal_process_tree "$signal" "$descendant_pid"
    done < <(pgrep -P "$process_id" 2>/dev/null || true)
  fi
  if ! kill -s "$signal" -- "-$process_id" 2>/dev/null; then
    # No such process group (or not permitted): fall back to the single pid.
    kill -s "$signal" "$process_id" 2>/dev/null || true
  fi
}
# Signal-trap body: tear down the background work tracked by the enclosing
# function, restore the caller's traps, and hand back the requested status.
#
# Arguments:
#   $1 - signal name forwarded to the build's process tree.
#   $2 - status this function returns.
#
# Reads heartbeat_sleep_pid and build_pid from the enclosing function's scope.
docker_build_stop_tracked_build() {
  local forwarded_signal="$1"
  local requested_status="$2"

  # Reap the heartbeat sleeper first so it does not outlive the wrapper.
  if [ -n "$heartbeat_sleep_pid" ]; then
    if kill -0 "$heartbeat_sleep_pid" 2>/dev/null; then
      kill "$heartbeat_sleep_pid" 2>/dev/null || true
      wait "$heartbeat_sleep_pid" 2>/dev/null || true
    fi
  fi

  # Forward the signal to the build tree and wait for the build to exit.
  if [ -n "$build_pid" ]; then
    if kill -0 "$build_pid" 2>/dev/null; then
      docker_build_signal_process_tree "$forwarded_signal" "$build_pid"
      wait "$build_pid" 2>/dev/null || true
    fi
  fi

  docker_build_restore_signal_traps
  return "$requested_status"
}
trap 'docker_build_stop_tracked_build TERM 130; return 130' INT
trap 'docker_build_stop_tracked_build TERM 143; return 143' TERM
trap 'docker_build_stop_tracked_build HUP 129; return 129' HUP
docker_build_run_command "$timeout_value" "$@" >"$log_file" 2>&1 &
local build_pid="$!"
build_pid="$!"
while kill -0 "$build_pid" 2>/dev/null; do
/bin/sleep 1
/bin/sleep 1 &
heartbeat_sleep_pid="$!"
wait "$heartbeat_sleep_pid" 2>/dev/null || true
heartbeat_sleep_pid=""
local elapsed_seconds=$((SECONDS - started_at))
if [ "$elapsed_seconds" -ge "$next_heartbeat" ] && kill -0 "$build_pid" 2>/dev/null; then
local log_bytes="0"
@@ -122,6 +196,7 @@ docker_build_run_logged() {
done
wait "$build_pid" || build_status="$?"
docker_build_restore_signal_traps
return "$build_status"
}
@@ -134,6 +209,7 @@ docker_build_with_retries() {
local max_attempts=$((retries + 1))
local log_file
local command=()
local build_status=0
while IFS= read -r -d '' part; do
command+=("$part")
done < <(docker_build_command "$@")
@@ -144,6 +220,13 @@ docker_build_with_retries() {
if docker_build_run_logged "$label" "$timeout_value" "$log_file" "${command[@]}"; then
rm -f "$log_file"
return 0
else
build_status="$?"
fi
if docker_build_signal_exit_status "$build_status"; then
rm -f "$log_file"
return "$build_status"
fi
if [ "$attempt" -ge "$max_attempts" ] || ! docker_build_transient_failure "$log_file"; then

View File

@@ -1,6 +1,7 @@
import { execFileSync } from "node:child_process";
import { execFileSync, spawn } from "node:child_process";
import {
chmodSync,
existsSync,
mkdtempSync,
mkdirSync,
readdirSync,
@@ -10,6 +11,7 @@ import {
} from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { setTimeout as delay } from "node:timers/promises";
import { describe, expect, it } from "vitest";
const HELPER_PATH = "scripts/lib/docker-build.sh";
@@ -293,6 +295,105 @@ output="$(docker_build_run e2e-build -t demo-image .)"
}
});
it("stops the tracked build command without retrying when interrupted", async () => {
  // Scratch directory: used as TMPDIR by the runner and by the fake docker,
  // which drops its marker files (docker-count, docker.pid, docker.term) there.
  const workDir = mkdtempSync(join(tmpdir(), "openclaw-docker-build-signal-"));
  try {
    const binDir = join(workDir, "bin");
    mkdirSync(binDir);
    // Fake `docker`: counts invocations, publishes its pid, prints an error
    // line, then loops until it receives TERM and records that in docker.term.
    // The TERM trap is installed BEFORE the pid file is written: the pid file
    // is what tells the test it may send the signal, so the trap must already
    // be in place by then or the forwarded TERM could kill the script before
    // docker.term is written.
    // NOTE(review): the "rpc error" line presumably matches the helper's
    // transient-failure detection so that a retry would be attempted if the
    // signal status were not honoured — confirm against docker-build.sh.
    writeFileSync(
      join(binDir, "docker"),
      `#!/bin/bash
set -euo pipefail
trap 'printf "term\\n" >"$TMPDIR/docker.term"; exit 0' TERM
count=0
if [ -f "$TMPDIR/docker-count" ]; then
count="$(<"$TMPDIR/docker-count")"
fi
count="$((count + 1))"
printf '%s\\n' "$count" >"$TMPDIR/docker-count"
printf '%s\\n' "$$" >"$TMPDIR/docker.pid"
printf 'rpc error: code = Unavailable\\n'
while true; do
/bin/sleep 1
done
`,
    );
    chmodSync(join(binDir, "docker"), 0o755);
    const rootDir = process.cwd();
    // Runner: sources the helper under test with the fake docker first on PATH
    // and retries enabled, then runs a single centralized build.
    writeFileSync(
      join(workDir, "runner.sh"),
      `#!/bin/bash
set -euo pipefail
ROOT_DIR=${shellQuote(rootDir)}
TMPDIR=${shellQuote(workDir)}
export ROOT_DIR TMPDIR
export PATH="$TMPDIR/bin:$PATH"
export OPENCLAW_DOCKER_BUILD_RETRIES=3
source "$ROOT_DIR/scripts/lib/docker-build.sh"
docker_build_run e2e-build -t demo-image .
`,
    );
    chmodSync(join(workDir, "runner.sh"), 0o755);

    /** Polls (50 x 100ms) until `filePath` exists; throws if it never appears. */
    const waitForFile = async (filePath: string) => {
      for (let attempt = 0; attempt < 50; attempt += 1) {
        if (existsSync(filePath)) {
          return;
        }
        await delay(100);
      }
      throw new Error(`file was not written: ${filePath}`);
    };

    /**
     * Polls (50 x 100ms) until `filePath` holds a complete "<digits>\n" pid line.
     * The file can exist for a moment before its content is written; accepting
     * an empty read would yield NaN and silently disable the liveness check.
     */
    const waitForPid = async (filePath: string): Promise<number> => {
      for (let attempt = 0; attempt < 50; attempt += 1) {
        if (existsSync(filePath)) {
          const text = readFileSync(filePath, "utf8");
          if (/^\d+\n$/.test(text)) {
            return Number.parseInt(text, 10);
          }
        }
        await delay(100);
      }
      throw new Error(`file was not written: ${filePath}`);
    };

    /** Resolves with the child's exit code and terminating signal. */
    const waitForExit = async (child: ReturnType<typeof spawn>) =>
      await new Promise<{ code: number | null; signal: NodeJS.Signals | null }>((resolve) => {
        child.once("exit", (code, signal) => resolve({ code, signal }));
      });

    /** Polls (50 x 100ms) until signal 0 can no longer be delivered to `pid`. */
    const waitForDead = async (pid: number) => {
      for (let attempt = 0; attempt < 50; attempt += 1) {
        try {
          process.kill(pid, 0);
        } catch {
          return;
        }
        await delay(100);
      }
      throw new Error(`process stayed alive: ${pid}`);
    };

    /**
     * Starts the runner, interrupts it with `signal` once the fake docker is up,
     * and checks that the runner exits with `expectedCode`, the fake docker saw
     * TERM, it was invoked exactly once (no retry), and its process is gone.
     */
    const runInterruptedBuild = async (signal: NodeJS.Signals, expectedCode: number) => {
      rmSync(join(workDir, "docker.pid"), { force: true });
      rmSync(join(workDir, "docker.term"), { force: true });
      rmSync(join(workDir, "docker-count"), { force: true });
      const runner = spawn(join(workDir, "runner.sh"), {
        env: { ...process.env, TMPDIR: workDir },
        stdio: "ignore",
      });
      // Set while the fake docker may still be running, so a failed assertion
      // does not leave its infinite loop behind.
      let buildPid: number | undefined;
      try {
        buildPid = await waitForPid(join(workDir, "docker.pid"));
        runner.kill(signal);
        const exit = await waitForExit(runner);
        expect(exit).toEqual({ code: expectedCode, signal: null });
        await waitForFile(join(workDir, "docker.term"));
        expect(readFileSync(join(workDir, "docker-count"), "utf8").trim()).toBe("1");
        await waitForDead(buildPid);
        // Confirmed dead: do not signal the pid again during cleanup.
        buildPid = undefined;
      } finally {
        if (runner.exitCode === null && runner.signalCode === null) {
          runner.kill("SIGKILL");
        }
        if (buildPid !== undefined) {
          try {
            process.kill(buildPid, "SIGKILL");
          } catch {
            // Already exited; nothing left to clean up.
          }
        }
      }
    };

    await runInterruptedBuild("SIGTERM", 143);
    await runInterruptedBuild("SIGINT", 130);
  } finally {
    rmSync(workDir, { recursive: true, force: true });
  }
});
it("does not delay fast successful centralized Docker builds until the next heartbeat", () => {
const workDir = mkdtempSync(join(tmpdir(), "openclaw-docker-build-fast-heartbeat-"));