diff --git a/CHANGELOG.md b/CHANGELOG.md index b3fb7be5c2c3..0ab045f282ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -65,6 +65,7 @@ Docs: https://docs.openclaw.ai - Release/CI/E2E: reject oversized ClickClack fixture request bodies before release journey smokes can accumulate unbounded payloads. - Release/CI/E2E: reject oversized OpenAI image-auth mock request bodies before Docker proof runs can accumulate unbounded payloads. - Release/CI/E2E: require the Kitchen Sink RPC walk to prove every expected plugin tool is cataloged and effective before invoking tool fixtures. +- Release/CI/E2E: stop tracked Docker build commands when centralized build wrappers receive shutdown signals. - Release/CI/E2E: fail secret-provider proof runs when temporary state cleanup still fails after retries instead of hiding the cleanup error. - Release/CI/E2E: fail package-candidate ref proofs when temporary source worktree cleanup fails instead of leaving stale worktrees behind. - Release/CI/E2E: remove package tarball extract directories when tar extraction fails before validation can continue. 
diff --git a/scripts/lib/docker-build.sh b/scripts/lib/docker-build.sh index 6d47569b90c5..3d6e77817bf3 100644 --- a/scripts/lib/docker-build.sh +++ b/scripts/lib/docker-build.sh @@ -73,6 +73,15 @@ docker_build_timeout_required() { return 1 } +docker_build_signal_exit_status() { + case "$1" in + 129 | 130 | 143) + return 0 + ;; + esac + return 1 +} + docker_build_heartbeat_seconds() { local configured="${OPENCLAW_DOCKER_BUILD_HEARTBEAT_SECONDS:-30}" if [[ "$configured" =~ ^[0-9]+$ ]] && [ "$configured" -ge 1 ]; then @@ -104,11 +113,76 @@ docker_build_run_logged() { local started_at="$SECONDS" local next_heartbeat=$heartbeat_seconds local build_status=0 + local build_pid="" + local previous_int_trap + local previous_term_trap + local previous_hup_trap + local heartbeat_sleep_pid="" + + previous_int_trap="$(trap -p INT || true)" + previous_term_trap="$(trap -p TERM || true)" + previous_hup_trap="$(trap -p HUP || true)" + + docker_build_restore_signal_traps() { + if [ -n "$previous_int_trap" ]; then + eval "$previous_int_trap" + else + trap - INT + fi + if [ -n "$previous_term_trap" ]; then + eval "$previous_term_trap" + else + trap - TERM + fi + if [ -n "$previous_hup_trap" ]; then + eval "$previous_hup_trap" + else + trap - HUP + fi + } + + docker_build_signal_process_tree() { + local signal="$1" + local process_id="$2" + local child_pid + if command -v pgrep >/dev/null 2>&1; then + while IFS= read -r child_pid; do + if [ -n "$child_pid" ]; then + docker_build_signal_process_tree "$signal" "$child_pid" + fi + done < <(pgrep -P "$process_id" 2>/dev/null || true) + fi + kill -s "$signal" -- "-$process_id" 2>/dev/null || + kill -s "$signal" "$process_id" 2>/dev/null || + true + } + + docker_build_stop_tracked_build() { + local signal="$1" + local exit_code="$2" + if [ -n "$heartbeat_sleep_pid" ] && kill -0 "$heartbeat_sleep_pid" 2>/dev/null; then + kill "$heartbeat_sleep_pid" 2>/dev/null || true + wait "$heartbeat_sleep_pid" 2>/dev/null || true + fi + if [ -n 
"$build_pid" ] && kill -0 "$build_pid" 2>/dev/null; then + docker_build_signal_process_tree "$signal" "$build_pid" + wait "$build_pid" 2>/dev/null || true + fi + docker_build_restore_signal_traps + return "$exit_code" + } + + trap 'docker_build_stop_tracked_build TERM 130; return 130' INT + trap 'docker_build_stop_tracked_build TERM 143; return 143' TERM + trap 'docker_build_stop_tracked_build HUP 129; return 129' HUP docker_build_run_command "$timeout_value" "$@" >"$log_file" 2>&1 & - local build_pid="$!" + build_pid="$!" while kill -0 "$build_pid" 2>/dev/null; do - /bin/sleep 1 + /bin/sleep 1 & + heartbeat_sleep_pid="$!" + wait "$heartbeat_sleep_pid" 2>/dev/null || true + heartbeat_sleep_pid="" local elapsed_seconds=$((SECONDS - started_at)) if [ "$elapsed_seconds" -ge "$next_heartbeat" ] && kill -0 "$build_pid" 2>/dev/null; then local log_bytes="0" @@ -122,6 +196,7 @@ docker_build_run_logged() { done wait "$build_pid" || build_status="$?" + docker_build_restore_signal_traps return "$build_status" } @@ -134,6 +209,7 @@ docker_build_with_retries() { local max_attempts=$((retries + 1)) local log_file local command=() + local build_status=0 while IFS= read -r -d '' part; do command+=("$part") done < <(docker_build_command "$@") @@ -144,6 +220,13 @@ docker_build_with_retries() { if docker_build_run_logged "$label" "$timeout_value" "$log_file" "${command[@]}"; then rm -f "$log_file" return 0 + else + build_status="$?" + fi + + if docker_build_signal_exit_status "$build_status"; then + rm -f "$log_file" + return "$build_status" fi if [ "$attempt" -ge "$max_attempts" ] || ! 
docker_build_transient_failure "$log_file"; then
diff --git a/test/scripts/docker-build-helper.test.ts b/test/scripts/docker-build-helper.test.ts
index 3168c43794ab..1c778028c2c6 100644
--- a/test/scripts/docker-build-helper.test.ts
+++ b/test/scripts/docker-build-helper.test.ts
@@ -1,6 +1,7 @@
-import { execFileSync } from "node:child_process";
+import { execFileSync, spawn } from "node:child_process";
 import {
   chmodSync,
+  existsSync,
   mkdtempSync,
   mkdirSync,
   readdirSync,
@@ -10,6 +11,7 @@ import {
 } from "node:fs";
 import { tmpdir } from "node:os";
 import { join } from "node:path";
+import { setTimeout as delay } from "node:timers/promises";
 import { describe, expect, it } from "vitest";
 
 const HELPER_PATH = "scripts/lib/docker-build.sh";
@@ -293,6 +295,117 @@ output="$(docker_build_run e2e-build -t demo-image .)"
     }
   });
 
+  // Signals sent to the runner must stop the fake docker build once, without a retry.
+  it("stops the tracked build command without retrying when interrupted", async () => {
+    const workDir = mkdtempSync(join(tmpdir(), "openclaw-docker-build-signal-"));
+
+    try {
+      const binDir = join(workDir, "bin");
+      mkdirSync(binDir);
+      // Fake docker: counts invocations, publishes its pid, then idles until TERM.
+      // The TERM trap is installed before docker.pid is written because the test
+      // signals the runner as soon as that file appears.
+      writeFileSync(
+        join(binDir, "docker"),
+        `#!/bin/bash
+set -euo pipefail
+count=0
+if [ -f "$TMPDIR/docker-count" ]; then
+  count="$(<"$TMPDIR/docker-count")"
+fi
+count="$((count + 1))"
+printf '%s\\n' "$count" >"$TMPDIR/docker-count"
+trap 'printf "term\\n" >"$TMPDIR/docker.term"; exit 0' TERM
+printf '%s\\n' "$$" >"$TMPDIR/docker.pid"
+printf 'rpc error: code = Unavailable\\n'
+while true; do
+  /bin/sleep 1
+done
+`,
+      );
+      chmodSync(join(binDir, "docker"), 0o755);
+      const rootDir = process.cwd();
+      // Runner sources the real helper with OPENCLAW_DOCKER_BUILD_RETRIES=3, so a retry
+      // after the signal would show up as a docker-count greater than 1.
+      writeFileSync(
+        join(workDir, "runner.sh"),
+        `#!/bin/bash
+set -euo pipefail
+ROOT_DIR=${shellQuote(rootDir)}
+TMPDIR=${shellQuote(workDir)}
+export ROOT_DIR TMPDIR
+export PATH="$TMPDIR/bin:$PATH"
+export OPENCLAW_DOCKER_BUILD_RETRIES=3
+source "$ROOT_DIR/scripts/lib/docker-build.sh"
+docker_build_run e2e-build -t demo-image .
+`,
+      );
+      chmodSync(join(workDir, "runner.sh"), 0o755);
+
+      // Polls every 100 ms, for at most 50 attempts, until filePath exists; throws otherwise.
+      const waitForFile = async (filePath: string) => {
+        for (let attempt = 0; attempt < 50; attempt += 1) {
+          if (existsSync(filePath)) {
+            return;
+          }
+          await delay(100);
+        }
+        throw new Error(`file was not written: ${filePath}`);
+      };
+      // Resolves with the child's exit code and signal once it emits "exit".
+      const waitForExit = async (child: ReturnType<typeof spawn>) =>
+        await new Promise<{ code: number | null; signal: NodeJS.Signals | null }>((resolve) => {
+          child.once("exit", (code, signal) => resolve({ code, signal }));
+        });
+      // Polls every 100 ms, for at most 50 attempts, until process.kill(pid, 0) throws.
+      const waitForDead = async (pid: number) => {
+        for (let attempt = 0; attempt < 50; attempt += 1) {
+          try {
+            process.kill(pid, 0);
+          } catch {
+            return;
+          }
+          await delay(100);
+        }
+        throw new Error(`process stayed alive: ${pid}`);
+      };
+      // Starts the runner, signals it once the fake docker has published its pid, then
+      // checks the runner's exit code, the TERM marker, the invocation count and the dead pid.
+      const runInterruptedBuild = async (signal: NodeJS.Signals, expectedCode: number) => {
+        rmSync(join(workDir, "docker.pid"), { force: true });
+        rmSync(join(workDir, "docker.term"), { force: true });
+        rmSync(join(workDir, "docker-count"), { force: true });
+        const runner = spawn(join(workDir, "runner.sh"), {
+          env: { ...process.env, TMPDIR: workDir },
+          stdio: "ignore",
+        });
+        try {
+          const pidPath = join(workDir, "docker.pid");
+          await waitForFile(pidPath);
+          const buildPid = Number.parseInt(readFileSync(pidPath, "utf8"), 10);
+
+          runner.kill(signal);
+          const exit = await waitForExit(runner);
+
+          expect(exit).toEqual({ code: expectedCode, signal: null });
+          await waitForFile(join(workDir, "docker.term"));
+          expect(readFileSync(join(workDir, "docker-count"), "utf8").trim()).toBe("1");
+          await waitForDead(buildPid);
+        } finally {
+          // Kill the runner if it has not exited yet, e.g. because a step above threw.
+          if (runner.exitCode === null && runner.signalCode === null) {
+            runner.kill("SIGKILL");
+          }
+        }
+      };
+
+      await runInterruptedBuild("SIGTERM", 143);
+      await runInterruptedBuild("SIGINT", 130);
+    } finally {
+      rmSync(workDir, { recursive: true, force: true });
+    }
+  });
+
   it("does not delay fast successful centralized Docker builds until the next heartbeat", () => {
     const workDir = mkdtempSync(join(tmpdir(), "openclaw-docker-build-fast-heartbeat-"));