From 3104f363298e07307fdbe3f55531069dfc0ae3e2 Mon Sep 17 00:00:00 2001 From: Yoshikazu Terashi Date: Wed, 27 May 2026 17:03:17 +0900 Subject: [PATCH] fix(cron): surface classified run failure causes Surface classified cron failure causes without changing raw cron JSON error text. - add additive CLI `cause` output for finished run entries with `errorReason` - persist/backfill full `FailoverReason` values on cron run-log entries - thread provider context through cron finalization so provider-specific failure causes stay accurate - extend protocol/Swift models and regression coverage for CLI JSON, run-log parsing/search, alerts, and protocol conformance Verification: - `pnpm lint --threads=8` - `pnpm protocol:check` - `pnpm exec oxfmt --check src/cli/cron-cli/shared.ts src/cli/cron-cli/shared.cause-display.test.ts src/cron/run-log.ts src/cron/run-log.error-reason.test.ts src/cron/cron-protocol-conformance.test.ts src/cron/service.failure-alert.test.ts src/cron/service/timer.ts src/cron/service/ops.ts src/gateway/protocol/schema/cron.ts scripts/protocol-gen-swift.ts` - `git diff --check` - AWS Crabbox `cbx_8a6a65ab83b0` / `run_42b73a4a9750`: 4 files, 20 tests passed - autoreview clean, no accepted/actionable findings - GitHub CI/CodeQL/OpenGrep/Workflow Sanity green/skipped/neutral on `aa29b087b2587d0aed3d409de5e7a2c706c32cdf` Co-authored-by: Yoshikazu Terashi --- .../OpenClawProtocol/GatewayModels.swift | 4 + scripts/protocol-gen-swift.ts | 2 +- src/cli/cron-cli/shared.cause-display.test.ts | 52 +++++++ src/cli/cron-cli/shared.ts | 25 +++- src/cron/cron-protocol-conformance.test.ts | 40 +++--- src/cron/run-log.error-reason.test.ts | 125 +++++++++++++++++ src/cron/run-log.ts | 41 +++++- src/cron/service.failure-alert.test.ts | 132 ++++++++++++++++++ src/cron/service/ops.ts | 1 + src/cron/service/timer.ts | 16 ++- src/gateway/protocol/schema/cron.ts | 4 + 11 files changed, 420 insertions(+), 22 deletions(-) create mode 100644 src/cli/cron-cli/shared.cause-display.test.ts create mode 100644 src/cron/run-log.error-reason.test.ts diff --git a/apps/shared/OpenClawKit/Sources/OpenClawProtocol/GatewayModels.swift b/apps/shared/OpenClawKit/Sources/OpenClawProtocol/GatewayModels.swift index e31ead254ed1..7536085f1b91 100644 --- a/apps/shared/OpenClawKit/Sources/OpenClawProtocol/GatewayModels.swift +++ b/apps/shared/OpenClawKit/Sources/OpenClawProtocol/GatewayModels.swift @@ -5508,6 +5508,7 @@ public struct CronRunLogEntry: Codable, Sendable { public let action: String public let status: AnyCodable? public let error: String? + public let errorreason: AnyCodable? public let summary: String? public let diagnostics: [String: AnyCodable]? public let delivered: Bool? @@ -5531,6 +5532,7 @@ public struct CronRunLogEntry: Codable, Sendable { action: String, status: AnyCodable?, error: String?, + errorreason: AnyCodable? = nil, summary: String?, diagnostics: [String: AnyCodable]?, delivered: Bool?, @@ -5553,6 +5555,7 @@ public struct CronRunLogEntry: Codable, Sendable { self.action = action self.status = status self.error = error + self.errorreason = errorreason self.summary = summary self.diagnostics = diagnostics self.delivered = delivered @@ -5577,6 +5580,7 @@ public struct CronRunLogEntry: Codable, Sendable { case action case status case error + case errorreason = "errorReason" case summary case diagnostics case delivered diff --git a/scripts/protocol-gen-swift.ts b/scripts/protocol-gen-swift.ts index 291afe81c062..9c4a8556f977 100644 --- a/scripts/protocol-gen-swift.ts +++ b/scripts/protocol-gen-swift.ts @@ -47,7 +47,7 @@ const DEFAULTED_OPTIONAL_INIT_PARAMS: Record> = { ArtifactsGetParams: new Set(["agentId"]), ArtifactsDownloadParams: new Set(["agentId"]), MessageActionParams: new Set(["inboundTurnKind"]), - CronRunLogEntry: new Set(["failureNotificationDelivery"]), + CronRunLogEntry: new Set(["errorReason", "failureNotificationDelivery"]), }; const header = `// Generated by scripts/protocol-gen-swift.ts — do not edit by hand\n// swiftlint:disable file_length\nimport Foundation\n\npublic let GATEWAY_PROTOCOL_VERSION = ${PROTOCOL_VERSION}\npublic let GATEWAY_MIN_PROTOCOL_VERSION = ${MIN_CLIENT_PROTOCOL_VERSION}\n\nprivate struct GatewayAnyCodingKey: CodingKey, Hashable {\n let stringValue: String\n let intValue: Int?\n\n init?(stringValue: String) {\n self.stringValue = stringValue\n self.intValue = nil\n }\n\n init?(intValue: Int) {\n self.stringValue = String(intValue)\n self.intValue = intValue\n }\n}\n\npublic enum ErrorCode: String, Codable, Sendable {\n${Object.values( diff --git a/src/cli/cron-cli/shared.cause-display.test.ts b/src/cli/cron-cli/shared.cause-display.test.ts new file mode 100644 index 000000000000..a09695273d89 --- /dev/null +++ b/src/cli/cron-cli/shared.cause-display.test.ts @@ -0,0 +1,52 @@ +import { describe, expect, it } from "vitest"; +import { defaultRuntime } from "../../runtime.js"; +import { printCronJson } from "./shared.js"; + +describe("printCronJson cause display", () => { + it("adds an additive cause without changing raw cron run errors", () => { + let written: unknown; + const original = defaultRuntime.writeJson; + defaultRuntime.writeJson = (value: unknown) => { + written = value; + }; + try { + printCronJson({ + entries: [ + { + ts: 1, + jobId: "job-1", + action: "finished", + status: "error", + errorReason: "timeout", + error: "cron: job execution timed out", + }, + ], + }); + } finally { + defaultRuntime.writeJson = original; + } + + const result = written as { entries: Array> }; + expect(result.entries[0]?.cause).toBe("timeout"); + expect(result.entries[0]?.error).toBe("cron: job execution timed out"); + expect(result.entries[0]?.errorReason).toBe("timeout"); + }); + + it("does not add cause fields to non-run-log entries", () => { + let written: unknown; + const original = defaultRuntime.writeJson; + defaultRuntime.writeJson = (value: unknown) => { + written = value; + }; + try { + printCronJson({ + entries: [{ errorReason: "timeout", status: "error" }], + }); + } finally { + defaultRuntime.writeJson = original; + } + + const result = written as { entries: Array> }; + expect(result.entries[0]?.cause).toBeUndefined(); + }); +}); diff --git a/src/cli/cron-cli/shared.ts b/src/cli/cron-cli/shared.ts index 54ac298eec9b..de6bfc6ad77a 100644 --- a/src/cli/cron-cli/shared.ts +++ b/src/cli/cron-cli/shared.ts @@ -25,8 +25,31 @@ export const getCronChannelOptions = () => { return pluginIds.length > 0 ? ["last", ...pluginIds].join("|") : "last|"; }; +function addCronRunCauseFields(value: unknown): unknown { + if (!value || typeof value !== "object") { + return value; + } + const record = value as Record; + const entries = record.entries; + if (!Array.isArray(entries)) { + return value; + } + const nextEntries = entries.map((entry) => { + if (!entry || typeof entry !== "object") { + return entry; + } + const item = entry as Record; + if (item.action !== "finished" || typeof item.errorReason !== "string") { + return item; + } + const cause = item.errorReason.trim(); + return cause ? Object.assign({}, item, { cause }) : item; + }); + return { ...record, entries: nextEntries }; +} + export function printCronJson(value: unknown) { - defaultRuntime.writeJson(value); + defaultRuntime.writeJson(addCronRunCauseFields(value)); } /** diff --git a/src/cron/cron-protocol-conformance.test.ts b/src/cron/cron-protocol-conformance.test.ts index 838016c43b5d..5815640ed529 100644 --- a/src/cron/cron-protocol-conformance.test.ts +++ b/src/cron/cron-protocol-conformance.test.ts @@ -2,7 +2,11 @@ import fs from "node:fs/promises"; import path from "node:path"; import { describe, expect, it } from "vitest"; import { MACOS_APP_SOURCES_DIR } from "../compat/legacy-names.js"; -import { CronDeliverySchema, CronJobStateSchema } from "../gateway/protocol/schema.js"; +import { + CronDeliverySchema, + CronJobStateSchema, + CronRunLogEntrySchema, +} from "../gateway/protocol/schema.js"; type SchemaLike = { anyOf?: Array; @@ -30,13 +34,9 @@ function extractDeliveryModes(schema: SchemaLike): string[] { } function extractConstUnionValues(schema: SchemaLike): string[] { - return Array.from( - new Set( - (schema.anyOf ?? []) - .map((entry) => entry?.const) - .filter((value): value is string => typeof value === "string"), - ), - ); + return (schema.anyOf ?? []) + .map((entry) => entry?.const) + .filter((value): value is string => typeof value === "string"); } const UI_FILES = ["ui/src/ui/types.ts", "ui/src/ui/ui-types.ts", "ui/src/ui/views/cron.ts"]; @@ -97,24 +97,32 @@ describe("cron protocol conformance", () => { expect(swift).toContain("let jobs:"); }); - it("cron job state schema keeps the full failover reason set", () => { - const properties = (CronJobStateSchema as SchemaLike).properties ?? {}; - const lastErrorReason = properties.lastErrorReason as SchemaLike | undefined; - if (lastErrorReason === undefined) { - throw new Error("missing lastErrorReason schema"); - } - expect(extractConstUnionValues(lastErrorReason)).toEqual([ + it("cron public schemas keep the full failover reason set", () => { + const expectedReasons = [ "auth", + "auth_permanent", "format", "rate_limit", + "overloaded", "billing", "server_error", "timeout", "model_not_found", + "session_expired", "empty_response", "no_error_details", "unclassified", "unknown", - ]); + ]; + + const stateProperties = (CronJobStateSchema as SchemaLike).properties ?? {}; + const lastErrorReason = stateProperties.lastErrorReason as SchemaLike | undefined; + expect(lastErrorReason).toBeDefined(); + expect(extractConstUnionValues(lastErrorReason ?? {})).toEqual(expectedReasons); + + const runLogProperties = (CronRunLogEntrySchema as SchemaLike).properties ?? {}; + const errorReason = runLogProperties.errorReason as SchemaLike | undefined; + expect(errorReason).toBeDefined(); + expect(extractConstUnionValues(errorReason ?? {})).toEqual(expectedReasons); }); }); diff --git a/src/cron/run-log.error-reason.test.ts b/src/cron/run-log.error-reason.test.ts new file mode 100644 index 000000000000..957ce724db83 --- /dev/null +++ b/src/cron/run-log.error-reason.test.ts @@ -0,0 +1,125 @@ +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import { describe, expect, it } from "vitest"; +import { readCronRunLogEntriesPage } from "./run-log.js"; + +describe("cron run log errorReason", () => { + it("backfills errorReason from timeout error text for older entries", async () => { + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "cron-run-log-")); + const file = path.join(dir, "job.jsonl"); + await fs.writeFile( + file, + `${JSON.stringify({ + ts: 1, + jobId: "job-1", + action: "finished", + status: "error", + error: "cron: job execution timed out", + })}\n`, + "utf8", + ); + + const page = await readCronRunLogEntriesPage(file, { limit: 10 }); + expect(page.entries[0]?.errorReason).toBe("timeout"); + }); + + it("validates persisted errorReason against the full failover reason set", async () => { + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "cron-run-log-")); + const file = path.join(dir, "job.jsonl"); + const reasons = [ + "auth", + "auth_permanent", + "format", + "rate_limit", + "overloaded", + "billing", + "server_error", + "timeout", + "model_not_found", + "session_expired", + "empty_response", + "no_error_details", + "unclassified", + "unknown", + ]; + await fs.writeFile( + file, + reasons + .map((errorReason, index) => + JSON.stringify({ + ts: index + 1, + jobId: "job-1", + action: "finished", + status: "error", + errorReason, + }), + ) + .join("\n") + "\n", + "utf8", + ); + + const page = await readCronRunLogEntriesPage(file, { limit: 50, sortDir: "asc" }); + expect(page.entries.map((entry) => entry.errorReason)).toEqual(reasons); + }); + + it("derives an invalid persisted reason from raw error text before exposing entries", async () => { + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "cron-run-log-")); + const file = path.join(dir, "job.jsonl"); + await fs.writeFile( + file, + `${JSON.stringify({ + ts: 1, + jobId: "job-1", + action: "finished", + status: "error", + error: "upstream unavailable: 503 overloaded", + errorReason: "not-a-real-reason", + })}\n`, + "utf8", + ); + + const page = await readCronRunLogEntriesPage(file, { limit: 10 }); + expect(page.entries[0]?.errorReason).toBe("overloaded"); + }); + + it("uses provider context when deriving persisted run-log reasons", async () => { + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "cron-run-log-")); + const file = path.join(dir, "job.jsonl"); + await fs.writeFile( + file, + `${JSON.stringify({ + ts: 1, + jobId: "job-1", + action: "finished", + status: "error", + error: "403 Key limit exceeded (monthly limit)", + provider: "openrouter", + })}\n`, + "utf8", + ); + + const page = await readCronRunLogEntriesPage(file, { limit: 10 }); + expect(page.entries[0]?.errorReason).toBe("billing"); + }); + + it("includes derived errorReason values in run-log search", async () => { + const dir = await fs.mkdtemp(path.join(os.tmpdir(), "cron-run-log-")); + const file = path.join(dir, "job.jsonl"); + await fs.writeFile( + file, + `${JSON.stringify({ + ts: 1, + jobId: "job-1", + action: "finished", + status: "error", + error: "cron: job execution timed out", + })}\n`, + "utf8", + ); + + const page = await readCronRunLogEntriesPage(file, { limit: 10, query: "timeout" }); + expect(page.entries).toHaveLength(1); + expect(page.entries[0]?.errorReason).toBe("timeout"); + }); +}); diff --git a/src/cron/run-log.ts b/src/cron/run-log.ts index b9f98737ddc0..03e43e43aead 100644 --- a/src/cron/run-log.ts +++ b/src/cron/run-log.ts @@ -1,6 +1,8 @@ import fsSync from "node:fs"; import fs from "node:fs/promises"; import path from "node:path"; +import { resolveFailoverReasonFromError } from "../agents/failover-error.js"; +import type { FailoverReason } from "../agents/pi-embedded-helpers/types.js"; import { parseByteSize } from "../cli/parse-bytes.js"; import type { CronConfig } from "../config/types.cron.js"; import { appendRegularFile, isPathInside, pathExists, root as fsRoot } from "../infra/fs-safe.js"; @@ -27,6 +29,7 @@ export type CronRunLogEntry = { action: "finished"; status?: CronRunStatus; error?: string; + errorReason?: FailoverReason; summary?: string; diagnostics?: CronRunDiagnostics; delivered?: boolean; @@ -72,6 +75,29 @@ type ReadCronRunLogAllPageOptions = Omit & { jobNameById?: Record; }; +const CRON_FAILOVER_REASONS = new Set([ + "auth", + "auth_permanent", + "format", + "rate_limit", + "overloaded", + "billing", + "server_error", + "timeout", + "model_not_found", + "session_expired", + "empty_response", + "no_error_details", + "unclassified", + "unknown", +]); + +function normalizeCronRunLogErrorReason(value: unknown): FailoverReason | undefined { + return typeof value === "string" && CRON_FAILOVER_REASONS.has(value as FailoverReason) + ? (value as FailoverReason) + : undefined; +} + function assertSafeCronRunLogJobId(jobId: string): string { const trimmed = jobId.trim(); if (!trimmed) { @@ -309,12 +335,20 @@ function parseAllRunLogEntries(raw: string, opts?: { jobId?: string }): CronRunL obj.usage && typeof obj.usage === "object" ? (obj.usage as Record) : undefined; + const normalizedError = typeof obj.error === "string" ? obj.error : undefined; + const normalizedProvider = + typeof obj.provider === "string" && obj.provider.trim() ? obj.provider : undefined; + const normalizedErrorReason = + normalizeCronRunLogErrorReason(obj.errorReason) ?? + resolveFailoverReasonFromError(normalizedError, normalizedProvider) ?? + undefined; const entry: CronRunLogEntry = { ts: obj.ts, jobId: obj.jobId, action: "finished", status: obj.status, - error: obj.error, + error: normalizedError, + errorReason: normalizedErrorReason, summary: obj.summary, runId: typeof obj.runId === "string" && obj.runId.trim() ? obj.runId : undefined, diagnostics: normalizeCronRunDiagnostics(obj.diagnostics), @@ -322,8 +356,7 @@ function parseAllRunLogEntries(raw: string, opts?: { jobId?: string }): CronRunL durationMs: obj.durationMs, nextRunAtMs: obj.nextRunAtMs, model: typeof obj.model === "string" && obj.model.trim() ? obj.model : undefined, - provider: - typeof obj.provider === "string" && obj.provider.trim() ? obj.provider : undefined, + provider: normalizedProvider, usage: usage ? { input_tokens: typeof usage.input_tokens === "number" ? usage.input_tokens : undefined, @@ -447,6 +480,7 @@ export async function readCronRunLogEntriesPage( [ entry.summary ?? "", entry.error ?? "", + entry.errorReason ?? "", entry.diagnostics?.summary ?? "", ...(entry.diagnostics?.entries ?? []).map((diagnostic) => diagnostic.message), entry.jobId, @@ -535,6 +569,7 @@ export async function readCronRunLogEntriesPageAll( return [ entry.summary ?? "", entry.error ?? "", + entry.errorReason ?? "", entry.diagnostics?.summary ?? "", ...(entry.diagnostics?.entries ?? []).map((diagnostic) => diagnostic.message), entry.jobId, diff --git a/src/cron/service.failure-alert.test.ts b/src/cron/service.failure-alert.test.ts index 6adeb89d81fd..e7f4ee6b8322 100644 --- a/src/cron/service.failure-alert.test.ts +++ b/src/cron/service.failure-alert.test.ts @@ -414,6 +414,138 @@ describe("CronService failure alerts", () => { await store.cleanup(); }); + it("surfaces classified causes before raw errors in failure alerts", async () => { + const store = await makeStorePath(); + const sendCronFailureAlert = vi.fn(async () => undefined); + const runIsolatedAgentJob = vi.fn(async () => ({ + status: "error" as const, + error: "cron: job execution timed out", + })); + + const cron = createFailureAlertCron({ + storePath: store.storePath, + cronConfig: { + failureAlert: { + enabled: true, + after: 1, + }, + }, + runIsolatedAgentJob, + sendCronFailureAlert, + }); + + await cron.start(); + const job = await cron.add({ + name: "timeout cause alert", + enabled: true, + schedule: { kind: "every", everyMs: 60_000 }, + sessionTarget: "isolated", + wakeMode: "next-heartbeat", + payload: { kind: "agentTurn", message: "ping" }, + delivery: { mode: "announce", channel: "telegram", to: "19098680" }, + }); + + await cron.run(job.id, "force"); + expect(sendCronFailureAlert).toHaveBeenCalledTimes(1); + const alertText = alertCallArg(sendCronFailureAlert).text; + expect(alertText).toBe( + 'Cron job "timeout cause alert" failed 1 times\n' + + "Cause: timeout\n" + + "Last error: cron: job execution timed out", + ); + + cron.stop(); + await store.cleanup(); + }); + + it("uses provider context when surfacing failure alert causes", async () => { + const store = await makeStorePath(); + const sendCronFailureAlert = vi.fn(async () => undefined); + const runIsolatedAgentJob = vi.fn(async () => ({ + status: "error" as const, + error: "403 Key limit exceeded (monthly limit)", + provider: "openrouter", + })); + + const cron = createFailureAlertCron({ + storePath: store.storePath, + cronConfig: { + failureAlert: { + enabled: true, + after: 1, + }, + }, + runIsolatedAgentJob, + sendCronFailureAlert, + }); + + await cron.start(); + const job = await cron.add({ + name: "provider limit alert", + enabled: true, + schedule: { kind: "every", everyMs: 60_000 }, + sessionTarget: "isolated", + wakeMode: "next-heartbeat", + payload: { kind: "agentTurn", message: "ping" }, + delivery: { mode: "announce", channel: "telegram", to: "19098680" }, + }); + + await cron.run(job.id, "force"); + expect(sendCronFailureAlert).toHaveBeenCalledTimes(1); + const alertText = alertCallArg(sendCronFailureAlert).text; + expect(alertText).toBe( + 'Cron job "provider limit alert" failed 1 times\n' + + "Cause: billing\n" + + "Last error: 403 Key limit exceeded (monthly limit)", + ); + + cron.stop(); + await store.cleanup(); + }); + + it("keeps skipped alert text unchanged when the skip reason looks classifiable", async () => { + const store = await makeStorePath(); + const sendCronFailureAlert = vi.fn(async () => undefined); + const runIsolatedAgentJob = vi.fn(async () => ({ + status: "skipped" as const, + error: "cron: job execution timed out", + })); + + const cron = createFailureAlertCron({ + storePath: store.storePath, + cronConfig: { + failureAlert: { + enabled: true, + after: 1, + includeSkipped: true, + }, + }, + runIsolatedAgentJob, + sendCronFailureAlert, + }); + + await cron.start(); + const job = await cron.add({ + name: "skipped timeout", + enabled: true, + schedule: { kind: "every", everyMs: 60_000 }, + sessionTarget: "isolated", + wakeMode: "next-heartbeat", + payload: { kind: "agentTurn", message: "ping" }, + delivery: { mode: "announce", channel: "telegram", to: "19098680" }, + }); + + await cron.run(job.id, "force"); + expect(sendCronFailureAlert).toHaveBeenCalledTimes(1); + const alertText = alertCallArg(sendCronFailureAlert).text; + expect(alertText).toBe( + 'Cron job "skipped timeout" skipped 1 times\nSkip reason: cron: job execution timed out', + ); + + cron.stop(); + await store.cleanup(); + }); + it("tracks skipped runs without alerting or affecting error backoff when includeSkipped is off", async () => { const store = await makeStorePath(); const sendCronFailureAlert = vi.fn(async () => undefined); diff --git a/src/cron/service/ops.ts b/src/cron/service/ops.ts index 7fb12ce359a1..82486c5eda04 100644 --- a/src/cron/service/ops.ts +++ b/src/cron/service/ops.ts @@ -775,6 +775,7 @@ async function finishPreparedManualRun( error: coreResult.error, diagnostics: coreResult.diagnostics, delivered: coreResult.delivered, + provider: coreResult.provider, startedAt, endedAt, }, diff --git a/src/cron/service/timer.ts b/src/cron/service/timer.ts index 9516e1421794..a221849448ac 100644 --- a/src/cron/service/timer.ts +++ b/src/cron/service/timer.ts @@ -755,14 +755,20 @@ function emitFailureAlert( mode?: "announce" | "webhook"; accountId?: string; status: "error" | "skipped"; + provider?: string; }, ) { const safeJobName = params.job.name || params.job.id; const truncatedError = (params.error?.trim() || "unknown reason").slice(0, 200); + const errorReason = + params.status === "error" && typeof params.error === "string" + ? (resolveFailoverReasonFromError(params.error, params.provider) ?? undefined) + : undefined; const statusVerb = params.status === "skipped" ? "skipped" : "failed"; const detailLabel = params.status === "skipped" ? "Skip reason" : "Last error"; const text = [ `Cron job "${safeJobName}" ${statusVerb} ${params.consecutiveErrors} times`, + ...(errorReason ? [`Cause: ${errorReason}`] : []), `${detailLabel}: ${truncatedError}`, ].join("\n"); @@ -802,6 +808,7 @@ function maybeEmitFailureAlert( alertConfig: ResolvedFailureAlert | null; status: "error" | "skipped"; error?: string; + provider?: string; consecutiveCount: number; }, ) { @@ -828,6 +835,7 @@ function maybeEmitFailureAlert( mode: params.alertConfig.mode, accountId: params.alertConfig.accountId, status: params.status, + provider: params.provider, }); params.job.state.lastFailureAlertAtMs = now; } @@ -845,6 +853,7 @@ export function applyJobResult( error?: string; diagnostics?: CronRunOutcome["diagnostics"]; delivered?: boolean; + provider?: string; startedAt: number; endedAt: number; }, @@ -873,7 +882,7 @@ export function applyJobResult( job.state.lastDiagnosticSummary = summarizeCronRunDiagnostics(job.state.lastDiagnostics); job.state.lastErrorReason = result.status === "error" && typeof result.error === "string" - ? (resolveFailoverReasonFromError(result.error) ?? undefined) + ? (resolveFailoverReasonFromError(result.error, result.provider) ?? undefined) : undefined; if (result.status === "error") { state.deps.log.warn( @@ -915,6 +924,7 @@ export function applyJobResult( alertConfig, status: "error", error: result.error, + provider: result.provider, consecutiveCount: job.state.consecutiveErrors, }); } else if (result.status === "skipped") { @@ -926,6 +936,7 @@ export function applyJobResult( alertConfig, status: "skipped", error: result.error, + provider: result.provider, consecutiveCount: job.state.consecutiveSkipped, }); } else { @@ -1080,6 +1091,7 @@ function applyOutcomeToStoredJob(state: CronServiceState, result: TimedCronRunOu error: result.error, diagnostics: result.diagnostics, delivered: result.delivered, + provider: result.provider, startedAt: result.startedAt, endedAt: result.endedAt, }); @@ -1102,6 +1114,7 @@ function applyOutcomeToStoredJob(state: CronServiceState, result: TimedCronRunOu error: result.error, diagnostics: result.diagnostics, delivered: result.delivered, + provider: result.provider, startedAt: result.startedAt, endedAt: result.endedAt, }); @@ -1952,6 +1965,7 @@ export async function executeJob( error: coreResult.error, diagnostics: coreResult.diagnostics, delivered: coreResult.delivered, + provider: coreResult.provider, startedAt, endedAt, }); diff --git a/src/gateway/protocol/schema/cron.ts b/src/gateway/protocol/schema/cron.ts index c651d0a989b5..644a135be51f 100644 --- a/src/gateway/protocol/schema/cron.ts +++ b/src/gateway/protocol/schema/cron.ts @@ -64,12 +64,15 @@ const CronDeliveryStatusSchema = Type.Union([ ]); const CronFailoverReasonSchema = Type.Union([ Type.Literal("auth"), + Type.Literal("auth_permanent"), Type.Literal("format"), Type.Literal("rate_limit"), + Type.Literal("overloaded"), Type.Literal("billing"), Type.Literal("server_error"), Type.Literal("timeout"), Type.Literal("model_not_found"), + Type.Literal("session_expired"), Type.Literal("empty_response"), Type.Literal("no_error_details"), Type.Literal("unclassified"), @@ -434,6 +437,7 @@ export const CronRunLogEntrySchema = Type.Object( action: Type.Literal("finished"), status: Type.Optional(CronRunStatusSchema), error: Type.Optional(Type.String()), + errorReason: Type.Optional(CronFailoverReasonSchema), summary: Type.Optional(Type.String()), diagnostics: Type.Optional(CronRunDiagnosticsSchema), delivered: Type.Optional(Type.Boolean()),