ci: stabilize release live QA gates

This commit is contained in:
Peter Steinberger
2026-05-26 17:34:35 +01:00
parent cf21c8abcb
commit 27359ec417
4 changed files with 36 additions and 10 deletions

View File

@@ -96,6 +96,9 @@ steps:
- set: wakeMarker
value:
expr: "`QA-CAPABILITY-${randomUUID().slice(0, 8)}`"
- set: wakeStartIndex
value:
expr: "state.getSnapshot().messages.length"
- call: patchConfig
args:
- env:
@@ -111,6 +114,9 @@ steps:
ref: originalImageGenerationModelPrimary
sessionKey:
ref: sessionKey
deliveryContext:
channel: qa-channel
to: dm:qa-operator
note:
ref: wakeMarker
- call: waitForGatewayHealthy
@@ -121,6 +127,16 @@ steps:
args:
- ref: env
- 60000
- call: waitForOutboundMessage
saveAs: wakeReply
args:
- ref: state
- lambda:
params: [candidate]
expr: "candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && String(candidate.text ?? '').includes(wakeMarker)"
- expr: liveTurnTimeoutMs(env, 60000)
- sinceIndex:
ref: wakeStartIndex
- call: waitForCondition
saveAs: afterTools
args:

View File

@@ -494,7 +494,7 @@ describeLive("subagent announce live", () => {
entry: spawnedRun,
message: steerToken,
});
expect(steerResult.status).toBe("accepted");
expect(["accepted", "done"]).toContain(steerResult.status);
const steeredRun = await waitFor("steered child completion", () => {
if (initialError) {

View File

@@ -72,11 +72,19 @@ function createNoopTools() {
];
}
function replayValidationTools(model: Model<Api>) {
// Responses-family providers may force a new tool call whenever tools are
// present. These live probes validate repaired historical transcript shape,
// not fresh tool invocation.
return isOpenAIResponsesFamily(model.api) ? undefined : createNoopTools();
function replayValidationTools() {
return createNoopTools();
}
/**
 * Payload hook for the replay probes: pins `tool_choice` to `"none"` on
 * outgoing requests for Responses-family models.
 *
 * Replay probes carry historical tool calls so transcript repair can be
 * validated, but the live request itself must not be pushed into making a
 * fresh noop tool call.
 *
 * @param payload - Outgoing request payload; only non-null objects are touched.
 * @param model - Model under test; `model.api` is checked with
 *   `isOpenAIResponsesFamily`.
 * @returns The same payload object (mutated in place) with
 *   `tool_choice: "none"`, or `undefined` when the model is not in the
 *   Responses family or the payload is not a non-null object.
 */
function disableResponsesReplayToolChoice(payload: unknown, model: Model<Api>): unknown {
  const targetsResponsesApi = isOpenAIResponsesFamily(model.api);
  const hasObjectPayload = Boolean(payload) && typeof payload === "object";
  if (!(targetsResponsesApi && hasObjectPayload)) {
    // NOTE(review): presumably `undefined` tells the caller to keep the
    // payload as-is — confirm against the onPayload contract.
    return undefined;
  }

  // Deliberately edits the caller's object rather than a copy.
  const mutablePayload = payload as { tool_choice?: unknown };
  mutablePayload.tool_choice = "none";
  return mutablePayload;
}
function buildReplayMessages(model: Model<Api>): AgentMessage[] {
@@ -270,12 +278,13 @@ describeLive("tool replay repair live", () => {
{
systemPrompt: "You are a concise assistant. Follow the user's instruction exactly.",
messages: sanitized as never,
tools: replayValidationTools(model),
tools: replayValidationTools(),
},
{
apiKey: requireApiKey(apiKeyInfo, model.provider),
reasoning: "low",
maxTokens: 96,
onPayload: disableResponsesReplayToolChoice,
},
120_000,
);
@@ -345,12 +354,13 @@ describeLive("tool replay repair live", () => {
{
systemPrompt: "You are a concise assistant. Follow the user's instruction exactly.",
messages: transformed as never,
tools: replayValidationTools(model),
tools: replayValidationTools(),
},
{
apiKey: requireApiKey(apiKeyInfo, model.provider),
reasoning: "low",
maxTokens: 96,
onPayload: disableResponsesReplayToolChoice,
},
120_000,
);

View File

@@ -1082,10 +1082,10 @@ describe("getHighSignalLiveModelPriorityIndex", () => {
it("prefers curated Google replacements over big-pickle", () => {
expect(
getHighSignalLiveModelPriorityIndex({ provider: "google", id: "gemini-3.1-pro-preview" }),
).toBe(3);
).toBe(2);
expect(
getHighSignalLiveModelPriorityIndex({ provider: "google", id: "gemini-3-flash-preview" }),
).toBe(4);
).toBe(3);
expect(getHighSignalLiveModelPriorityIndex({ provider: "opencode", id: "big-pickle" })).toBe(
null,
);