From fa384d4de0c489e91b2f8e14e31e31fc662867c7 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Wed, 27 May 2026 00:41:25 +0100 Subject: [PATCH] fix: filter claude autoreview streaming --- .agents/skills/autoreview/SKILL.md | 4 +- .agents/skills/autoreview/scripts/autoreview | 91 +++++++++++++++++++- 2 files changed, 91 insertions(+), 4 deletions(-) diff --git a/.agents/skills/autoreview/SKILL.md b/.agents/skills/autoreview/SKILL.md index 6df582ad51e8..e7ff79ab6d9c 100644 --- a/.agents/skills/autoreview/SKILL.md +++ b/.agents/skills/autoreview/SKILL.md @@ -27,7 +27,7 @@ Use when: - For security-audit suppression changes, verify accepted findings remain auditable: suppressed findings stay in structured output, active output keeps an unsuppressible suppression notice, and aggregate findings cannot hide unrelated active risk. - Never switch or override the requested review engine/model. If the review hits model capacity, retry the same command a few times with the same engine/model. - Be patient with large bundles. Structured review can take up to 30 minutes while the model call is active, especially with Codex tools or web search. -- Treat heartbeat lines like `review still running: ... elapsed=... pid=...` as healthy progress, not a hang. Let the helper continue while heartbeats are advancing. Pass `--stream-engine-output` when live engine text is useful; Codex filters tool/file chatter and other engines pass raw output through. +- Treat heartbeat lines like `review still running: ... elapsed=... pid=...` as healthy progress, not a hang. Let the helper continue while heartbeats are advancing. Pass `--stream-engine-output` when live engine text is useful; Codex and Claude filter tool/file chatter, other engines pass raw output through. - Do not kill a review just because it has been quiet for 2-5 minutes, or because it is still running under the 30-minute window. Inspect the process only after missing multiple expected heartbeats, after 30 minutes, or after an obviously failed subprocess; prefer letting the same helper command finish. - Tools are useful in review mode. The helper allows read-only inspection tools and web search by default so reviewers can check dependency contracts, upstream docs, and current behavior. - Security perspective is always included, but it should not cripple legitimate functionality. Report security findings only when the change creates a concrete, actionable risk or removes an important safety check. @@ -171,7 +171,7 @@ The helper: - should be left in `--mode auto` or forced to `--mode branch` for PR/branch work; do not force `--mode local` after committing - writes only to stdout unless `--output`, `--json-output`, or live streamed engine stderr is set - supports `--dry-run`, `--parallel-tests`, `--prompt`, `--prompt-file`, `--dataset`, `--no-tools`, `--no-web-search`, and commit refs -- supports `--stream-engine-output` or `AUTOREVIEW_STREAM_ENGINE_OUTPUT=1` for live engine text while preserving structured validation; Codex hides tool/file event details, emits compact activity summaries, and reports token usage at turn completion +- supports `--stream-engine-output` or `AUTOREVIEW_STREAM_ENGINE_OUTPUT=1` for live engine text while preserving structured validation; Codex and Claude hide tool/file event details, emit compact activity summaries, and report usage at turn completion - supports opt-in review panels with `--panel` / `--reviewers`, plus per-engine `--model` and `--thinking` - allows read-only tools and web search by default where the selected CLI supports them; forbids nested review in the prompt; Codex is run through `codex exec` with read-only sandbox and structured output - prints `review still running: elapsed=s pid=` to stderr at long-running intervals while waiting for the selected review engine, unless streamed output or compact Codex activity has been visible recently diff --git a/.agents/skills/autoreview/scripts/autoreview b/.agents/skills/autoreview/scripts/autoreview index 7c18bc4f983e..653f75c455a9 100755 --- a/.agents/skills/autoreview/scripts/autoreview +++ b/.agents/skills/autoreview/scripts/autoreview @@ -480,7 +480,14 @@ def run_claude(args: argparse.Namespace, repo: Path, prompt: str) -> str: cmd.extend(["--model", args.model]) if args.thinking: cmd.extend(["--effort", args.thinking]) - result = run_with_heartbeat(cmd, repo, input_text=prompt, label="claude", stream_output=args.stream_engine_output) + result = run_with_heartbeat( + cmd, + repo, + input_text=prompt, + label="claude", + stream_output=args.stream_engine_output, + stream_display=ClaudeStreamDisplay() if args.stream_engine_output else None, + ) if result.returncode != 0: raise SystemExit(f"claude engine failed ({result.returncode})\n{result.stderr or result.stdout}") return result.stdout @@ -598,6 +605,80 @@ class CodexStreamDisplay: return text +class ClaudeStreamDisplay: + def __init__(self, *, activity_seconds: int = 20) -> None: + self.activity_seconds = activity_seconds + self.hidden_events = 0 + self.last_visible = time.monotonic() + self.started = False + + def __call__(self, name: str, line: str) -> str | None: + if name != "stdout": + return line + try: + event = json.loads(line) + except json.JSONDecodeError: + return self.visible(line) + event_type = event.get("type") + if event_type == "system" and not self.started: + self.started = True + return self.visible("claude turn started\n") + if event_type == "assistant": + return self.assistant_message(event) + if event_type == "result": + return self.visible(self.flush_hidden() + self.result_summary(event)) + return self.hidden_activity() + + def assistant_message(self, event: dict[str, Any]) -> str | None: + message = event.get("message") + if not isinstance(message, dict): + return self.hidden_activity() + chunks: list[str] = [] + for item in message.get("content", []): + if not isinstance(item, dict): + continue + if item.get("type") == "text" and isinstance(item.get("text"), str): + chunks.append(item["text"].rstrip()) + if chunks: + return self.visible(self.flush_hidden() + "\n".join(chunks) + "\n") + return self.hidden_activity() + + def result_summary(self, event: dict[str, Any]) -> str: + usage = event.get("usage") + fields: list[str] = [] + if isinstance(usage, dict): + for key in ( + "input_tokens", + "cache_read_input_tokens", + "cache_creation_input_tokens", + "output_tokens", + ): + value = usage.get(key) + if isinstance(value, int): + fields.append(f"{key}={value}") + cost = event.get("total_cost_usd") + if isinstance(cost, (int, float)) and not isinstance(cost, bool): + fields.append(f"cost_usd={cost:.6f}") + return "claude usage: " + " ".join(fields) + "\n" if fields else "claude turn completed\n" + + def hidden_activity(self) -> str | None: + self.hidden_events += 1 + if time.monotonic() - self.last_visible < self.activity_seconds: + return None + return self.visible(self.flush_hidden()) + + def flush_hidden(self) -> str: + if not self.hidden_events: + return "" + count = self.hidden_events + self.hidden_events = 0 + return f"claude activity: {count} hidden tool/status events\n" + + def visible(self, text: str) -> str: + self.last_visible = time.monotonic() + return text + + def format_codex_usage(usage: dict[str, Any]) -> str: fields = [ "input_tokens", @@ -646,7 +727,7 @@ def extract_json(text: str) -> dict[str, Any]: def extract_json_from_jsonl(text: str) -> dict[str, Any] | None: - candidates: list[str] = [] + candidates: list[str | dict[str, Any]] = [] for line in text.splitlines(): line = line.strip() if not line: @@ -665,7 +746,13 @@ def extract_json_from_jsonl(text: str) -> dict[str, Any] | None: candidates.append(data["content"]) if isinstance(event.get("result"), str): candidates.append(event["result"]) + if isinstance(event.get("structured_output"), dict): + candidates.append(event["structured_output"]) for candidate in reversed(candidates): + if isinstance(candidate, dict): + if "findings" in candidate: + return candidate + continue parsed = parse_json_candidate(candidate) if isinstance(parsed, dict) and "findings" in parsed: return parsed