From 18801b2fe54493d233e335aee4ae129a8ec457e0 Mon Sep 17 00:00:00 2001 From: Eval Eddie Date: Thu, 14 May 2026 19:17:01 +0000 Subject: [PATCH] Apply same browser-leak protection to claude_code_harness + pi_harness The cubic-ai review caught this leak class in codex_harness (commit e3a3ebf), but claude_code_harness and pi_harness share the same start_remote_daemon API and the same execute() shape: _start_browser is called before env/cmd build and before asyncio.create_subprocess_exec, but _stop_browser only runs in an inner try/finally that begins AFTER subprocess spawn. A failure in env build, system_prompt read, _build_*_cmd, or asyncio.create_subprocess_exec leaks the provisioned cloud browser. Wraps env+cmd build in a try/except that calls _stop_browser and re-raises, and the subprocess spawn + stderr_task creation in a separate try/except that also kills any partially-started proc before tearing down the browser. Same pattern as the codex_harness fix; symmetric now for all three browser-harness frameworks (claude_code_harness, pi_harness, codex_harness). --- frameworks/claude_code_harness/run_task.py | 58 ++++++++++++-------- frameworks/pi_harness/run_task.py | 62 +++++++++++++--------- 2 files changed, 74 insertions(+), 46 deletions(-) diff --git a/frameworks/claude_code_harness/run_task.py b/frameworks/claude_code_harness/run_task.py index dc0e18e..f41b567 100644 --- a/frameworks/claude_code_harness/run_task.py +++ b/frameworks/claude_code_harness/run_task.py @@ -1,4 +1,4 @@ -"""Run a single benchmark task using Claude Code driving browser-harness. +"""Run a single benchmark task using Claude Code driving browser-harness. This framework wraps Claude Code (the CLI coding agent) around the browser-harness repo: Claude Code owns the agent loop, we just hand it a task and a workdir @@ -258,17 +258,21 @@ async def execute(task_description: str) -> ExecutionResult: # Pre-provision the browser so Claude starts with a live CDP attach. 
_start_browser(browser_name, bu_name) - env = { - **os.environ, - "BU_NAME": bu_name, - "DISABLE_TELEMETRY": "1", - "DISABLE_AUTOUPDATER": "1", - "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1", - } - - cmd = _build_claude_cmd( - task_description, model_name, max_turns, max_budget_usd, use_bare - ) + try: + env = { + **os.environ, + "BU_NAME": bu_name, + "DISABLE_TELEMETRY": "1", + "DISABLE_AUTOUPDATER": "1", + "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1", + } + + cmd = _build_claude_cmd( + task_description, model_name, max_turns, max_budget_usd, use_bare + ) + except Exception: + _stop_browser(browser_name, bu_name) + raise start = time.time() steps: list[str] = [] @@ -284,16 +288,26 @@ async def execute(task_description: str) -> ExecutionResult: # asyncio StreamReader line buffer is 64 KiB which raises ValueError on # long lines, and even a larger limit has a ceiling. Read raw chunks and # split on newlines ourselves. - proc = await asyncio.create_subprocess_exec( - *cmd, - cwd=HARNESS_DIR, - env=env, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - limit=256 * 1024 * 1024, # 256 MiB safety cap - ) - - stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + proc: asyncio.subprocess.Process | None = None + try: + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=HARNESS_DIR, + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, # 256 MiB safety cap + ) + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + except Exception: + if proc is not None and proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + _stop_browser(browser_name, bu_name) + raise async def _iter_stdout_lines(): """Yield one stream-json line at a time, regardless of line length.""" diff --git a/frameworks/pi_harness/run_task.py b/frameworks/pi_harness/run_task.py index 785a27f..b05ef8f 100644 --- 
a/frameworks/pi_harness/run_task.py +++ b/frameworks/pi_harness/run_task.py @@ -1,4 +1,4 @@ -"""Run a single benchmark task using pi (the @earendil-works/pi-coding-agent CLI) +"""Run a single benchmark task using pi (the @earendil-works/pi-coding-agent CLI) driving browser-harness. This framework is a near-mirror of `claude_code_harness`, except the coding @@ -249,19 +249,23 @@ async def execute(task_description: str) -> ExecutionResult: # Pre-provision the browser so pi starts with a live CDP attach. _start_browser(browser_name, bu_name) - env = { - **os.environ, - "BU_NAME": bu_name, - "DISABLE_TELEMETRY": "1", - # Pi-specific: skip startup network ops so a flaky pi.dev doesn't - # block the run, and disable install/update telemetry. - "PI_OFFLINE": "1", - "PI_SKIP_VERSION_CHECK": "1", - "PI_TELEMETRY": "0", - } - - system_prompt = SYSTEM_PROMPT_FILE.read_text() - cmd = _build_pi_cmd(task_description, model_name, thinking, system_prompt) + try: + env = { + **os.environ, + "BU_NAME": bu_name, + "DISABLE_TELEMETRY": "1", + # Pi-specific: skip startup network ops so a flaky pi.dev doesn't + # block the run, and disable install/update telemetry. + "PI_OFFLINE": "1", + "PI_SKIP_VERSION_CHECK": "1", + "PI_TELEMETRY": "0", + } + + system_prompt = SYSTEM_PROMPT_FILE.read_text() + cmd = _build_pi_cmd(task_description, model_name, thinking, system_prompt) + except Exception: + _stop_browser(browser_name, bu_name) + raise start = time.time() steps: list[str] = [] @@ -272,16 +276,26 @@ async def execute(task_description: str) -> ExecutionResult: # pi stream-json lines can be huge (tool results with full page HTML/text). # Same workaround as CCH: read raw chunks and split on newlines. 
- proc = await asyncio.create_subprocess_exec( - *cmd, - cwd=HARNESS_DIR, - env=env, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - limit=256 * 1024 * 1024, # 256 MiB safety cap - ) - - stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + proc: asyncio.subprocess.Process | None = None + try: + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=HARNESS_DIR, + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, # 256 MiB safety cap + ) + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + except Exception: + if proc is not None and proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + _stop_browser(browser_name, bu_name) + raise async def _iter_stdout_lines(): """Yield one stream-json line at a time, regardless of line length."""