Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 36 additions & 22 deletions frameworks/claude_code_harness/run_task.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Run a single benchmark task using Claude Code driving browser-harness.
"""Run a single benchmark task using Claude Code driving browser-harness.

This framework wraps Claude Code (the CLI coding agent) around the browser-harness
repo: Claude Code owns the agent loop; we just hand it a task and a workdir
Expand Down Expand Up @@ -258,17 +258,21 @@ async def execute(task_description: str) -> ExecutionResult:
# Pre-provision the browser so Claude starts with a live CDP attach.
_start_browser(browser_name, bu_name)

env = {
**os.environ,
"BU_NAME": bu_name,
"DISABLE_TELEMETRY": "1",
"DISABLE_AUTOUPDATER": "1",
"CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1",
}

cmd = _build_claude_cmd(
task_description, model_name, max_turns, max_budget_usd, use_bare
)
try:
env = {
**os.environ,
"BU_NAME": bu_name,
"DISABLE_TELEMETRY": "1",
"DISABLE_AUTOUPDATER": "1",
"CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1",
}

cmd = _build_claude_cmd(
task_description, model_name, max_turns, max_budget_usd, use_bare
)
except Exception:
_stop_browser(browser_name, bu_name)
raise

start = time.time()
steps: list[str] = []
Expand All @@ -284,16 +288,26 @@ async def execute(task_description: str) -> ExecutionResult:
# asyncio StreamReader line buffer is 64 KiB, so a longer line raises
# ValueError, and even a larger limit has a ceiling. Read raw chunks and
# split on newlines ourselves.
proc = await asyncio.create_subprocess_exec(
*cmd,
cwd=HARNESS_DIR,
env=env,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
limit=256 * 1024 * 1024, # 256 MiB safety cap
)

stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf))
proc: asyncio.subprocess.Process | None = None
try:
proc = await asyncio.create_subprocess_exec(
*cmd,
cwd=HARNESS_DIR,
env=env,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
limit=256 * 1024 * 1024, # 256 MiB safety cap
)
stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf))
except Exception:
if proc is not None and proc.returncode is None:
proc.kill()
try:
await asyncio.wait_for(proc.wait(), timeout=10)
except asyncio.TimeoutError:
pass
_stop_browser(browser_name, bu_name)
raise

async def _iter_stdout_lines():
"""Yield one stream-json line at a time, regardless of line length."""
Expand Down
62 changes: 38 additions & 24 deletions frameworks/pi_harness/run_task.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Run a single benchmark task using pi (the @earendil-works/pi-coding-agent CLI)
"""Run a single benchmark task using pi (the @earendil-works/pi-coding-agent CLI)
driving browser-harness.

This framework is a near-mirror of `claude_code_harness`, except the coding
Expand Down Expand Up @@ -249,19 +249,23 @@ async def execute(task_description: str) -> ExecutionResult:
# Pre-provision the browser so pi starts with a live CDP attach.
_start_browser(browser_name, bu_name)

env = {
**os.environ,
"BU_NAME": bu_name,
"DISABLE_TELEMETRY": "1",
# Pi-specific: skip startup network ops so a flaky pi.dev doesn't
# block the run, and disable install/update telemetry.
"PI_OFFLINE": "1",
"PI_SKIP_VERSION_CHECK": "1",
"PI_TELEMETRY": "0",
}

system_prompt = SYSTEM_PROMPT_FILE.read_text()
cmd = _build_pi_cmd(task_description, model_name, thinking, system_prompt)
try:
env = {
**os.environ,
"BU_NAME": bu_name,
"DISABLE_TELEMETRY": "1",
# Pi-specific: skip startup network ops so a flaky pi.dev doesn't
# block the run, and disable install/update telemetry.
"PI_OFFLINE": "1",
"PI_SKIP_VERSION_CHECK": "1",
"PI_TELEMETRY": "0",
}

system_prompt = SYSTEM_PROMPT_FILE.read_text()
cmd = _build_pi_cmd(task_description, model_name, thinking, system_prompt)
except Exception:
_stop_browser(browser_name, bu_name)
raise

start = time.time()
steps: list[str] = []
Expand All @@ -272,16 +276,26 @@ async def execute(task_description: str) -> ExecutionResult:

# pi stream-json lines can be huge (tool results with full page HTML/text).
# Same workaround as claude_code_harness (CCH): read raw chunks and split
# on newlines.
proc = await asyncio.create_subprocess_exec(
*cmd,
cwd=HARNESS_DIR,
env=env,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
limit=256 * 1024 * 1024, # 256 MiB safety cap
)

stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf))
proc: asyncio.subprocess.Process | None = None
try:
proc = await asyncio.create_subprocess_exec(
*cmd,
cwd=HARNESS_DIR,
env=env,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
limit=256 * 1024 * 1024, # 256 MiB safety cap
)
stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf))
except Exception:
if proc is not None and proc.returncode is None:
proc.kill()
try:
await asyncio.wait_for(proc.wait(), timeout=10)
except asyncio.TimeoutError:
pass
_stop_browser(browser_name, bu_name)
raise

async def _iter_stdout_lines():
"""Yield one stream-json line at a time, regardless of line length."""
Expand Down