From 2d87a0913f7c6a8f6432e70f0496454b881d3fcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Sun, 26 Apr 2026 22:52:32 -0400 Subject: [PATCH 01/22] docs: move agent guidance into help workflows --- package.json | 2 +- skills/agent-device/SKILL.md | 80 +--- .../references/bootstrap-install.md | 244 ------------ .../references/coordinate-system.md | 28 -- skills/agent-device/references/debugging.md | 138 ------- skills/agent-device/references/exploration.md | 362 ------------------ .../agent-device/references/macos-desktop.md | 88 ----- .../agent-device/references/remote-tenancy.md | 189 --------- .../agent-device/references/verification.md | 134 ------- skills/dogfood/SKILL.md | 179 +-------- skills/dogfood/references/issue-taxonomy.md | 83 ---- .../templates/dogfood-report-template.md | 52 --- skills/react-devtools/SKILL.md | 44 +-- skills/react-devtools/references/commands.md | 91 ----- skills/react-devtools/references/profiling.md | 74 ---- src/__tests__/cli-help.test.ts | 14 +- src/utils/__tests__/args.test.ts | 19 +- src/utils/command-schema.ts | 284 +++++++++++++- test/skillgym/README.md | 5 +- test/skillgym/skillgym.config.ts | 2 +- .../suites/agent-device-smoke-suite.ts | 126 ++++-- website/docs/docs/commands.md | 11 + website/docs/docs/introduction.md | 3 +- website/docs/docs/skillgym.md | 15 +- 24 files changed, 457 insertions(+), 1810 deletions(-) delete mode 100644 skills/agent-device/references/bootstrap-install.md delete mode 100644 skills/agent-device/references/coordinate-system.md delete mode 100644 skills/agent-device/references/debugging.md delete mode 100644 skills/agent-device/references/exploration.md delete mode 100644 skills/agent-device/references/macos-desktop.md delete mode 100644 skills/agent-device/references/remote-tenancy.md delete mode 100644 skills/agent-device/references/verification.md delete mode 100644 skills/dogfood/references/issue-taxonomy.md delete mode 100644 
skills/dogfood/templates/dogfood-report-template.md delete mode 100644 skills/react-devtools/references/commands.md delete mode 100644 skills/react-devtools/references/profiling.md diff --git a/package.json b/package.json index cbb557cb2..50726ad21 100644 --- a/package.json +++ b/package.json @@ -100,7 +100,7 @@ "test-app:typecheck": "pnpm --dir examples/test-app typecheck", "test": "vitest run", "test:unit": "vitest run", - "test:skillgym": "skillgym run ./test/skillgym/suites/agent-device-smoke-suite.ts --config ./test/skillgym/skillgym.config.ts", + "test:skillgym": "pnpm build && skillgym run ./test/skillgym/suites/agent-device-smoke-suite.ts --config ./test/skillgym/skillgym.config.ts", "test:smoke": "node --test test/integration/smoke-*.test.ts", "test:integration": "node --test test/integration/*.test.ts", "test:replay:ios": "node --experimental-strip-types src/bin.ts test test/integration/replays/ios/simulator", diff --git a/skills/agent-device/SKILL.md b/skills/agent-device/SKILL.md index bb228c277..dd5ea82f3 100644 --- a/skills/agent-device/SKILL.md +++ b/skills/agent-device/SKILL.md @@ -1,76 +1,26 @@ --- name: agent-device -description: Automates interactions for Apple-platform apps (iOS, tvOS, macOS) and Android devices. Use when navigating apps, taking snapshots/screenshots, tapping, typing, scrolling, extracting UI info, or collecting logs, network inspection, and perf snapshots across mobile, TV, and desktop targets. +description: Automates Apple-platform apps (iOS, tvOS, macOS) and Android devices. Use when navigating apps, taking snapshots/screenshots, tapping, typing, scrolling, extracting UI info, collecting logs/network/perf evidence, or planning agent-device CLI commands. --- # agent-device -Use this skill as a router with mandatory defaults. Read this file first. For normal device tasks, always load `references/bootstrap-install.md` and `references/exploration.md` before acting. Use bootstrap to confirm or establish deterministic setup. 
Use exploration for UI inspection, interaction, and verification once the app session is open. +Router only. Before your first agent-device command or plan, read the version-matched CLI guide: -## Default operating rules +```bash +agent-device help workflow +``` -- Start conservative. Prefer read-only inspection before mutating the UI. -- Start deterministic. If the app name, package, device, or session is uncertain, load bootstrap and discover them before interacting. -- Use plain `snapshot` when the task is to verify what text or structure is currently visible on screen. -- Use `snapshot -i` only when you need interactive refs such as `@e3` for a requested action or targeted query. On iOS and Android, default snapshot output uses the same visible-first model: off-screen interactive content is exposed as discovery hints, not tappable refs. -- Prefer `diff snapshot` after a nearby mutation when you only need to know what changed. -- Avoid speculative mutations. You may take the smallest reversible UI action needed to unblock inspection or complete the requested task, such as dismissing a popup, closing an alert, or clearing an unintended surface. -- In React Native dev or debug builds, check early for visible warning or error overlays, tooltips, and toasts that can steal focus or intercept taps. If they are not part of the requested behavior, dismiss them and continue. If you saw them, report them in the final summary. -- In Metro-backed React Native dev loops, use `agent-device metro reload` for a JS app reload before falling back to `open --relaunch`. It mirrors pressing `r` in the Metro terminal and preserves the native app process. -- Do not browse the web or use external sources unless the user explicitly asks. -- Re-snapshot after meaningful UI changes instead of reusing stale refs. -- Treat refs in default snapshot output as actionable-now, not durable identities. 
If a target appears only in an off-screen summary, use `scroll ` and re-snapshot until the target is visible. -- Prefer `@ref` or selector targeting over raw coordinates. -- Ensure the correct target is pinned and an app session is open before interacting. -- Keep the loop short: `open` -> inspect/act -> verify if needed -> `close`. +Escalate only when relevant: -## Default flow +```bash +agent-device help debugging +agent-device help react-devtools +agent-device help remote +agent-device help macos +agent-device help dogfood +``` -1. Load [references/bootstrap-install.md](references/bootstrap-install.md) and [references/exploration.md](references/exploration.md) before acting on a normal device task. -2. Use bootstrap first to confirm or establish the correct target, app install, and open app session. -3. Once the app session is open and stable, use exploration for inspection, interaction, and verification. -4. Start with plain `snapshot` if the goal is to read or verify what is visible. -5. Escalate to `snapshot -i` only if you need refs for interactive exploration or a requested action. -6. Use `get`, `is`, or `find` before mutating the UI when a read-only command can answer the question. -7. End by capturing proof if needed, then `close`. +Default loop: `open -> snapshot/-i -> get/is/find or press/fill/scroll/wait -> verify -> close`. -## QA modes - -- Open-ended bug hunt with reporting: use [../dogfood/SKILL.md](../dogfood/SKILL.md). -- Pass/fail QA from acceptance criteria: stay in this skill, start with [references/bootstrap-install.md](references/bootstrap-install.md), then use the QA loop in [references/exploration.md](references/exploration.md). - -## Required references - -- For every normal device task, after reading this file, load [references/bootstrap-install.md](references/bootstrap-install.md) first, then [references/exploration.md](references/exploration.md), before acting. 
-- Use bootstrap to confirm or establish deterministic setup, especially in sandbox or cloud environments. -- Use exploration once the app session is open and stable. -- Load additional references only when their scope is needed. - -## Decision rules - -- Use plain `snapshot` when you need to verify whether text is visible. -- Use `snapshot -i` mainly for interactive exploration and choosing refs. -- Use `diff snapshot` for compact post-action verification; use `snapshot --diff` when that alias is easier to discover from snapshot help. -- Use `get`, `is`, or `find` when they can answer the question without changing UI state. -- Use `fill` to replace text. -- Use `type` to append text. -- Do not write `type @eN "text"`. Use `fill @eN "text"` to target a field directly, or `press @eN` then `type "text"` when the field already has focus and you want append semantics. -- If the on-screen keyboard blocks the next step, prefer `keyboard dismiss` over navigation. On iOS, keep an app session open first; `keyboard status|get` remains Android-only. -- When a task asks to "go back", use plain `back` for predictable app-owned navigation and reserve `back --system` for platform back gestures or button semantics. -- Use `type --delay-ms` or `fill --delay-ms` for debounced search fields that drop characters when typed too quickly. -- If there is no simulator, no app install, or no open app session yet, switch to `bootstrap-install.md` instead of improvising setup steps. -- Use the smallest unblock action first when transient UI blocks inspection, but do not navigate, search, or enter new text just to make the UI reveal data unless the user asked for that interaction. -- In React Native dev or debug apps, treat visible warning or error overlays as transient blockers unless the user is explicitly asking you to diagnose them. Dismiss them when safe, then continue the requested flow. 
-- For React Native code changes where the app is already connected to Metro, prefer `agent-device metro reload`, then wait and re-snapshot. Use `open --relaunch` only when Metro reload does not reconnect or native startup state must reset. -- Do not use external lookups to compensate for missing on-screen data unless the user asked for them. -- If the needed information is not exposed on screen, say that plainly instead of compensating with extra navigation, text entry, or web search. -- Prefer `@ref` or selector targeting over raw coordinates. - -## Additional references - -- Need logs, network, alerts, permissions, or failure triage: [references/debugging.md](references/debugging.md) -- Need screenshots, diff, recording, replay maintenance, or perf data: [references/verification.md](references/verification.md) -- Need desktop surfaces, menu bar behavior, or macOS-specific interaction rules: [references/macos-desktop.md](references/macos-desktop.md) -- Need remote HTTP transport, `connect --remote-config`, or tenant leases on a remote macOS host: [references/remote-tenancy.md](references/remote-tenancy.md) - This includes remote React Native runs where `agent-device` now prepares Metro locally and manages the local Metro companion tunnel automatically. -- Need the React Native component tree, props, state, hooks, or render profiling: use `agent-device react-devtools ...` and the [react-devtools skill](../react-devtools/SKILL.md). +Keep refs current, prefer selectors/refs over coordinates, use `fill` to replace text, and use `back` for app-owned navigation. Let `help workflow` provide the exact command shapes. 
diff --git a/skills/agent-device/references/bootstrap-install.md b/skills/agent-device/references/bootstrap-install.md deleted file mode 100644 index a6b5a0f09..000000000 --- a/skills/agent-device/references/bootstrap-install.md +++ /dev/null @@ -1,244 +0,0 @@ -# Bootstrap and Install - -## When to open this file - -Open this file when you still need to choose the right target, start the right session, install or relaunch the app, or pin automation to one device before interacting. This is the deterministic setup layer for sandbox, cloud, or other environments where install paths, device state, or app readiness may be uncertain. - -## Open-first path - -- `devices` -- `apps` -- `ensure-simulator` -- `open` -- `session list` - -Use this exact order when you are not sure about the installed app identifier. On Android dev builds in particular, `apps` is cheaper than guessing package suffixes and retrying failed `open` calls. - -## Install path - -- `install` or `reinstall` -- `install-from-source` when the artifact already exists at a URL the daemon can reach -- `install-from-source --github-actions-artifact` when a compatible remote daemon should resolve a GitHub Actions artifact - -## Most common mistake to avoid - -Do not start acting before you have pinned the correct target and opened an `app` session. In mixed-device environments, always pass `--device`, `--udid`, or `--serial`. - -## Deterministic setup rule - -If there is no simulator, no app install, no open app session, or any uncertainty about where the app should come from, stay in this file and use deterministic setup commands or bootstrap scripts first. Do not improvise install paths or app-launch flows while exploring. - -After setup is confirmed or completed, move to `exploration.md` before doing UI inspection or interaction. - -## Open-first rule - -- If the user asks to test an app and does not provide an install artifact or explicit install instruction, try `open ` first. 
-- If `open ` fails, run `agent-device apps` and retry with a discovered app name before considering install steps. -- Do not install or reinstall on the first attempt unless the user explicitly asks for installation or provides a concrete artifact path or URL. -- When installation is required from a known location, prefer a checked-in shell script or other deterministic bootstrap command over ad hoc path guessing. -- Use `apps --platform ` together with `--device`, `--udid`, or `--serial` when target selection matters. -- Once you have the correct app name, retry `open` with that exact discovered value. - -## Common starting points - -These are examples, not required exact sequences. Use the smallest setup flow that matches the task. - -### Boot a simulator and open an app - -```bash -agent-device ensure-simulator --platform ios --device "iPhone 17 Pro" --boot -agent-device open MyApp --platform ios --device "iPhone 17 Pro" --relaunch -``` - -### Install an app artifact - -```bash -agent-device install com.example.app ./build/app.apk --platform android --serial emulator-5554 -``` - -```bash -agent-device install com.example.app ./build/MyApp.app --platform ios --device "iPhone 17 Pro" -``` - -```bash -ARTIFACT_URL="" -agent-device install-from-source "$ARTIFACT_URL" --platform android -``` - -Daemon-resolved GitHub Actions artifacts: - -```bash -agent-device install-from-source \ - --github-actions-artifact ORG/REPO:1234567890 \ - --platform android -``` - -Project config can provide an artifact name instead: - -```json -{ - "platform": "android", - "installSource": { - "type": "github-actions-artifact", - "repo": "ORG/REPO", - "artifact": "app-debug" - } -} -``` - -## Install guidance - -- Use `install ` when the app may already be installed and you do not need a fresh-state reset. -- Use `reinstall ` when you explicitly need uninstall plus install as one deterministic step. 
-- Use `install-from-source <url>` only when an existing artifact URL is trusted, operator-approved, and reachable by the daemon. -- Use `--github-actions-artifact <org>/<repo>:<artifact>` when a compatible remote daemon should resolve a GitHub Actions artifact. Numeric artifacts are IDs; non-numeric artifacts are names. -- Local `.apk`, `.aab`, `.app`, and `.ipa` paths go through `install` or `reinstall`; existing reachable URLs go through `install-from-source`. -- Do not download, re-zip, publish temporary GitHub releases, or move CI artifacts elsewhere just to make an install command work. -- Keep install and open as separate phases. Do not turn them into one default command flow. -- Supported binary formats: - - Android: `.apk` and `.aab` - - iOS: `.app` and `.ipa` -- Android URL sources can be direct `.apk` or `.aab` files. -- Trusted artifact service URLs may point at archive-backed downloads that contain one installable artifact. Prefer `--github-actions-artifact` for GitHub Actions artifacts that a compatible remote daemon can resolve with its own credentials. -- If a trusted artifact archive contains multiple installables, stop and ask for the intended artifact instead of guessing. -- `.aab` still requires `bundletool` in `PATH`, or `AGENT_DEVICE_BUNDLETOOL_JAR=<path>` with `java` in `PATH`, when the daemon installs the materialized artifact. -- For `.ipa` archives with multiple app bundles, `<app>` is the bundle id or bundle name selection hint. -- After install or reinstall, later use `open <app>` with the exact discovered or known package/bundle identifier, not the artifact path. - -## Choose the right starting point - -- iOS local QA: prefer simulators unless the task explicitly requires physical hardware. -- iOS in mixed simulator and device environments: run `ensure-simulator` first, then keep using `--device` or `--udid`. -- TV targets: use `--target tv` together with `--platform` when the task is for tvOS or Android TV rather than phone or tablet surfaces. 
-- Android binary flow: use `install` or `reinstall` for `.apk` or `.aab`, then open by installed package name. -- macOS desktop app flow: use `open --platform macos`. Only load [macos-desktop.md](macos-desktop.md) if a desktop surface or macOS-specific behavior matters. - -TV example: - -```bash -agent-device open MyTvApp --platform ios --target tv -agent-device open com.example.androidtv --platform android --target tv -``` - -## Session rules - -- Use `--session ` when you need a named session: - -```bash -agent-device --session auth open Settings --platform ios -agent-device --session auth snapshot -i -``` - -- Use `open ` before interactions. -- Use `close` when done. Add `--shutdown` when you want simulators or emulators torn down with the session. -- Use semantic session names when you need multiple concurrent runs. -- Use `--save-script=` on `close` when you want to keep a replay script. -- For dev loops where state can linger, prefer `open --relaunch`. -- For Metro-backed React Native JS changes with the app already running, prefer `metro reload` instead of `open --relaunch`; it asks Metro to reload connected apps without restarting the native process. -- In iOS sessions, use `open ` for the app itself. Use `open ` for deep links, and `open ` when you need to launch the app and deep link in one step. -- On iOS, `appstate` is session-scoped and requires the matching active session on the target device. - -## After a session is established - -Once you have opened the correct session on the correct target, default to the conservative rule: keep the session binding on follow-up commands, and stop repeating device-routing flags unless you are intentionally retargeting. - -- Prefer `--session ` on follow-up commands, or use sandboxed `AGENT_DEVICE_SESSION`. -- Do not keep repeating `--platform`, `--target`, `--device`, `--udid`, `--serial`, or similar target-selection flags on normal follow-up commands. 
-- Only omit follow-up session flags when the environment explicitly guarantees isolation. - -Good shared-host pattern: - -```bash -agent-device --session auth open Settings --platform ios --device "iPhone 17 Pro" -agent-device --session auth snapshot -i -agent-device --session auth press @e3 -agent-device --session auth close -``` - -Bad shared-host pattern: - -```bash -agent-device --session auth open Settings --platform ios --device "iPhone 17 Pro" -agent-device --session auth snapshot -i --platform ios --device "iPhone 17 Pro" -``` - -Use target-selection flags again only when you are choosing the target before opening a session, or when you explicitly mean to retarget. - -## Session-bound automation - -Use this when an orchestrator must keep plain CLI calls on one session and device. - -```bash -export AGENT_DEVICE_SESSION=qa-ios -export AGENT_DEVICE_PLATFORM=ios -export AGENT_DEVICE_SESSION_LOCK=strip - -agent-device open MyApp --relaunch -``` - -- `AGENT_DEVICE_SESSION` plus `AGENT_DEVICE_PLATFORM` provides the default binding. -- `--session-lock reject|strip` controls whether conflicting per-call routing flags fail or are ignored. -- Conflicts include explicit retargeting flags such as `--platform`, `--target`, `--device`, `--udid`, `--serial`, `--ios-simulator-device-set`, and `--android-device-allowlist`. -- Lock policy applies to nested `batch` steps too. -- Compatibility aliases remain supported: `--session-locked`, `--session-lock-conflicts`, `AGENT_DEVICE_SESSION_LOCKED`, and `AGENT_DEVICE_SESSION_LOCK_CONFLICTS`. - -Android emulator variant: - -```bash -export AGENT_DEVICE_SESSION=qa-android -export AGENT_DEVICE_PLATFORM=android - -agent-device --session-lock reject open com.example.myapp --relaunch -``` - -## Scoped discovery - -Use scoped discovery when one run must not see host-global device lists. 
- -```bash -agent-device devices --platform ios --ios-simulator-device-set /tmp/tenant-a/simulators -agent-device devices --platform android --android-device-allowlist emulator-5554,device-1234 -``` - -- Scope is applied before `--device`, `--udid`, and `--serial`. -- Out-of-scope selectors fail with `DEVICE_NOT_FOUND`. -- With iOS simulator-set scope enabled, iOS physical devices are not enumerated. -- If the scoped iOS simulator set is empty, the error should point at the set path and suggest creating a simulator in that set. -- Environment equivalents: - - `AGENT_DEVICE_IOS_SIMULATOR_DEVICE_SET` - - `AGENT_DEVICE_ANDROID_DEVICE_ALLOWLIST` - -## Session inspection and replay - -```bash -agent-device session list -agent-device replay ./session.ad --session auth -agent-device replay -u ./session.ad --session auth -``` - -- iOS session entries include `device_udid` and `ios_simulator_device_set`. Use them to confirm routing in concurrent runs. -- Prefer selector-based actions and assertions in saved replay scripts. -- Tenant isolation namespaces sessions as `<tenant>:<session>` during tenant-scoped runs. - -## When to leave this file - -- Once the correct target and session are pinned, move to [exploration.md](exploration.md). -- If opening, startup, permissions, or logs become the blocker, switch to [debugging.md](debugging.md). - -## Install examples - -```bash -agent-device reinstall MyApp /path/to/app-debug.apk --platform android --serial emulator-5554 -``` - -```bash -agent-device install com.example.app ./build/MyApp.ipa --platform ios --device "iPhone 17 Pro" -``` - -Do not use `open <apk|aab> --relaunch` on Android. - -## Security and trust notes - -- Treat signing, provisioning, and daemon auth values as host secrets. Do not paste them into shared logs or commit them to source control. -- Prefer Xcode Automatic Signing over manual overrides when a physical iOS device is involved. -- Keep persistent host-specific defaults in environment variables rather than checked-in project config. 
diff --git a/skills/agent-device/references/coordinate-system.md b/skills/agent-device/references/coordinate-system.md deleted file mode 100644 index 03b8f2ef0..000000000 --- a/skills/agent-device/references/coordinate-system.md +++ /dev/null @@ -1,28 +0,0 @@ -# Coordinate System - -## When to open this file - -Open this file only when you must use raw coordinates instead of selectors or `@ref` targeting. - -## Main commands to reach for first - -- `screenshot` -- coordinate-based `click` or `swipe` - -## Most common mistake to avoid - -Do not assume coordinates mean the same thing across platforms or runs. Prefer selectors and refs first. - -## Canonical loop - -```bash -agent-device screenshot /tmp/current-screen.png -agent-device click 120 240 -``` - -## Rules - -- Origin is the top-left of the device screen. -- iOS uses device points. -- Android uses pixels. -- Use screenshots to reason about coordinates before acting. diff --git a/skills/agent-device/references/debugging.md b/skills/agent-device/references/debugging.md deleted file mode 100644 index adf1ba898..000000000 --- a/skills/agent-device/references/debugging.md +++ /dev/null @@ -1,138 +0,0 @@ -# Debugging - -## When to open this file - -Open this file when the task turns into failure triage, logs, network inspection, permission prompts, setup trouble, or unstable session behavior. - -If the debugging task needs the React Native component tree, props, state, hooks, or render profiling, use `agent-device react-devtools ...` and the `skills/react-devtools` workflow instead of trying to infer those internals from the accessibility tree or app logs alone. - -## Main commands to reach for first - -- `logs clear --restart` -- `network dump` -- `logs path` -- `logs doctor` -- `alert wait` -- `alert accept` or `alert dismiss` - -## Most common mistake to avoid - -Do not leave logging on for normal flows or dump full log files into context. Keep debug windows short and inspect logs with `grep` or `tail`. 
- -In React Native dev or debug builds, do not dismiss visible warning or error overlays without remembering to report them later. If you close one to keep the flow moving, keep at least a screenshot or a short marked log window so the summary can name it. - -## Canonical loop - -```bash -agent-device open MyApp --platform ios -agent-device logs clear --restart -agent-device network dump 25 -agent-device logs path -agent-device close -``` - -## Log and network flow - -Logging is off by default. Enable it only when you need a debugging window. - -- Default app logs live under `~/.agent-device/sessions/<session>/app.log`. -- `logs clear --restart` is the fastest clean repro loop. -- `network dump [limit] [summary|headers|body|all]` parses recent HTTP(s) entries from the same session app log. -- On macOS, `network dump` is app-scoped and only sees Unified Logging associated with the active session app. -- On iOS simulators, `network dump` can recover recent app log history with `simctl log show` when the live session stream is sparse, so check the returned notes before assuming the repro window was empty. -- On iOS, `network dump` is still limited to what Unified Logging exposes for the app process. If the app does not emit request metadata there, `network dump` can legitimately return no HTTP entries even during a real repro. -- Summary output already shows timestamp, status, and duration when the log backend exposes them. -- Prefer the explicit flag form `network dump 25 --include headers|body|all` when you need more than the default summary view. -- If iOS simulator notes say app logs were recovered but none looked like HTTP traffic, treat that as an app instrumentation gap rather than a missing repro and inspect `logs path` for the non-network diagnostics that were captured. -- `logs doctor` checks backend and runtime readiness for the current session and device. -- `logs mark "before tap"` inserts a timestamped marker into the app log. 
-- Android `network dump` surfaces timestamps from logcat-style prefixes and can backfill status and request/response duration from adjacent GIBSDK packet lines, so check it before dumping raw log windows. -- Android app-log streaming rebinds to the current app PID after relaunches, so rerun the repro window before assuming the last log slice is stale. -- Marker lines are emitted with the `[agent-device][mark][...]` prefix. When you grep later, prefer a narrow pattern such as `grep -n -E "agent-device.*mark|before tap" <path>`. -- Session app logs can contain runtime data, headers, or payload fragments. Review them before sharing. -- `logs start` requires an active app session and appends to `app.log`. -- `logs stop` stops streaming. `close` also stops logging. -- `logs clear` truncates `app.log` and removes rotated `app.log.N` files, and requires logging to be stopped first. -- `logs path` returns the log path plus metadata about the active backend and file state. -- `network log` is an alias for `network dump`. - -Operational limits: - -- `app.log` rotates to `app.log.1` after 5 MB by default. -- `network dump` scans the last 4000 app-log lines, returns up to 200 entries, and truncates header or payload fields at 2048 characters. -- Retention knobs: - - `AGENT_DEVICE_APP_LOG_MAX_BYTES` - - `AGENT_DEVICE_APP_LOG_MAX_FILES` -- Redaction hook: - - `AGENT_DEVICE_APP_LOG_REDACT_PATTERNS` - -Useful shell follow-up after `logs path`: - -```bash -grep -n -E "Error|Exception|Fatal|crash" <path> -grep -n -E "agent-device.*mark|before tap" <path> -tail -50 <path> -``` - -If the app showed a visible warning or error overlay during the flow: - -- Prefer a narrow grep window around your `logs mark` lines instead of loading the whole file. -- Mention the surfaced warning or error in the final summary even if it did not block completion. -- If the overlay kept returning, call that out as a stability issue instead of treating it as operator noise. 
- -## Alerts and permissions - -Use `alert` for iOS simulator permission dialogs and macOS desktop alerts instead of tapping coordinates. - -```bash -agent-device alert wait 5000 -agent-device alert accept -``` - -- `alert` is supported on iOS simulators and macOS desktop targets. -- `alert accept` and `alert dismiss` retry internally for a short window, so you usually do not need manual sleeps. -- If a permission sheet or modal is visible in `snapshot` or `screenshot` but `alert accept` says no alert was found, treat it as normal tappable UI for that run: take a scoped `snapshot -i -s "<visible label>"` and `press @ref` instead of looping on `alert`. -- iOS 16+ "Allow Paste" prompts are suppressed under XCUITest. Use `xcrun simctl pbcopy booted` when you need to seed simulator clipboard content directly. - -## Setup problems worth recognizing early - -- iOS snapshots do not require macOS Accessibility permissions. -- iOS physical-device XCTest setup does require valid signing and provisioning. -- If physical-device runner setup fails, prefer Xcode Automatic Signing first. -- Optional overrides are: - - `AGENT_DEVICE_IOS_TEAM_ID` - - `AGENT_DEVICE_IOS_SIGNING_IDENTITY` - - `AGENT_DEVICE_IOS_PROVISIONING_PROFILE` - - `AGENT_DEVICE_IOS_BUNDLE_ID` -- If daemon startup is timing out during setup, increase `AGENT_DEVICE_DAEMON_TIMEOUT_MS`. -- If daemon startup fails with stale metadata hints, clean `~/.agent-device/daemon.json` and `~/.agent-device/daemon.lock`, then retry. -- Free Apple Developer personal-team accounts may reject generic bundle IDs. Use a unique reverse-DNS value for `AGENT_DEVICE_IOS_BUNDLE_ID` when that happens. - -## Common failure patterns - -- `snapshot` returns 0 nodes: the app may no longer be foregrounded or the UI is not stable yet. Re-open the app or retry when state settles. -- Logs are empty: confirm you opened an app session before `logs clear --restart`. -- Android logs look stale after relaunch: retry the repro window after the process rebinds. 
-- Android accessibility snapshots can lag behind visible screen transitions. The next snapshot retries suspicious trees for a short post-action deadline after navigation-sensitive actions, and `@ref` actions refresh while that window is active. If the tree still looks stale, use `screenshot` as visual truth, wait briefly, then re-run `snapshot -i`. For animation-heavy runs, try `settings animations off` and restore with `settings animations on`. -- React Native dev warnings or errors keep reappearing: treat them as part of the app state, not as disposable chrome. Capture one clean repro and include them in the summary. -- Permission prompts block the flow: wait for the alert and handle it explicitly. -- If snapshots keep returning 0 nodes on an iOS simulator, restart Simulator and re-open the app. -- If a macOS snapshot looks incomplete, compare with `snapshot --raw --platform macos` to separate collector filtering from missing AX content. - -## Crash triage fast path - -Always start from the session app log, then branch by platform. - -```bash -agent-device logs path -grep -n -E "SIGABRT|SIGSEGV|EXC_|fatal|exception|terminated|killed|jetsam|memorystatus|FATAL EXCEPTION|Abort message" -``` - -- iOS: if the log suggests `ReportCrash`, `SIGABRT`, or `EXC_*`, inspect `~/Library/Logs/DiagnosticReports`. -- Android: if the app log is not enough, use `adb logcat` for `FATAL EXCEPTION`, `Abort message`, or `signal` lines around process death. -- If no crash signature appears in app logs, stop collecting broad logs and switch to the platform-native crash source. - -## When to leave this file - -- Return to [exploration.md](exploration.md) once the app is stable again. -- Load [verification.md](verification.md) if you need evidence artifacts after reproducing the issue. 
diff --git a/skills/agent-device/references/exploration.md b/skills/agent-device/references/exploration.md deleted file mode 100644 index bce87335a..000000000 --- a/skills/agent-device/references/exploration.md +++ /dev/null @@ -1,362 +0,0 @@ -# Exploration - -## When to open this file - -Open this file when the app or screen is already running and you need to discover the UI, choose targets, read state, wait for conditions, or perform normal interactions. - -## Read-only first - -- If the question is what text, labels, or structure is visible on screen, start with plain `snapshot`. -- Escalate to `snapshot -i` only when you need refs such as `@e3` for interactive exploration or a requested action. -- If you intend to `press`, `fill`, or otherwise interact, start with `snapshot -i` and fall back to plain `snapshot` only if interactive refs are unavailable. -- Prefer `get`, `is`, or `find` before mutating the UI when a read-only command can answer the question. -- You may take the smallest reversible UI action needed to unblock inspection, such as dismissing a popup, closing an alert, or backing out of an unintended surface. -- Do not type or fill text just to make hidden information easier to access unless the user asked for that interaction. -- Do not use external sources to infer missing UI state unless the user explicitly asked. -- If the answer is not visible or exposed in the UI, report that gap instead of compensating with search, navigation, or text entry. 
- -## Decision shortcut - -- User asks what is visible on screen: `snapshot` -- User asks for exact text from a known target: `get text` -- User asks you to tap, type, or choose an element: `snapshot -i`, then act -- User asks for the React Native component tree, props/state/hooks, or render profiling: use `agent-device react-devtools ...` and the `skills/react-devtools` workflow -- User asks to reload a Metro-backed React Native app after JS changes: `agent-device metro reload`, then wait briefly and re-run `snapshot` or `snapshot -i` -- React Native dev or debug build shows warning/error UI: capture enough evidence to identify it, dismiss it if it is not the requested behavior, then continue the flow and report it in the summary -- The on-screen keyboard is blocking the next step: `keyboard dismiss`; on iOS do this only while an app session is active, and use `keyboard status|get` only on Android -- UI does not expose the answer: say so plainly; do not browse or force the app into a new state unless asked - -## Read-only commands - -- `snapshot` -- `get` -- `is` -- `find` -- `keyboard status|get` on Android when keyboard visibility or input type matters - -## Interaction commands - -- `snapshot -i` -- `press` -- `fill` -- `type` -- `scroll` -- `wait` -- `keyboard dismiss` when the keyboard obscures the next target - -## Common mistakes to avoid - -**Stale refs.** Do not treat `@ref` values as durable after navigation or dynamic updates. Re-snapshot after the UI changes, and switch to selectors when the flow must stay stable. - -**Android AX tree lag.** After submits, route changes, or composer transitions, the accessibility tree can lag behind the visible UI. If `snapshot -i` and `screenshot` disagree: - -1. Trust the screenshot as visual truth. -2. Take one fresh `snapshot -i`. Android retries suspicious trees for a short post-action deadline after navigation-sensitive actions. -3. 
If the tree still disagrees with the screenshot, wait briefly, then take one more fresh snapshot. Do not loop snapshots immediately. -4. For animation-heavy Android runs, use `settings animations off` as an opt-in stabilizer and restore with `settings animations on` after the run. - -**React Native dev overlays.** In dev or debug builds, warning or error overlays can block taps, change focus, or hide the real UI. Check for them near app open and after major transitions. - -- Not blocking the task: dismiss and continue. -- Blocking or recurring: switch to [debugging.md](debugging.md) and collect evidence. -- Seen at any point: mention in the final summary even if dismissed. - -**React Native Metro reload.** When a dev app is already running and connected to Metro, prefer a Metro reload over restarting the native app process: - -```bash -agent-device metro reload -agent-device wait 1000 -agent-device snapshot -i -``` - -Use `--metro-host`, `--metro-port`, or `--bundle-url` only when the active connection does not already carry the right runtime hints. Fall back to `open --relaunch` when the app is not connected to Metro, Metro reload fails, or native startup state needs a clean process. - -## Common example loops - -These are examples, not required exact sequences. Adapt them to the app, state, and task at hand. - -### Interactive exploration loop - -```bash -agent-device open Settings --platform ios -agent-device snapshot -i -agent-device press @e3 -agent-device wait visible 'label="Privacy & Security"' 3000 -agent-device get text 'label="Privacy & Security"' -agent-device close -``` - -### Screen verification loop - -```bash -agent-device open MyApp --platform ios -# perform the necessary actions to reach the state you need to verify -agent-device snapshot -# verify whether the expected element or text is present -agent-device close -``` - -## Snapshot choices - -- Use plain `snapshot` when you only need to verify whether visible text or structure is on screen. 
-- Use `snapshot -i` when you need refs such as `@e3` for interactive exploration or for an intended interaction. -- On iOS and Android, default snapshot output is visible-first. Off-screen interactive content is surfaced as discovery hints (including inline scroll/list hidden-content hints when known), not shown as directly tappable refs. -- Treat large text-surface lines in `snapshot -i` as discovery output. If a node shows preview or truncation metadata, use `get text @ref` only after you have already decided that `snapshot -i` is needed for that surface. -- Use `snapshot -i -s "Camera"` or `snapshot -i -s @e3` when you want a smaller, scoped result. -- If `snapshot -i -s "<label>"` returns 0 nodes, the scope did not match the current screen. Widen the query or re-check the screen state instead of assuming the command silently fell back to the full tree. -- If `snapshot -i` returns 0 nodes but the screen is visibly populated, treat `screenshot` as visual truth, wait briefly, then re-run `snapshot -i` once before escalating. -- If `snapshot -i -d <depth>` says the interactive output is empty at that depth, retry without `-d` instead of taking more shallow snapshots. - -Example: - -```bash -agent-device snapshot -i -``` - -Sample output: - -```text -Page: com.apple.Preferences -App: com.apple.Preferences - -@e1 [ioscontentgroup] - @e2 [button] "Camera" - @e3 [button] "Privacy & Security" -[off-screen below] 2 interactive items: "Location Services", "Battery" -``` - -## Refs vs selectors - -- Use refs for discovery, debugging, and short local loops. -- When a target appears only in a visible-first off-screen summary, such as `[off-screen below] ... "Battery"`, use `scroll down` and then `snapshot -i`. For `[off-screen above]`, use `scroll up` and then `snapshot -i`. -- For more than two repeated scroll checks, create a short shell loop instead of issuing each command by hand. Stop when the label appears or the snapshot stops changing. 
-- Visible-first off-screen summaries are intentionally compact. If you need the full off-screen tree instead of a short summary, retry with `snapshot --raw`. -- Cap long searches in the loop when the list may be unbounded or the target may not exist. -- Use selectors for deterministic scripts, assertions, and replay-friendly actions. -- Prefer selector or `@ref` targeting over raw coordinates. -- For tap interactions, `press` is canonical and `click` is an equivalent alias. - -Examples: - -```bash -agent-device press @e2 -agent-device fill @e5 "test" -agent-device press 'id="camera_row" || label="Camera" role=button' -agent-device is visible 'id="camera_settings_anchor"' -``` - -Example loop: - -```bash -previous='' -for _ in 1 2 3 4 5 6; do - current="$(agent-device snapshot -i)" - printf '%s\n' "$current" - printf '%s\n' "$current" | grep -q 'Battery' && break - [ "$current" = "$previous" ] && break - previous="$current" - agent-device scroll down 0.5 >/dev/null -done -``` - -## Interaction fallbacks - -When `press @ref` fails: - -1. If the error says the ref is off-screen, use the off-screen summary direction to run `scroll <direction>`, then take a fresh `snapshot -i`. -2. Re-snapshot if the UI may have changed. -3. Retry `press @ref` or a selector-based `press`. -4. If `screenshot --overlay-refs --json` returned a reliable `overlayRefs[].center`, use `agent-device press <x> <y>`. -5. Use an external vision-based tap tool only after semantic and coordinate targeting fail. - -- Prefer `@ref` over coordinates. -- Do not guess coordinates from the image when structured `center` is available. -- `agent-device` does not provide a built-in vision-tap flag. - -## Text entry rules - -- Use `fill` to replace text in an editable field. -- Use `type` to append text to the current insertion point. -- Use `fill @ref "text"` when you need to target a field directly by ref. -- Use `press @ref`, then `type "text"` when the field is already focused and you need append semantics. 
-- Do not write `type @ref "text"`; `type` only accepts text and will not target that ref for you. -- If the keyboard blocks the next control after text entry, prefer `keyboard dismiss` instead of backing out of the screen. -- On iOS, `keyboard dismiss` depends on the active app session to keep the target app foregrounded, so do not rely on selector-only dismiss calls after closing or without `open`. -- Do not use `fill` or `type` just to make the app reveal information that is not currently visible unless the user asked for that interaction. - -## React Native dev or debug overlays - -Use this loop for React Native dev clients, Metro-backed builds, and local debug sessions where warnings or errors may appear as tooltips, banners, toasts, or modal overlays. - -1. After `open`, inspect the visible UI for warning or error surfaces before relying on the next tap. -2. If a warning or error is visible, capture enough evidence to identify it: - - preferred: `screenshot` - - optional: `logs mark "warning visible"` or `logs mark "error visible"` if you are already in a debug window -3. If the overlay is not the thing the user asked you to investigate, dismiss or close it with the smallest reversible action. -4. Re-check the intended screen before continuing the task. -5. Report any visible warnings or errors in the final summary, even if the flow succeeded after dismissal. - -Use this rule of thumb: - -- Warning overlay that does not block the task: dismiss and keep going. -- Error overlay that does not block the task: dismiss, keep going, and report it. -- Error overlay that blocks the task or keeps returning: stop treating it as noise and switch to [debugging.md](debugging.md). - -## Query and sync rules - -- Use `get` to read text, attrs, or state from a known target. -- Use `is` for assertions. -- Use `wait` when the UI needs time to settle after a mutation. -- Use `find "<query>" click --json` when you need search-driven targeting plus matched-target metadata. 
-- Use `find "<query>" click --first` or `--last` when ambiguous matches are expected and you want the first or last occurrence without falling back to raw coordinates. -- If you are forced onto raw coordinates, open [coordinate-system.md](coordinate-system.md) first. - -Example: - -```bash -agent-device find "Increment" click --json -``` - -Returned metadata comes from the matched snapshot node and can be used for observability or replay maintenance. - -## QA from acceptance criteria - -Use this loop when the task starts from acceptance criteria and you need to turn them into concrete checks. - -Preferred mapping: - -- visibility claim for what is on-screen now: `is visible` or plain `snapshot` -- presence claim regardless of viewport visibility: `is exists` -- exact text, label, or value claim: `get text` -- post-action state change: act, then `wait`, then `is` or `get` -- nearby structural UI change: `diff snapshot` -- proof artifact for the final result: `screenshot` or `record` - -Notes: - -- `wait text` is useful for synchronizing on text presence, but it is not the same as `is visible`. -- After a nearby navigation or submit on Android, prefer `screenshot`, then one fresh `snapshot -i`; `@ref` interactions refresh while the Android freshness window is active. - -Anti-hallucination rules: - -- Do not invent app names, device ids, session names, refs, selectors, or package names. -- Discover them first with `devices`, `open`, `snapshot -i`, `find`, or `session list`. -- If refs drift after navigation, re-snapshot or switch to selectors instead of guessing. - -Avoid this escalation path for visible-text questions: - -- Do not jump from `snapshot -i` to `get text @ref`, then to web search, then to typing into a search box just to force the app to reveal the answer. -- Start with `snapshot`. If the text is not visible or exposed, report that directly. -- After Android submit or navigation-heavy actions when the UI looks wrong: `screenshot` first, then `snapshot -i`. 
- -Canonical QA loop: - -```bash -agent-device open MyApp --platform ios -agent-device snapshot -i -agent-device press @e3 -agent-device wait visible 'label="Success"' 3000 -agent-device is visible 'label="Success"' -agent-device screenshot /tmp/qa-proof.png -agent-device close -``` - -## Accessibility audit - -Use this pattern when you need to find UI that is visible to a user but missing from the accessibility tree. - -Audit loop: - -1. Capture a `screenshot` to see what is visually rendered. -2. Capture a `snapshot` or `snapshot -i` to see what the accessibility tree exposes. -3. Compare the two: - - visible in screenshot and present in snapshot: exposed to accessibility - - visible in screenshot and missing from snapshot: likely accessibility gap -4. If you suspect the node exists in AX but is filtered from interactive output, retry with `snapshot --raw`. - -Example: - -```bash -agent-device screenshot /tmp/accessibility-screen.png -agent-device snapshot -i -``` - -Use `screenshot` as the visual source of truth and `snapshot` as the accessibility source of truth for this audit. - -## Batch only when the sequence is already known - -Use `batch` when a short command sequence is already planned and belongs to one logical screen flow. - -```bash -agent-device batch --session sim --platform ios --steps-file /tmp/batch-steps.json --json -``` - -- Keep batch size moderate, roughly 5 to 20 steps. -- Add `wait` or `is exists` guards after mutating steps. -- Do not use `batch` for highly dynamic flows that need replanning after each step. 
- -Example: known chat-send flow - -```json -[ - { "command": "open", "positionals": ["ChatApp"], "flags": { "platform": "android" } }, - { "command": "click", "positionals": ["label=\"Travel chat\""], "flags": {} }, - { "command": "wait", "positionals": ["label=\"Message\"", "3000"], "flags": {} }, - { "command": "fill", "positionals": ["label=\"Message\"", "Filed the expense"], "flags": {} }, - { "command": "press", "positionals": ["label=\"Send\""], "flags": {} } -] -``` - -Step payload contract: - -```json -[ - { "command": "open", "positionals": ["Settings"], "flags": { "platform": "ios" } }, - { "command": "wait", "positionals": ["label=\"Privacy & Security\"", "3000"], "flags": {} }, - { "command": "click", "positionals": ["label=\"Privacy & Security\""], "flags": {} }, - { "command": "get", "positionals": ["text", "label=\"Tracking\""], "flags": {} } -] -``` - -- `positionals` is optional and defaults to `[]`. -- `flags` is optional and defaults to `{}`. -- Only `command`, `positionals`, `flags`, and `runtime` are accepted as top-level step keys. -- Nested `batch` and `replay` are rejected. -- Supported error mode is stop-on-first-error. - -Response handling: - -- Success returns fields such as `total`, `executed`, `totalDurationMs`, and `results[]`. -- Human-mode `batch` runs also print a short per-step success summary. -- Failed runs include `details.step`, `details.command`, `details.executed`, and `details.partialResults`. -- Replan from the first failing step instead of rerunning the whole flow blindly. 
- -Canonical batch recipe: open app -> open action menu -> choose option -> verify - -```json -[ - { "command": "open", "positionals": ["com.example.app"], "flags": { "platform": "android" } }, - { "command": "wait", "positionals": ["text", "Home", "3000"], "flags": {} }, - { "command": "press", "positionals": ["label=\"More actions\" role=button"], "flags": {} }, - { "command": "wait", "positionals": ["text", "Camera scan", "2000"], "flags": {} }, - { "command": "press", "positionals": ["label=\"Camera scan\""], "flags": {} }, - { "command": "wait", "positionals": ["text", "Expense created", "15000"], "flags": {} }, - { "command": "is", "positionals": ["visible", "label=\"Expense created\""], "flags": {} } -] -``` - -Common batch error categories: - -- `INVALID_ARGS`: fix the payload shape and retry. -- `SESSION_NOT_FOUND`: open or select the correct session, then retry. -- `UNSUPPORTED_OPERATION`: switch to a supported command or surface. -- `AMBIGUOUS_MATCH`: refine the selector or locator, then retry the failed step. -- `DEVICE_IN_USE`: the device is held by another session — close or reuse the existing session before retrying. -- `COMMAND_FAILED`: add sync guards and retry from the failing step. - -## Stop conditions - -- If refs drift after transitions, switch to selectors. -- If a desktop surface or context menu is involved on macOS, load [macos-desktop.md](macos-desktop.md). -- If logs, network, alerts, or setup failures become the blocker, switch to [debugging.md](debugging.md). -- If the flow is stable and you need proof or replay maintenance, switch to [verification.md](verification.md). 
diff --git a/skills/agent-device/references/macos-desktop.md b/skills/agent-device/references/macos-desktop.md deleted file mode 100644 index a411d370b..000000000 --- a/skills/agent-device/references/macos-desktop.md +++ /dev/null @@ -1,88 +0,0 @@ -# macOS Desktop - -## When to open this file - -Open this file only when `--platform macos` is involved or the task needs `frontmost-app`, `desktop`, or `menubar` surfaces. - -## Main commands to reach for first - -- `open --platform macos` -- `open --platform macos --surface frontmost-app|desktop|menubar` -- `snapshot -i` -- `get` -- `is` -- `click --button secondary` - -## Most common mistake to avoid - -Do not treat every macOS surface the same. Use the normal `app` surface when you want to act inside one app. Use `frontmost-app`, `desktop`, or `menubar` mainly to inspect what is visible before switching back to `app` for most interactions. - -## Canonical loop - -```bash -agent-device open TextEdit --platform macos -agent-device snapshot -agent-device close -``` - -## Surface rules - -- `app`: default surface and the normal choice for `click`, `fill`, `press`, `scroll`, `screenshot`, and `record`. -- `frontmost-app`: inspect the currently focused app without naming it first. -- `desktop`: inspect visible desktop windows across apps. -- `menubar`: inspect the active app menu bar and system menu extras. Use `open --platform macos --surface menubar` when you need one menu bar app's extras, such as a status-item app. -- Menu bar apps can expose a sparse or empty default `app` tree. Prefer the `menubar` surface first when the app lives entirely in the top bar. - -Use inspect-first surfaces to understand desktop-global UI, then switch back to `app` when you need to act in one app. - -## Snapshot expectations - -- `snapshot -i` should describe UI visible to a human. -- `desktop` snapshots can include multiple windows from multiple apps. -- `menubar` snapshots can include both app-menu items and system menu extras. 
-- Finder-style rows, sidebar items, toolbar controls, search fields, and opened context menus should appear when visible. -- Finder and other native apps may expose duplicate-looking row, cell, and child text nodes. Treat them as distinct AX nodes unless you have a stronger selector anchor. - -## Context menus - -Context menus are not ambient UI. Open them explicitly, then re-snapshot. - -```bash -agent-device click @e66 --button secondary --platform macos -agent-device snapshot -i -``` - -Expected loop: - -1. Snapshot visible content. -2. Secondary-click the target item. -3. Snapshot again. -4. Interact with the new `menu-item` nodes. - -## Targeting rules - -- Prefer selectors or `@ref` values over raw coordinates. -- On macOS, window position can vary across runs, so coordinate-only flows are fragile. -- If the task only needs shared exploration rules, return to [exploration.md](exploration.md). - -Selector guidance: - -- Good selectors usually anchor on stable labels or app-owned identifiers such as `label="Downloads"` or `role=menu-item label="Rename"`. -- Avoid relying on framework-generated `_NS:*` identifiers as stable selectors. - -Use `snapshot --raw --platform macos` only when debugging AX structure or collector filtering. Do not make raw snapshots the default agent loop. - -Things not to rely on: - -- Mobile-only helpers such as `install`, `reinstall`, or `push`. -- Desktop-global click, fill, or gesture parity from `desktop` or `menubar` sessions. -- Raw coordinate assumptions across runs. - -Troubleshooting: - -- If visible content is missing from `snapshot -i`, re-snapshot after the UI settles. -- If `desktop` is too broad, retry with `frontmost-app`. -- If `menubar` is missing the expected menu, retry with `open --platform macos --surface menubar` for menu bar apps, or make the app frontmost first and retry the generic menubar surface. -- If the wrong menu opened, retry secondary-clicking the row or cell wrapper rather than the nested text node. 
-- If the app has multiple windows, make the correct window frontmost before relying on refs. -- If overriding the local helper, set `AGENT_DEVICE_MACOS_HELPER_BIN` to an absolute executable path; relative helper paths are rejected. diff --git a/skills/agent-device/references/remote-tenancy.md b/skills/agent-device/references/remote-tenancy.md deleted file mode 100644 index 0fecd8195..000000000 --- a/skills/agent-device/references/remote-tenancy.md +++ /dev/null @@ -1,189 +0,0 @@ -# Remote Tenancy - -## When to open this file - -Open this file for remote daemon HTTP flows that let an agent running in a Linux sandbox talk to another `agent-device` instance on a remote macOS host in order to control devices that are not available locally. This file covers daemon URL setup, authentication, `connect`, tenant lease scope, and remote Metro companion lifecycle. - -## Main commands to reach for first - -- `agent-device connect --remote-config ` -- `agent-device install-from-source --remote-config --platform android` -- `agent-device install-from-source --github-actions-artifact /: --remote-config --platform android` -- `agent-device open --remote-config --relaunch` -- `agent-device metro reload --remote-config ` -- `agent-device snapshot --remote-config -i` -- `agent-device disconnect --remote-config ` -- `agent-device connection status` -- `agent-device auth status` -- `AGENT_DEVICE_DAEMON_AUTH_TOKEN=adc_live_...` for CI/service-token automation - -## Most common mistake to avoid - -Do not mix an arbitrary `--session` plus ad-hoc daemon, tenant, run, or lease flags. That can bypass saved Metro runtime hints. Use one of these patterns instead: - -- Interactive flow: run `connect --remote-config ` once, then normal commands, then `disconnect`. -- Script flow: pass the same `--remote-config ` to every command, including `disconnect`. - -## Choose one flow - -### Interactive flow - -Use this when the agent will run several commands in one session. 
- -```bash -agent-device connect --remote-config ./remote-config.json - -ARTIFACT_URL="" -agent-device install-from-source "$ARTIFACT_URL" --platform android -agent-device open com.example.app --relaunch -agent-device metro reload -agent-device snapshot -i -agent-device fill @e3 "test@example.com" -agent-device disconnect -``` - -After `connect`, normal commands use the active remote connection. If cloud credentials are missing, `connect` starts login automatically in an interactive local shell and stores a revocable CLI session that silently mints short-lived `adc_agent_...` command tokens. Linux sandboxes, CI, and other non-interactive shells should set `AGENT_DEVICE_DAEMON_AUTH_TOKEN=adc_live_...` instead. The cloud side remains responsible for token expiry, tenant/run claim checks, revocation, one-time device approval, and polling rate limits. End with `disconnect` to release the lease and stop the owned Metro companion. - -### Self-contained script flow - -Use this when each command must be explicit and repeatable. Pass the same `--remote-config` to each step. - -```bash -ARTIFACT_URL="" - -agent-device install-from-source "$ARTIFACT_URL" \ - --remote-config ./remote-config.json \ - --platform android - -agent-device open com.example.app \ - --remote-config ./remote-config.json \ - --relaunch - -agent-device snapshot \ - --remote-config ./remote-config.json \ - -i - -agent-device disconnect \ - --remote-config ./remote-config.json -``` - -The first command that needs a lease or Metro runtime prepares and persists it. Later commands with the same `--remote-config` reuse that state. End with `disconnect --remote-config ` to release the lease and stop the owned Metro companion. - -## Behavior summary - -- `connect` stores local non-secret connection state and defers tenant lease allocation plus Metro preparation until a later command needs them. 
-- Commands such as `install-from-source`, `open`, `snapshot`, `devices`, and `apps` allocate or refresh the lease when needed. -- `open` prepares Metro runtime hints when the remote profile has Metro fields and no compatible runtime is already saved. -- `metro reload` reuses saved Metro runtime hints and asks Metro to reload connected React Native apps without restarting the native process. -- `batch` also prepares Metro when any step opens an app and that step does not provide its own runtime. -- `disconnect` closes the session when possible, stops the Metro companion owned by the connection, releases the lease when one was allocated, and removes local connection state. - -Remote install examples: - -```bash -agent-device install com.example.app ./app.apk -ARTIFACT_URL="" -agent-device install-from-source "$ARTIFACT_URL" --platform android -``` - -- Use `install` or `reinstall` for local paths; remote daemons upload local artifacts automatically. -- Use `install-from-source` only for trusted, operator-approved artifact URLs the remote daemon can reach. Do not fetch arbitrary user-supplied URLs. -- Use `install-from-source --github-actions-artifact /:` when the remote daemon has repository credentials and supports daemon-resolved GitHub Actions artifacts. -- For local-path versus URL artifact rules, follow [bootstrap-install.md](bootstrap-install.md). - -Use `agent-device connection status --session adc-android` to inspect the active connection without reading JSON state manually. Status output must not include auth tokens. 
- -## Remote config shape - -Example `remote-config.json` shape: - -```json -{ - "daemonBaseUrl": "https://bridge.example.com/agent-device", - "daemonTransport": "http", - "tenant": "acme", - "runId": "run-123", - "sessionIsolation": "tenant", - "platform": "ios", - "metroProxyBaseUrl": "https://bridge.example.com" -} -``` - -Optional overrides stay available for advanced cases: - -```json -{ - "session": "adc-ios", - "leaseBackend": "ios-instance", - "metroProjectRoot": ".", - "metroKind": "expo", - "metroPublicBaseUrl": "http://127.0.0.1:8081" -} -``` - -- Keep service tokens in env/config managed by the operator boundary. Do not persist auth tokens in connection state. Human login uses `agent-device auth login` or implicit `connect` login and stores only the CLI session credential. -- Omit Metro fields for non-React Native flows. -- Put `tenant`, `runId`, and `sessionIsolation` in the remote profile so agents can run `agent-device connect --remote-config ./remote-config.json` without extra scope flags. Add `platform`, `leaseBackend`, `session`, or Metro overrides only when the default inference is not enough for that flow. -- Explicit command-line flags override connected defaults. Use them intentionally when switching session, platform, target, tenant, run, or lease scope. -- For React Native Metro runs with `metroProxyBaseUrl`, `agent-device >= 0.11.12` can manage the local companion tunnel, but Metro itself still needs to be running locally. `metroProxyBaseUrl` is the bridge origin, not a prebuilt `/api/metro/...` route. -- Set `AGENT_DEVICE_CLOUD_BASE_URL` to the bridge/control-plane API origin. It does not need to be the dashboard origin; `/api-keys` on the bridge can redirect to the dashboard for service-token setup. -- For cloud stock React Native iOS, use the bridge descriptor's wildcard HTTPS Metro hints directly; do not install or launch the XCTest runner just to make Metro reachable. 
-- Android keeps using bridge-provided `/api/metro/runtimes//...` Metro routes. -- `metroPublicBaseUrl` is only needed for direct/non-bridge bundle hints. Bridged profiles can omit it. -- Use a lease backend that matches the bridge target platform, for example `android-instance`, `ios-instance`, or an explicit `--lease-backend` override. - -## Transport prerequisites - -- Start the daemon in HTTP mode with `AGENT_DEVICE_DAEMON_SERVER_MODE=http|dual` on the host. -- Point the profile or env at the remote host with `daemonBaseUrl` or `AGENT_DEVICE_DAEMON_BASE_URL=http(s)://host:port[/base-path]`. -- For humans, run `connect --remote-config ` and let it refresh or create the CLI session. Use `agent-device auth status` to inspect it and `agent-device auth logout` to remove it. -- For CI/non-interactive shells, set `AGENT_DEVICE_DAEMON_AUTH_TOKEN=adc_live_...` or pass `--daemon-auth-token`. The client does not start device-code polling in CI by default. -- Prefer an auth hook such as `AGENT_DEVICE_HTTP_AUTH_HOOK` when the host needs caller validation or tenant injection. - -## Lease debug fallback - -The main agent flow should use `connect` and `connection status`. For daemon-side auth, scope, or lease debugging, inspect host-side daemon logs and operator tooling instead of issuing raw daemon RPC from the agent shell. - -## GitHub Actions artifact install - -Use this when a compatible remote daemon resolves GitHub Actions artifacts server-side. Do not download CI artifacts locally or add a local `GITHUB_TOKEN` just to install CI output. 
- -Artifact ID shape: - -```bash -agent-device install-from-source \ - --github-actions-artifact OWNER/REPO:1234567890 \ - --remote-config ./remote-config.json \ - --platform android -``` - -Artifact-name shape: - -```bash -agent-device install-from-source \ - --github-actions-artifact OWNER/REPO:app-debug \ - --remote-config ./remote-config.json \ - --platform ios -``` - -Config shape: - -```json -{ - "installSource": { - "type": "github-actions-artifact", - "repo": "OWNER/REPO", - "artifact": "app-debug" - } -} -``` - -Numeric artifacts are passed as artifact IDs. Non-numeric artifacts are passed as artifact names. - -## Failure semantics and trust notes - -- Missing tenant, run, or lease fields in tenant-isolation mode should fail as `INVALID_ARGS`. -- Inactive or scope-mismatched leases should fail as `UNAUTHORIZED`. -- Inspect logs on the remote host during remote debugging. Client-side `--debug` does not tail a local daemon log once `AGENT_DEVICE_DAEMON_BASE_URL` is set. -- Do not point `AGENT_DEVICE_DAEMON_BASE_URL` at untrusted hosts. Remote daemon requests can launch apps and execute interaction commands. -- Treat daemon auth tokens and lease identifiers as sensitive operational data. diff --git a/skills/agent-device/references/verification.md b/skills/agent-device/references/verification.md deleted file mode 100644 index b74da377d..000000000 --- a/skills/agent-device/references/verification.md +++ /dev/null @@ -1,134 +0,0 @@ -# Verification - -## When to open this file - -Open this file when the task needs evidence, regression checks, replay maintenance, or session performance measurements after the main interaction flow is already working. - -## Main commands to reach for first - -- `screenshot` -- `diff snapshot` -- `diff screenshot` -- `record` -- `replay -u` -- `perf` - -## Most common mistake to avoid - -Do not use verification tools as the first exploration step. 
First get the app into the correct state with the normal interaction flow, then capture proof or maintain replay assets. - -## Canonical loop - -```bash -agent-device open Settings --platform ios -# after using exploration to reach the state you want to verify -agent-device snapshot -agent-device screenshot /tmp/settings-proof.png --overlay-refs -agent-device close -``` - -## Structural verification with diff snapshot - -Use `diff snapshot` when you need a compact view of how the UI changed between nearby states. - -```bash -agent-device snapshot -i -agent-device press @e5 -agent-device diff snapshot -i -``` - -- Initialize the baseline at a stable point. -- Perform the mutation. -- Run `diff snapshot` to confirm the expected structural change. -- Re-run full `snapshot` only when you need fresh refs. - -## Screenshot artifacts - -Use `screenshot` when the proof needs a rendered image instead of a structural tree. - -- Add `--max-size 1024` when a full-resolution screenshot is too large for an agent, model, or chat attachment. -- Add `--overlay-refs` when you want the saved PNG to show fresh `@eN` refs burned into the screenshot. -- Combine them as `screenshot /tmp/proof.png --max-size 1024 --overlay-refs` when you need a smaller visual proof that still includes tappable refs. -- Avoid very small `--max-size` values when text, icons, or labels need to remain readable. - -## Visual regression with diff screenshot - -Use `diff screenshot` when comparing the current rendered screen against a saved visual baseline. - -```bash -agent-device diff screenshot --baseline ./baseline.png --out /tmp/diff.png -agent-device diff screenshot --baseline ./baseline.png ./current.png --out /tmp/diff.png -agent-device diff screenshot --baseline ./baseline.png --out /tmp/diff.png --overlay-refs -``` - -- Text output includes ranked changed regions with screen-space rectangles, shape, size, density, average color, and luminance. JSON also includes normalized bounds. 
-- The diff PNG uses a light grayscale current-screen context with changed pixels tinted red and changed regions outlined. -- When a current image path is provided, `diff screenshot` compares the two saved files instead of capturing from the live device or requiring an active session. -- Install `tesseract` when you want `diff screenshot` to add best-effort OCR text deltas, movement clusters, and bbox size-change hints. OCR improves the text/JSON descriptions only; it does not change the pixel comparison or the diff PNG. -- When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the pixel diff and clustering the remaining residuals. Treat these as hints for icons, controls, and separators, not semantic icon recognition. -- Add `--overlay-refs` to `diff screenshot` when you also want a separate current-screen overlay guide for a live capture. The raw screenshot is still used for pixel comparison; the overlay guide is only context for non-text controls, icons, and tappable regions. When overlay refs intersect changed regions, the output lists the best current-screen ref matches under the affected region. Saved-image comparisons do not have live accessibility refs, so omit `--overlay-refs` when passing a current image path. - -## Session recording - -Use `record` for debugging, documentation, or shareable verification artifacts. - -```bash -agent-device record start ./recordings/ios.mov -agent-device open App -agent-device snapshot -i -agent-device press @e3 -agent-device close -agent-device record stop -``` - -- `record` supports iOS simulators, iOS devices, and Android. -- On iOS, recording is a wrapper around `simctl` for simulators and the corresponding device capture path for physical devices. -- On Android, recording is a wrapper around `adb`. -- Recording writes a video artifact and a gesture-telemetry sidecar JSON. 
-- Use `record start --quality 5` when a smaller video is easier to inspect or share. The scale is 5-10, where 10 is native resolution; omit it to preserve native/current resolution. -- On macOS hosts, touch overlay burn-in is available for supported recordings. -- On non-macOS hosts, recording still succeeds but the video stays raw and `record stop` can return an `overlayWarning`. -- If the agent already knows the interaction sequence and wants a more lifelike, uninterrupted recording, drive the flow with `batch` while recording instead of replanning between each step. - -Example: - -```bash -agent-device record start ./recordings/smoke.mov -agent-device batch --session sim --platform ios --steps-file /tmp/smoke-steps.json --json -agent-device record stop -``` - -- Use this only after exploration has stabilized the flow. -- Keep the batch short and add `wait` or `is exists` guards after mutating steps so the recorded flow still tracks realistic UI timing. - -## Replay maintenance - -Use replay updates when selectors drift but the recorded scenario is still correct. - -```bash -agent-device replay -u ./session.ad -agent-device test ./smoke --platform android -``` - -- Prefer selector-based actions in recorded `.ad` replays. -- Use `test` when you already have multiple `.ad` flows and need a quick regression pass after updating or recording them. -- Keep the skill-level rule simple: use `replay -u` to maintain one script, use `test` to verify a folder or matcher of scripts. -- Treat `test` as a human and CI-facing suite runner that an agent can invoke for verification, not as the main source of product documentation. -- Failed runs keep suite artifacts under `.agent-device/test-artifacts` by default, which is usually enough for debugging without extra agent-side processing. -- Use update mode for maintenance, not as a substitute for fixing a broken interaction strategy. 
- -## Performance checks - -Use `perf --json` or `metrics --json` when you need session performance data for the active session. - -```bash -agent-device open Settings --platform ios -agent-device perf --json -``` - -- `startup` is command round-trip timing around `open`. -- It is not true first-frame or first-interactive telemetry. -- Android app sessions also expose `memory` (`dumpsys meminfo`) and `cpu` (`dumpsys cpuinfo`) snapshots when the session has an app package context. -- Apple app sessions on macOS, iOS simulators, and physical iOS devices also expose `memory` and `cpu` process snapshots when the session has an app bundle ID. -- On physical iOS devices, sampling uses a short `xcrun xctrace` Activity Monitor capture, so keep the device unlocked, connected, and the app active in the foreground while sampling. -- `fps` is still unavailable in this release. diff --git a/skills/dogfood/SKILL.md b/skills/dogfood/SKILL.md index 03fef0cf0..458d8bffa 100644 --- a/skills/dogfood/SKILL.md +++ b/skills/dogfood/SKILL.md @@ -1,184 +1,17 @@ --- name: dogfood -description: 'Systematically explore and test a mobile app on iOS/Android with agent-device to find bugs, UX issues, and other problems. Use when asked to "dogfood", "QA", "exploratory test", "find issues", "bug hunt", or "test this app" on mobile. Produces a structured report with reproducible evidence: screenshots, optional repro videos, and detailed steps for every issue.' +description: Systematically explore and test a mobile app on iOS/Android with agent-device to find bugs, UX issues, and other problems. Use when asked to dogfood, QA, exploratory test, find issues, bug hunt, or test this app on mobile. allowed-tools: Bash(agent-device:*), Bash(npx agent-device:*) --- -# Dogfood (agent-device) +# Dogfood -Systematically explore a mobile app, find issues, and produce a report with full reproduction evidence for every finding. - -## Setup - -Only the **Target app** is required. 
Everything else has sensible defaults. - -| Parameter | Default | Example override | -| -------------------- | ----------------------------------------------------------- | -------------------------------------------- | -| **Target app** | _(required)_ | `Settings`, `com.example.app`, deep link URL | -| **Platform** | Infer from user context; otherwise ask (`ios` or `android`) | `--platform ios` | -| **Session name** | Slugified app/platform (for example `settings-ios`) | `--session my-session` | -| **Output directory** | `./dogfood-output/` | `Output directory: /tmp/mobile-qa` | -| **Scope** | Full app | `Focus on onboarding and profile` | -| **Authentication** | None | `Sign in to user@example.com` | - -If the user gives enough context to start, begin immediately with defaults. Ask follow-up only when a required detail is missing (for example platform or credentials). - -Prefer direct `agent-device` binary when available. - -## Workflow - -``` -1. Initialize Set up session, output dirs, report file -2. Launch/Auth Open app and sign in if needed -3. Orient Capture initial snapshot and map navigation -4. Explore Systematically test flows and states -5. Document Record reproducible evidence per issue -6. Wrap up Reconcile summary, close session -``` - -### 1. Initialize - -```bash -mkdir -p {OUTPUT_DIR}/screenshots {OUTPUT_DIR}/videos -cp {SKILL_DIR}/templates/dogfood-report-template.md {OUTPUT_DIR}/report.md -``` - -### 2. 
Launch/Auth - -Start a named session and launch target app: - -```bash -agent-device --session {SESSION} open {TARGET_APP} --platform {PLATFORM} -agent-device --session {SESSION} snapshot -i -``` - -If login is required: - -```bash -agent-device --session {SESSION} snapshot -i -agent-device --session {SESSION} fill @e1 "{EMAIL}" -agent-device --session {SESSION} fill @e2 "{PASSWORD}" -agent-device --session {SESSION} press @e3 -agent-device --session {SESSION} wait 1000 -agent-device --session {SESSION} snapshot -i -``` - -For OTP/email codes: ask the user, wait for input, then continue. - -### 3. Orient - -Capture initial evidence and navigation anchors: - -```bash -agent-device --session {SESSION} screenshot {OUTPUT_DIR}/screenshots/initial.png -agent-device --session {SESSION} snapshot -i -``` - -Map top-level navigation, tabs, and key workflows before deep testing. - -### 4. Explore - -Read [references/issue-taxonomy.md](references/issue-taxonomy.md) for severity/category calibration. - -Strategy: - -- Move through each major app area (tabs, drawers, settings pages). -- Test core journeys end-to-end (create, edit, delete, submit, recover). -- Validate edge states (empty/error/loading/offline/permissions denied). -- Use `diff snapshot -i` after UI transitions to avoid stale refs. -- Periodically capture `logs path` and inspect the app log when behavior looks suspicious. - -Useful commands per screen: - -```bash -agent-device --session {SESSION} snapshot -i -agent-device --session {SESSION} screenshot {OUTPUT_DIR}/screenshots/{screen-name}.png -agent-device --session {SESSION} appstate -agent-device --session {SESSION} logs path -``` - -### 5. Document Issues (Repro-First) - -Explore and document in one pass. When you find an issue, stop and fully capture evidence before continuing. - -#### Interactive/behavioral issues - -Use video + step screenshots: - -1. 
Start recording: - -```bash -agent-device --session {SESSION} record start {OUTPUT_DIR}/videos/issue-{NNN}-repro.mp4 -``` - -2. Reproduce with visible pacing. Capture each step: - -```bash -agent-device --session {SESSION} screenshot {OUTPUT_DIR}/screenshots/issue-{NNN}-step-1.png -sleep 1 -# perform action -sleep 1 -agent-device --session {SESSION} screenshot {OUTPUT_DIR}/screenshots/issue-{NNN}-step-2.png -``` - -3. Capture final broken state: - -```bash -sleep 2 -agent-device --session {SESSION} screenshot {OUTPUT_DIR}/screenshots/issue-{NNN}-result.png -``` - -4. Stop recording: +Router for exploratory QA. Read current CLI guidance: ```bash -agent-device --session {SESSION} record stop +agent-device help dogfood ``` -5. Append issue immediately to report with numbered steps and screenshot references. - -#### Static/on-load issues - -Single screenshot is sufficient; no video required: - -```bash -agent-device --session {SESSION} screenshot {OUTPUT_DIR}/screenshots/issue-{NNN}.png -``` - -Set **Repro Video** to `N/A` in the report. - -### 6. Wrap Up - -Target 5-10 well-evidenced issues, then finish: - -1. Reconcile summary severity counts in `report.md`. -2. Close session: - -```bash -agent-device --session {SESSION} close -``` - -3. Report total issues, severity breakdown, and highest-risk findings. - -## Guidance - -- Repro quality matters more than issue count. -- Use refs (`@eN`) for fast exploration, selectors for deterministic replay assertions when needed. -- Re-snapshot after any mutation (navigation, modal, list update, form submit). -- Use `fill` for clear-then-type semantics; use `type` for incremental typing behavior checks. -- Keep logs optional and targeted: enable/read app logs only when useful for diagnosis. -- If the issue appears rooted in React Native internals rather than device/app runtime behavior, use `agent-device react-devtools ...` and the `skills/react-devtools` workflow for component-tree or render-profiling inspection. 
-- Never read source code of the app under test; findings must come from observed runtime behavior. -- Write each issue immediately to avoid losing evidence. -- Never delete screenshots/videos/report artifacts during a session. - -## References - -| Reference | When to Read | -| ------------------------------------------------------------ | ----------------------------------------------- | -| [references/issue-taxonomy.md](references/issue-taxonomy.md) | Start of session; severity/categories/checklist | - -## Templates +Loop: open named session -> snapshot -i + screenshot -> explore flows -> capture evidence per issue -> close. -| Template | Purpose | -| ---------------------------------------------------------------------------- | --------------------------------------------- | -| [templates/dogfood-report-template.md](templates/dogfood-report-template.md) | Copy into output directory as the report file | +Target app is required; infer platform or ask. Default output is `./dogfood-output/`. Findings must come from runtime behavior, not source reads. Re-snapshot after mutations. Use logs, network, trace, perf, overlay screenshots, or react-devtools only when they add evidence. diff --git a/skills/dogfood/references/issue-taxonomy.md b/skills/dogfood/references/issue-taxonomy.md deleted file mode 100644 index 27a678d61..000000000 --- a/skills/dogfood/references/issue-taxonomy.md +++ /dev/null @@ -1,83 +0,0 @@ -# Issue Taxonomy (Mobile) - -Reference for categorizing issues found during mobile dogfooding. 
- -## Severity Levels - -| Severity | Definition | -| ------------ | ------------------------------------------------------------------------- | -| **critical** | Blocks a core workflow, causes data loss, or crashes/freeze loops the app | -| **high** | Major feature broken or unusable, no practical workaround | -| **medium** | Feature works with notable friction or partial failure; workaround exists | -| **low** | Minor cosmetic or polish issue | - -## Categories - -### Visual / UI - -- Layout broken, clipped, overlapped, or unreadable text -- Safe-area/notch overlap issues -- Incorrect dark/light appearance rendering -- Missing assets/icons -- Animation glitches or flicker - -### Functional - -- Buttons/controls do nothing or trigger wrong action -- Flows fail (create/edit/delete/submit) -- Navigation dead-ends or wrong destination -- State loss after background/foreground transitions -- Deep link opens wrong screen or fails - -### UX - -- Confusing hierarchy or navigation labels -- Missing loading/progress feedback -- Unclear error handling or no recovery affordance -- Excessive steps for common tasks -- Inconsistent behavior between similar screens - -### Content - -- Typos, incorrect copy, placeholder text -- Wrong labels/help text -- Truncated text with no affordance -- Inconsistent terminology across screens - -### Performance - -- Slow startup or route transitions -- Input lag or gesture jank -- Scroll hitches/frame drops -- Notable battery/thermal symptoms during basic usage - -### Diagnostics / Logs - -- Native crashes or repeated fatal exceptions -- Repeated warnings correlated with broken behavior -- Unhandled runtime errors visible during repro - -### Permissions / Platform - -- Permission prompt flow broken or loops forever -- Denied permissions not handled gracefully -- Platform-specific regressions (iOS-only or Android-only) -- Background/foreground lifecycle regressions - -### Accessibility - -- Missing labels or incorrect accessibility names -- 
Focus order/navigation issues for assistive tech -- Low contrast or unreadable text scaling -- Touch targets too small for reliable interaction - -## Exploration Checklist - -1. Visual scan: capture screenshot; verify layout/safe areas/text/icon rendering. -2. Interactions: press controls, open menus/modals, validate expected response. -3. Forms/input: test valid/invalid/empty/boundary input. -4. Navigation: traverse all top-level sections and return paths. -5. App states: loading/empty/error/offline/permission-denied/background-resume. -6. Logs/diagnostics: inspect app logs when behavior is suspicious. -7. Platform parity: verify critical flows on each requested platform. -8. Accessibility basics: labels, touch target sizes, readability/contrast. diff --git a/skills/dogfood/templates/dogfood-report-template.md b/skills/dogfood/templates/dogfood-report-template.md deleted file mode 100644 index fd11566a6..000000000 --- a/skills/dogfood/templates/dogfood-report-template.md +++ /dev/null @@ -1,52 +0,0 @@ -# Dogfood Report: {APP_NAME} - -| Field | Value | -| -------------- | -------------- | -| **Date** | {DATE} | -| **Platform** | {PLATFORM} | -| **Target App** | {TARGET_APP} | -| **Session** | {SESSION_NAME} | -| **Scope** | {SCOPE} | - -## Summary - -| Severity | Count | -| --------- | ----- | -| Critical | 0 | -| High | 0 | -| Medium | 0 | -| Low | 0 | -| **Total** | **0** | - -## Issues - - - -### ISSUE-001: {Short title} - -| Field | Value | -| ------------------ | -------------------------------------------------------------------------------------------- | -| **Severity** | critical / high / medium / low | -| **Category** | visual / functional / ux / content / performance / diagnostics / permissions / accessibility | -| **Screen / Route** | {screen where issue was found} | -| **Repro Video** | {path to video, or N/A for static issues} | - -**Description** - -{What is wrong, what was expected, and what actually happened.} - -**Repro Steps** - -1. 
Open {screen/entry point} - ![Step 1](screenshots/issue-001-step-1.png) - -2. {Action} - ![Step 2](screenshots/issue-001-step-2.png) - -3. {Action} - ![Step 3](screenshots/issue-001-step-3.png) - -4. **Observe:** {broken behavior} - ![Result](screenshots/issue-001-result.png) - ---- diff --git a/skills/react-devtools/SKILL.md b/skills/react-devtools/SKILL.md index 521992353..4cdaa7557 100644 --- a/skills/react-devtools/SKILL.md +++ b/skills/react-devtools/SKILL.md @@ -1,55 +1,31 @@ --- name: react-devtools -description: Inspect and profile React Native component trees from agent-device. Use when debugging React Native props, state, hooks, render causes, slow components, excessive re-renders, or questions like why a component re-rendered. +description: Inspect and profile React Native component trees from agent-device. Use when debugging React Native props, state, hooks, render causes, slow components, excessive rerenders, or questions like why a component rerendered. --- # react-devtools -Use this skill when the task needs React Native internals that are not visible in the accessibility tree: component hierarchy, props, state, hooks, render causes, or profiling data. +Router for React Native internals. Read current CLI guidance: -Run commands through `agent-device react-devtools`. The command dynamically runs pinned `agent-react-devtools@0.4.0` and passes arguments through 1:1. - -The first run may download the pinned package from npm. `agent-device` global flags work before or after `react-devtools`; use `--` before downstream flags only when they intentionally share an `agent-device` global flag name. - -## Default flow - -1. Use `agent-device` to open the React Native app and verify the visible state when needed. -2. Check `agent-device react-devtools status`. -3. If no app is connected, start or wait for the devtools daemon, then reload or relaunch the app. -4. Inspect with `get tree`, `find`, and `get component`. -5. 
Profile only around the interaction being investigated. -6. Verify the fix with the same command sequence and interaction. +```bash +agent-device help react-devtools +``` -For cross-platform validation with explicit `--device`, `--udid`, or `--serial` selectors, prefer an isolated `--state-dir` over separate named sessions. Named sessions enable bound-session locks during setup. Restart `agent-device react-devtools` between iOS and Android runs so `status`, `get tree`, and profiling clearly refer to the currently launched app. +Use `agent-device react-devtools ...` for component tree, props, state, hooks, render ownership, slow components, or rerenders. It dynamically runs pinned `agent-react-devtools@0.4.0`. Use normal `agent-device` commands for visible UI, refs, screenshots, logs, network, or perf. -## Main commands +Core loop: ```bash agent-device react-devtools status agent-device react-devtools wait --connected agent-device react-devtools get tree --depth 3 -agent-device react-devtools find <name> -agent-device react-devtools get component @c5 agent-device react-devtools profile start +# perform the interaction with normal agent-device commands agent-device react-devtools profile stop agent-device react-devtools profile slow --limit 5 agent-device react-devtools profile rerenders --limit 5 ``` -## Decision rules - -- Need current UI text, refs, screenshots, logs, network, or device metrics: use the `agent-device` skill. -- Need props, state, hooks, component ownership, render causes, or React profiler data: use this skill. -- Start component-tree reads with `get tree --depth 3` or `find <name>` to keep output bounded. -- Labels like `@c5` reset when the app reloads or components remount. After reload, run `wait --connected` and inspect again. -- Profiling only captures renders between `profile start` and `profile stop`. -- On Android, set `adb reverse tcp:8097 tcp:8097` for React DevTools. If Metro is local, also set `adb reverse tcp:8081 tcp:8081`.
-- For Android sessions connected through `agent-device connect --remote-config`, run `agent-device react-devtools ...` normally. The CLI registers a bridge companion tunnel to the local DevTools daemon on `127.0.0.1:8097` and unregisters it when the command exits. -- Remote Android React DevTools assumes the React Native-bundled DevTools behavior in React Native 0.83+. Do not assume older browser/Chromium DevTools workflows exist in remote sandboxes. For Expo apps, verify the SDK's bundled React Native version and runtime behavior first; no Expo SDK version is separately verified by this skill. - -## References +Rules: -| File | When to read | -| --------------------------------------- | --------------------------------------------- | -| [commands.md](references/commands.md) | Command reference and common inspection flows | -| [profiling.md](references/profiling.md) | Render profiling workflow and interpretation | +Keep reads bounded with `--depth`/`find`, treat `@c` refs as reload-local, profile only the investigated interaction, and run the same command in remote Android sessions; the CLI manages the companion tunnel. diff --git a/skills/react-devtools/references/commands.md b/skills/react-devtools/references/commands.md deleted file mode 100644 index 8b2cc11f5..000000000 --- a/skills/react-devtools/references/commands.md +++ /dev/null @@ -1,91 +0,0 @@ -# React DevTools Commands - -All commands are run through `agent-device react-devtools`. - -## Connection - -```bash -agent-device react-devtools start -agent-device react-devtools stop -agent-device react-devtools status -agent-device react-devtools wait --connected --timeout 30 -agent-device react-devtools wait --component --timeout 30 -``` - -- `status` shows the daemon port, connected apps, component count, profiling state, uptime, and last connection event. -- Most commands auto-start the daemon, but `start` is useful before launching or reloading the app. 
-- React Native development builds connect to the daemon on port 8097. For Android emulators or physical devices, use `adb reverse tcp:8097 tcp:8097` if the app cannot reach the host. If the app also uses local Metro, set `adb reverse tcp:8081 tcp:8081`. - -## Validation Notes - -- When validating the same app across iOS and Android with explicit `--device`, `--udid`, or `--serial` selectors, prefer an isolated `--state-dir` over separate named sessions. A named `--session` enables bound-session lock behavior, so setup commands with explicit target selectors can be rejected. -- Restart the React DevTools daemon between platforms so `status`, `get tree`, and profiling output belong to the currently launched app. -- Verify the app is visibly loaded with `snapshot` before collecting React internals. Use `react-devtools` for component state and profiling, not for proving the device/app surface is open. - -## Component Inspection - -```bash -agent-device react-devtools get tree --depth 3 -agent-device react-devtools get component @c5 -agent-device react-devtools find Button -agent-device react-devtools find Button --exact -agent-device react-devtools count -agent-device react-devtools errors -``` - -- `get tree` prints a component hierarchy with labels like `@c1`, `@c2`. -- Use `--depth` on large apps. Start at `--depth 3` or `--depth 4`. -- `get component` accepts a label or numeric React fiber id and shows props, state, and hooks. -- `find` searches by display name. Use `--exact` when fuzzy results are noisy. -- `errors` lists components with React-tracked warnings or errors. 
- -## Profiling - -```bash -agent-device react-devtools profile start "interaction name" -agent-device react-devtools profile stop -agent-device react-devtools profile slow --limit 5 -agent-device react-devtools profile rerenders --limit 5 -agent-device react-devtools profile report @c5 -agent-device react-devtools profile timeline --limit 20 -agent-device react-devtools profile commit 3 -agent-device react-devtools profile export profile.json -agent-device react-devtools profile diff before.json after.json --limit 10 -``` - -- `profile slow` ranks components by average render duration. -- `profile rerenders` ranks components by render count. -- `profile report @cN` shows render causes and changed props/state/hooks for one component. -- `profile timeline` lists commits. Use `--limit` and `--offset` for long sessions. -- `profile export` writes React DevTools Profiler JSON that can be diffed later. - -## Common Flows - -Inspect a component: - -```bash -agent-device react-devtools status -agent-device react-devtools get tree --depth 3 -agent-device react-devtools find SearchScreen -agent-device react-devtools get component @c12 -``` - -Profile a slow interaction: - -```bash -agent-device react-devtools profile start "slow search" -# Trigger the interaction with agent-device or ask the user to perform it. -agent-device react-devtools profile stop -agent-device react-devtools profile slow --limit 5 -agent-device react-devtools profile rerenders --limit 5 -``` - -Verify a render fix: - -```bash -agent-device react-devtools profile start "after fix" -# Repeat the same interaction. 
-agent-device react-devtools profile stop -agent-device react-devtools profile slow --limit 5 -agent-device react-devtools profile rerenders --limit 5 -``` diff --git a/skills/react-devtools/references/profiling.md b/skills/react-devtools/references/profiling.md deleted file mode 100644 index ee77d706c..000000000 --- a/skills/react-devtools/references/profiling.md +++ /dev/null @@ -1,74 +0,0 @@ -# React Native Profiling - -Use this workflow when the user reports slow interactions, excessive re-renders, unstable props, or unclear render causes. - -## Baseline - -```bash -agent-device react-devtools status -agent-device react-devtools count -agent-device react-devtools get tree --depth 3 -``` - -If the app is not connected, run: - -```bash -agent-device react-devtools start -agent-device react-devtools wait --connected -``` - -Then reload or relaunch the React Native app if needed. - -## Capture One Interaction - -```bash -agent-device react-devtools profile start "short label" -# Trigger exactly the interaction being investigated. -agent-device react-devtools profile stop -``` - -Keep the profiling window narrow. Extra navigation, warm-up work, or unrelated gestures make the report harder to interpret. - -## Identify Suspects - -```bash -agent-device react-devtools profile slow --limit 5 -agent-device react-devtools profile rerenders --limit 5 -``` - -- A component with high average render time is a slow-render suspect. -- A component with high render count is a re-render suspect. -- A component can be both. - -## Drill In - -```bash -agent-device react-devtools profile report @c12 -agent-device react-devtools get component @c12 -``` - -Use `profile report` to identify render causes and changed keys. Use `get component` to inspect current props, state, and hooks. 
- -Common interpretations: - -| Signal | Meaning | Typical follow-up | -| ------------------------------------------ | ----------------------------------- | ---------------------------------------------- | -| `props-changed` with function props | Parent may pass unstable callbacks | Check whether the parent can use `useCallback` | -| `props-changed` with object or array props | Parent may pass unstable references | Check whether the parent can use `useMemo` | -| `parent-rendered` with many child renders | Child has no bailout | Check whether `React.memo` is appropriate | -| `state-changed` | Component state caused the render | Check whether the state update is necessary | -| `hooks-changed` | Hook value or dependency changed | Inspect hook values and dependencies | - -## Verify - -After making a change, repeat the same interaction: - -```bash -agent-device react-devtools profile start "after fix" -# Repeat the same interaction. -agent-device react-devtools profile stop -agent-device react-devtools profile slow --limit 5 -agent-device react-devtools profile rerenders --limit 5 -``` - -Compare render counts, average durations, changed keys, and commit counts against the baseline. 
diff --git a/src/__tests__/cli-help.test.ts b/src/__tests__/cli-help.test.ts index ce6effe16..337a70c26 100644 --- a/src/__tests__/cli-help.test.ts +++ b/src/__tests__/cli-help.test.ts @@ -47,8 +47,18 @@ test('help react-devtools prints passthrough command help and skips daemon dispa const result = await runCliCapture(['help', 'react-devtools']); assert.equal(result.code, 0); assert.equal(result.calls.length, 0); - assert.match(result.stdout, /Usage:\n agent-device react-devtools \[\.\.\.args\]/); - assert.match(result.stdout, /React Native component trees/); + assert.match(result.stdout, /agent-device help react-devtools/); + assert.match(result.stdout, /React Native internals/); + assert.match(result.stdout, /agent-device react-devtools status/); +}); + +test('help workflow prints agent workflow topic and skips daemon dispatch', async () => { + const result = await runCliCapture(['help', 'workflow']); + assert.equal(result.code, 0); + assert.equal(result.calls.length, 0); + assert.match(result.stdout, /agent-device help workflow/); + assert.match(result.stdout, /Core loop:/); + assert.match(result.stdout, /Do not use CSS selectors/); }); test('help unknown command prints error plus global usage and skips daemon dispatch', async () => { diff --git a/src/utils/__tests__/args.test.ts b/src/utils/__tests__/args.test.ts index a260f74f0..c447ddac7 100644 --- a/src/utils/__tests__/args.test.ts +++ b/src/utils/__tests__/args.test.ts @@ -783,12 +783,12 @@ test('usage includes only global flags in the top-level flags section', () => { assert.doesNotMatch(usageText, /--metadata/); }); -test('usage includes skills, config, environment, and examples footers', () => { +test('usage includes agent workflows, config, environment, and examples footers', () => { const usageText = usage(); - assert.match(usageText, /Agent Skills:/); - assert.match(usageText, /agent-device\s+Canonical mobile automation flows/); - assert.match(usageText, /dogfood\s+Exploratory QA and bug hunts/); - 
assert.match(usageText, /See `skills\/\/SKILL\.md` in the installed package\./); + assert.match(usageText, /Agent Workflows:/); + assert.match(usageText, /help workflow\s+Normal bootstrap, exploration, and validation loop/); + assert.match(usageText, /help debugging\s+Logs, network, alerts, diagnostics, and traces/); + assert.match(usageText, /help react-devtools\s+React Native component tree and render profiling/); assert.match(usageText, /Configuration:/); assert.match( usageText, @@ -811,6 +811,15 @@ test('usage includes skills, config, environment, and examples footers', () => { assert.match(usageText, /agent-device test \.\/suite --platform android/); }); +test('usageForCommand resolves workflow help topic', () => { + const help = usageForCommand('workflow'); + if (help === null) throw new Error('Expected workflow help text'); + assert.match(help, /agent-device help workflow/); + assert.match(help, /Use selectors as positional targets/); + assert.match(help, /Do not use CSS selectors/); + assert.match(help, /help react-devtools/); +}); + test('apps defaults to --all filter and allows overrides', () => { const defaultFilter = parseArgs(['apps'], { strictFlags: true }); assert.equal(defaultFilter.command, 'apps'); diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index e93232f3d..d69275e86 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -158,10 +158,13 @@ const SELECTOR_SNAPSHOT_FLAGS = [ const FIND_SNAPSHOT_FLAGS = ['snapshotDepth', 'snapshotRaw'] as const satisfies readonly FlagKey[]; -const AGENT_SKILLS = [ - { label: 'agent-device', description: 'Canonical mobile automation flows' }, - { label: 'react-devtools', description: 'React Native component tree and render profiling' }, - { label: 'dogfood', description: 'Exploratory QA and bug hunts' }, +const AGENT_WORKFLOWS = [ + { label: 'help workflow', description: 'Normal bootstrap, exploration, and validation loop' }, + { label: 'help debugging', description: 
'Logs, network, alerts, diagnostics, and traces' }, + { label: 'help react-devtools', description: 'React Native component tree and render profiling' }, + { label: 'help remote', description: 'Remote config, tenants, leases, and companion tunnels' }, + { label: 'help macos', description: 'Desktop, frontmost-app, and menu bar surfaces' }, + { label: 'help dogfood', description: 'Exploratory QA report workflow' }, ] as const; const CONFIGURATION_LINES = [ @@ -194,6 +197,258 @@ const EXAMPLE_LINES = [ 'agent-device test ./suite --platform android', ] as const; +const HELP_TOPICS = { + workflow: { + summary: 'Normal agent-device bootstrap, exploration, and validation loop', + body: `agent-device help workflow + +Version-matched operating guide for normal agent-device work. + +Core loop: + devices/apps -> open -> snapshot or snapshot -i -> get/is/find/wait or press/fill/scroll/back -> verify -> close + +Command shape: + Final plans should use agent-device, not node bin/agent-device.mjs, pnpm ad, raw platform tools, or helper prose. + Put subcommand first, then positionals, then flags: + agent-device open com.example.app --session checkout --platform android --relaunch + agent-device record start ./checkout.mp4 --session checkout + Unknown current ref placeholder: @ref. Use provided labels/ids/selectors when known. Never invent @e#. + After snapshot -i, use @ref in plans when the exact @e number is unknown. + Close means agent-device close. App-owned back means back; system back means back --system. + Taps are press or click. Gestures are direct commands: swipe, longpress, pinch. 
+ +Bootstrap: + agent-device devices --platform ios + agent-device apps --platform android + agent-device open MyApp --platform ios --device "iPhone 17 Pro" + agent-device open --session checkout --platform android + agent-device install com.example.app ./dist/app.apk --platform android + agent-device reinstall com.example.app ./build/MyApp.app --platform ios + If app id is unknown, plan devices, apps, then open . Install arguments are app/package id then artifact path. Fresh install state: open with --relaunch. + +Snapshots and refs: + snapshot reads visible state. snapshot -i gets current interactive refs. + Re-snapshot after navigation, submit, modal/list/reload/dynamic changes. + Off-screen summaries are scroll hints; use scroll, not swipe, then snapshot -i. + +Selectors: + Use selectors as positional targets: id="field-email" or label="Allow". + Do not use CSS selectors, pseudo refs, --selector, --text, or raw x/y when refs/selectors exist. + agent-device fill 'id="catalog-search"' "tart" --delay-ms 80 + agent-device press 'id="submit-order"' + agent-device is visible 'label="Online"' + agent-device get text 'id="quantity-value"' + +Text entry: + fill replaces; type appends to focused field. + agent-device fill @e5 "qa@example.com" + agent-device fill 'id="field-email"' "qa@example.com" + agent-device press 'id="product-note"' + agent-device type "Handle with care" --delay-ms 80 + Debounced field with no result selector: agent-device wait 1000. Keyboard read-only: keyboard status/get. Blocked control: keyboard dismiss. + +Read-only and waits: + agent-device snapshot + agent-device get text 'id="product-title"' + agent-device is visible 'label="Online"' + agent-device wait visible 'label="Refreshing metrics..."' 3000 + agent-device find "Increment" press --json + +Navigation and gestures: + Use scroll for lists; swipe for coordinate gestures/carousels. + Keep count/pause/pattern on one swipe; flags are --count, --pause-ms, --pattern ping-pong. 
+ longpress duration and pinch scale/center are positional: + agent-device longpress 300 500 800 + agent-device swipe 320 500 40 500 --count 8 --pause-ms 30 --pattern ping-pong + agent-device pinch 0.5 200 400 + +Validation and evidence: + Nearby mutation diff: agent-device diff snapshot -i. + Expected text/selector verification should mention it via wait, is, get, or find; avoid bare screenshots/snapshots for expected text. + If task says snapshot, use snapshot. If it asks visual evidence, use screenshot. + Icon/tappable visual proof: screenshot --overlay-refs. Flag is --overlay-refs. + Startup/CPU/memory: perf --json or metrics. Replay maintenance: replay -u ./flow.ad. + Recording: record start/stop. Tracing: trace start ./trace.log, trace stop ./trace.log. Paths are positional. + Stable known flow: batch ./steps.json, not workflow batch. + Android animations: settings animations off/on, not animations disable/restore. + Network headers: network dump --include headers. + Remote config: connect --remote-config ./remote-config.json, open, snapshot, disconnect. + macOS menu bar: open ... --platform macos --surface menubar; snapshot -i --platform macos --surface menubar. + +React Native dev loop: + JS-only change with Metro connected: + agent-device metro reload + agent-device find "Home" + Do not use agent-device reload. Use open --relaunch for native startup reset. + +React DevTools minimum loop: + react-devtools status -> react-devtools wait --connected -> react-devtools profile start -> interact -> react-devtools profile stop -> react-devtools profile slow -> react-devtools profile rerenders. 
+ +Escalate: + help debugging logs, network, alerts, traces, flaky runtime failures + help react-devtools props, state, hooks, component tree, slow renders, rerenders + help remote remote-config, tenant, lease, remote Android companion tunnel + help macos desktop, frontmost-app, menu bar surfaces + help dogfood exploratory QA report workflow`, + }, + debugging: { + summary: 'Targeted failure evidence without dumping stale context', + body: `agent-device help debugging + +Use this when behavior fails, hangs, times out, throws alerts, or needs runtime evidence. + +Logs: + Keep log windows small. Prefer clear, mark, reproduce, then path. + agent-device logs clear --restart + agent-device logs mark "before diagnostics retry" + agent-device press 'id="load-diagnostics"' + agent-device logs path + Do not cat a full stale log into agent context. Open or grep only the relevant window when needed. + +Network: + Use network dump for recent session HTTP traffic parsed from app logs. + agent-device network dump --include headers + agent-device network dump 20 --include all + Use this instead of logs path when the question is request/response metadata. + network log is a supported alias, but network dump --include headers is the clearest plan form. + +Alerts: + Native alerts: + agent-device alert wait 3000 + agent-device alert accept + agent-device alert dismiss + If alert accept says no alert but a permission sheet is visibly on screen, treat it as normal UI: + agent-device snapshot -i + agent-device press 'label="Allow"' + +Diagnostics and traces: + Use --debug for CLI/daemon diagnostic ids and log paths. + Use trace for low-level session diagnostics around one repro: + agent-device trace start ./traces/diagnostics.trace + agent-device press 'id="load-diagnostics"' + agent-device trace stop ./traces/diagnostics.trace + The trace path is positional. Do not use --path for trace start or trace stop. 
+ +Stabilizers: + Android animation-sensitive flows: + agent-device settings animations off + agent-device snapshot + agent-device settings animations on + Re-enable settings you changed before finishing. + +React Native internals: + If the question is about props, state, hooks, render causes, slow components, or rerenders, use help react-devtools instead of inferring from screenshots or logs.`, + }, + 'react-devtools': { + summary: 'React Native component tree and profiling workflow', + body: `agent-device help react-devtools + +Use this for React Native internals that the accessibility tree cannot expose: components, props, state, hooks, ownership, slow renders, and rerenders. + +Core commands: + agent-device react-devtools status + agent-device react-devtools wait --connected + agent-device react-devtools get tree --depth 3 + agent-device react-devtools find + agent-device react-devtools get component @c5 + agent-device react-devtools profile start + agent-device react-devtools profile stop + agent-device react-devtools profile slow --limit 5 + agent-device react-devtools profile rerenders --limit 5 + +Profiling loop: + 1. Verify the app is connected: react-devtools status, then wait --connected if needed. + 2. Start profiling immediately before the interaction. + 3. Drive the interaction with normal agent-device commands. + 4. Stop profiling. + 5. Inspect slow components and rerenders. + +Example: + agent-device react-devtools status + agent-device react-devtools wait --connected + agent-device react-devtools profile start + agent-device fill 'id="catalog-search"' "tart" --delay-ms 80 + agent-device react-devtools profile stop + agent-device react-devtools profile slow --limit 5 + agent-device react-devtools profile rerenders --limit 5 + +Use snapshot, screenshot, logs, network, and perf for device/app runtime evidence. 
Use react-devtools only when component internals or React rendering behavior matters.`, + }, + remote: { + summary: 'Remote config, tenant, lease, and remote host flow', + body: `agent-device help remote + +Use remote config when a profile owns daemon URL, auth, tenant, run, lease, device scope, and Metro hints. Do not restate those as individual flags unless overriding intentionally. + +Normal flow: + agent-device connect --remote-config ./remote-config.json + agent-device open com.example.app + agent-device snapshot + agent-device disconnect + +Rules: + connect and disconnect are top-level commands. Do not write agent-device remote connect or agent-device remote disconnect. + Prefer --remote-config over --daemon-base-url, --tenant, --run-id, and --lease-id in ordinary remote flows. + After connect, let the active remote connection supply runtime hints. + For remote Android React DevTools, run agent-device react-devtools normally. The CLI opens the companion tunnel for the local DevTools daemon and cleans it up when the command exits. + Use --debug when remote connection or transport errors need diagnostic ids and remote log hints.`, + }, + macos: { + summary: 'macOS desktop, frontmost-app, and menu bar surfaces', + body: `agent-device help macos + +Use macOS only when the task targets desktop apps, desktop surfaces, or menu bar extras. + +Open and inspect: + agent-device open TextEdit --platform macos + agent-device snapshot -i --platform macos + +Surfaces: + --surface app normal app session + --surface frontmost-app inspect whichever app is frontmost + --surface desktop desktop-wide surface + --surface menubar menu bar extras and menu bar-only apps + +Menu bar app example: + agent-device open "Agent Device Tester Menu" --platform macos --surface menubar + agent-device snapshot -i --platform macos --surface menubar + +Rules: + Use open and snapshot -i for menu bar inspection. Do not output inspect as a command. 
+ Do not let iOS simulator-set scoping hide macOS desktop targets. + Prefer refs/selectors over raw coordinates. + macOS snapshot rects are window-space; use current refs or overlay refs instead of guessing coordinates.`, + }, + dogfood: { + summary: 'Exploratory QA workflow with reproducible evidence', + body: `agent-device help dogfood + +Use this when asked to dogfood, exploratory test, bug hunt, QA, or find issues in an app. + +Loop: + 1. Open a named session for the target app and platform. + 2. Capture initial snapshot -i and screenshot. + 3. Map top-level navigation. + 4. Explore major flows, edge states, loading, errors, offline, permissions, and settings. + 5. For each issue, stop and capture evidence before continuing. + 6. Close the session and summarize findings. + +Evidence commands: + agent-device --session qa open --platform ios + agent-device --session qa snapshot -i + agent-device --session qa screenshot ./dogfood-output/screenshots/initial.png + agent-device --session qa record start ./dogfood-output/videos/issue-001.mp4 + agent-device --session qa record stop + agent-device --session qa close + +Rules: + Never read app source to invent findings. + Re-snapshot after each mutation. + Prefer refs for exploration and selectors for deterministic replay. + Use logs, network, screenshot --overlay-refs, trace, perf, or react-devtools only when they add evidence to a specific issue.`, + }, +} as const satisfies Record; + const FLAG_DEFINITIONS: readonly FlagDefinition[] = [ { key: 'config', @@ -1565,10 +1820,7 @@ CLI to control iOS and Android devices for AI agents. 
const helpFlags = listHelpFlags(GLOBAL_FLAG_KEYS); const flagsSection = renderFlagSection('Flags:', helpFlags); - const skillsSection = [ - renderAlignedSection('Agent Skills:', AGENT_SKILLS), - 'See `skills//SKILL.md` in the installed package.', - ].join('\n\n'); + const workflowsSection = renderAlignedSection('Agent Workflows:', AGENT_WORKFLOWS); const configSection = renderTextSection('Configuration:', CONFIGURATION_LINES); const environmentSection = renderAlignedSection('Environment:', ENVIRONMENT_LINES); const examplesSection = renderTextSection('Examples:', EXAMPLE_LINES); @@ -1578,7 +1830,7 @@ ${commandLines} ${flagsSection} -${skillsSection} +${workflowsSection} ${configSection} @@ -1648,6 +1900,8 @@ function renderCommandSection( } export function buildCommandUsageText(commandName: string): string | null { + const topicHelp = buildHelpTopicUsageText(commandName); + if (topicHelp) return topicHelp; const schema = getCommandSchema(commandName); if (!schema) return null; const usage = buildCommandUsage(commandName, schema); @@ -1669,3 +1923,15 @@ Usage: ${sections.join('\n\n')} `; } + +function buildHelpTopicUsageText(topicName: string): string | null { + const topic = HELP_TOPICS[topicName as keyof typeof HELP_TOPICS]; + if (!topic) return null; + return `${topic.body} + +Related: + agent-device help command list and global flags + agent-device help command-specific flags + agent-device help workflow normal app automation loop +`; +} diff --git a/test/skillgym/README.md b/test/skillgym/README.md index 39ffe4b0f..4c669d9b9 100644 --- a/test/skillgym/README.md +++ b/test/skillgym/README.md @@ -44,7 +44,7 @@ Skill-guidance regression cases cover distinct command-planning habits: `assertAgentDeviceEvidence` is intentionally soft when a runner does not expose skill-detection telemetry. 
When telemetry exists, the suite asserts that `agent-device` was loaded; when it is absent, the cases still judge command-planning output instead of failing on missing runner metadata. -The `codex-main` baseline is a benchmark signal, not a required all-green gate. Its failures should map to command-planning regressions called out by individual case IDs; do not treat the historical pass/fail count as a fixed threshold. +The `codex-mini` baseline is a benchmark signal, not a required all-green gate. Its failures should map to command-planning regressions called out by individual case IDs; do not treat the historical pass/fail count as a fixed threshold. ## Suggested workflow @@ -62,10 +62,11 @@ pnpm install pnpm test:skillgym ``` -If you want to run `skillgym` directly instead of using the convenience script: +If you want to run `skillgym` directly instead of using the convenience script, build the local CLI first so agents can call `node bin/agent-device.mjs help workflow`: ```bash cd /absolute/path/to/agent-device +pnpm build pnpm exec skillgym run \ ./test/skillgym/suites/agent-device-smoke-suite.ts \ --config ./test/skillgym/skillgym.config.ts diff --git a/test/skillgym/skillgym.config.ts b/test/skillgym/skillgym.config.ts index c9f46d341..2736d87f2 100644 --- a/test/skillgym/skillgym.config.ts +++ b/test/skillgym/skillgym.config.ts @@ -12,7 +12,7 @@ const config: SkillGymConfig = { timeoutMs: 120_000, }, runners: { - 'codex-main': { + 'codex-mini': { agent: { type: 'codex', model: 'gpt-5.4-mini', diff --git a/test/skillgym/suites/agent-device-smoke-suite.ts b/test/skillgym/suites/agent-device-smoke-suite.ts index 7173d6e19..a4421eb15 100644 --- a/test/skillgym/suites/agent-device-smoke-suite.ts +++ b/test/skillgym/suites/agent-device-smoke-suite.ts @@ -12,9 +12,21 @@ You are benchmarking agent-device command planning for a known fixture app. Do not read project source files or project docs. Do not inspect examples/test-app, src/, README.md, or website/docs. 
-Use only the app contract provided in this prompt and your existing agent-device knowledge. -If you need command syntax, rely on known agent-device usage patterns instead of reading repository code. -Output only the requested commands, one per line, with no explanation. +Do not browse the web. +Use only this prompt plus local CLI help: run node bin/agent-device.mjs help workflow once as private reference. +Final output: only agent-device commands, one per line. Any prose or Markdown fails. +Follow the task wording exactly: snapshot != screenshot; record != trace; batch is direct; Metro reload is metro reload. +Screenshot evidence uses screenshot --overlay-refs when ref overlays are requested. +Use current command shapes from help: @ref placeholder after snapshot -i, id="..."/label="..." selectors, fill replaces, type appends, press/click not tap. +Use provided labels, ids, and selectors when known; reserve @ref for unknown current refs. +If the task says by @ref, output press @ref or click @ref. +App-owned navigation/back means the back command, not clicking a tab. +Use direct gesture forms: swipe 320 500 40 500 --count 8 --pause-ms 30 --pattern ping-pong; longpress 300 500 800; pinch 0.5 200 400. +Off-screen snapshot hints use scroll down/up then snapshot -i, not swipe or coordinate scroll. +If discovery is needed, include devices, apps, and open ; if debounced wait has no result selector, use wait 1000. +Use precise helpers when relevant: diff snapshot -i; logs clear --restart -> logs mark -> reproduce -> logs path; network dump --include headers; connect --remote-config -> open -> snapshot -> disconnect; settings animations off/on; macOS menubar uses --platform macos --surface menubar. +React DevTools loop keeps the react-devtools prefix on every profile command: status -> wait --connected -> profile start -> interact -> profile stop -> profile slow -> profile rerenders. 
+When expected text is provided, verify it with wait/is/get/find instead of a bare screenshot. `.trim(); function buildPrompt(options: { contract: string[]; task: string }) { @@ -40,34 +52,58 @@ function assertNoProjectSourceReads(report: SessionReport) { function commandPattern(command: string) { // The suite asks agents for one command per line, so command-name assertions stay line anchored. - return new RegExp(`(?:^|\\n)(?:agent-device\\s+)?${command}(?:\\s|$)`, 'i'); + return new RegExp( + `(?:^|\\n)(?:agent-device(?:\\s+--[^\\s]+(?:\\s+(?!-)[^\\s]+)?)?\\s+)?${command}(?:\\s|$)`, + 'i', + ); } function commandAlternativesPattern(commands: string[]) { const alternatives = commands.join('|'); - return new RegExp(`(?:^|\\n)(?:agent-device\\s+)?(?:${alternatives})(?:\\s|$)`, 'i'); + return new RegExp( + `(?:^|\\n)(?:agent-device(?:\\s+--[^\\s]+(?:\\s+(?!-)[^\\s]+)?)?\\s+)?(?:${alternatives})(?:\\s|$)`, + 'i', + ); } function assertOutputs(report: SessionReport, matchers: Array) { + const output = normalizedFinalOutput(report); for (const matcher of matchers) { - assert.output.includes(report, matcher); + if (typeof matcher === 'string') { + assert.ok( + output.includes(matcher), + `Expected final output to include ${JSON.stringify(matcher)}. Observed final output: ${report.finalOutput}`, + ); + continue; + } + + assert.match(output, matcher); } } function assertNoOutputs(report: SessionReport, matchers: Array) { + const output = normalizedFinalOutput(report); for (const matcher of matchers) { if (typeof matcher === 'string') { assert.ok( - !report.finalOutput.includes(matcher), + !output.includes(matcher), `Expected final output not to include ${JSON.stringify(matcher)}. 
Observed final output: ${report.finalOutput}`, ); continue; } - assert.doesNotMatch(report.finalOutput, matcher); + assert.doesNotMatch(output, matcher); } } +function normalizedFinalOutput(report: SessionReport): string { + return report.finalOutput + .replace(/```[a-z]*\n?/gi, '') + .replace(/```/g, '') + .replace(/`([^`\n]+)`/g, '$1') + .trim(); +} + function assertExpectedOutput(report: SessionReport, matchers: Array = []) { if (matchers.length === 0) { assert.output.notEmpty(report); @@ -117,7 +153,11 @@ const FIXTURE_SMOKE_CASES: TestCase[] = [ 'visible text: Release notice', ], task: 'Assume Agent Device Tester is already open on the Home tab. Plan the commands to dismiss the Release notice using the dismiss-notice testID, verify it is gone with diff snapshot -i, then close.', - outputs: [/dismiss-notice/i, /diff snapshot -i/i, commandPattern('close')], + outputs: [ + /dismiss-notice/i, + /(?:diff snapshot -i|snapshot\b.*(?:-i\b.*--diff|--diff\b.*-i\b))/i, + commandPattern('close'), + ], }), makeCase({ id: 'home-confirm-alert', @@ -172,7 +212,7 @@ const FIXTURE_SMOKE_CASES: TestCase[] = [ 'visible product after filtering: Berry Tart', ], task: 'Assume Agent Device Tester is on the Catalog tab. Plan the commands to select the Bakery category and verify Berry Tart is visible.', - outputs: [/category-bakery/i, /Berry Tart/i], + outputs: [/(?:category-bakery|Bakery)/i, /Berry Tart/i], }), makeCase({ id: 'catalog-favorite-toggle', @@ -368,7 +408,7 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ 'No interaction is needed to answer this task', ], task: 'Plan the minimal read-only command to verify whether the Online badge is visible. 
Do not request interactive refs or mutate the UI.', - outputs: [/(?:^|\n)(?:agent-device\s+)?(?:snapshot|is)(?:\s|$)/i, /Online/i], + outputs: [/(?:^|\n)(?:agent-device\s+)?(?:snapshot|is|find)(?:\s|$)/i, /Online/i], forbiddenOutputs: [ /snapshot -i/i, commandPattern('click'), @@ -463,8 +503,9 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ 'Target app display name is known: Agent Device Tester', 'Package id is unknown', 'No app session is open yet', + 'Session name: discovery', ], - task: 'Plan the bootstrap commands to discover the correct Android device and app identifier before opening the app in a named session.', + task: 'Plan the bootstrap commands to discover the correct Android device and app identifier before opening the app in the named session.', outputs: [ commandPattern('devices'), commandPattern('apps'), @@ -498,8 +539,11 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ 'Only JavaScript changed', ], task: 'Plan the commands to reload the running app after the JS change, then verify the Home screen is visible.', - outputs: [/(?:^|\n)(?:agent-device\s+)?metro\s+reload(?:\s|$)/i, commandPattern('snapshot')], - forbiddenOutputs: [/open\b.*--relaunch/i], + outputs: [ + /(?:^|\n)(?:agent-device\s+)?metro\s+reload(?:\s|$)/i, + /(?:^|\n)(?:agent-device\s+)?(?:snapshot\b|find\b[^\n]*Home|is\b[^\n]*Home|wait\b[^\n]*Home)/i, + ], + forbiddenOutputs: [/open\b.*--relaunch/i, /(?:^|\n)(?:agent-device\s+)?screenshot\b/i], }), makeCase({ id: 'debug-logs-short-window', @@ -519,10 +563,16 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ 'App name: Agent Device Tester', 'Current screen: Settings tab', 'Diagnostics load triggers HTTP traffic logged by the app', + 'Repro button selector: id="load-diagnostics"', 'Need request and response headers', ], task: 'Plan the commands to reproduce the diagnostics request and inspect recent session network traffic with headers.', - outputs: [commandPattern('network'), /dump/i, /--include headers/i], + outputs: [ + /load-diagnostics/i, + 
commandPattern('network'), + /(?:dump|log)/i, + /(?:--include\s+headers|\bheaders\b)/i, + ], forbiddenOutputs: [/logs path/i, /cat .*log/i], }), makeCase({ @@ -554,6 +604,7 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ 'App name: Agent Device Tester', 'React Native DevTools can connect to the running app', 'Interaction to profile: type in the Catalog search field', + 'Search field selector: id="catalog-search"', 'Need slow components and rerender counts', ], task: 'Plan the commands to verify React DevTools is connected, profile the Catalog search interaction, then list slow components and rerenders.', @@ -576,7 +627,7 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ 'Need to advance and return across pages repeatedly', 'Gesture should use a swipe series, not scroll', ], - task: 'Plan the gesture command to swipe horizontally across the carousel eight times with a short pause and ping-pong pattern.', + task: 'Plan the gesture command to swipe horizontally across the carousel eight times with a 30ms pause and ping-pong pattern.', outputs: [ commandPattern('swipe'), /--count\s+8/i, @@ -595,7 +646,7 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ ], task: 'Plan the gesture command to long-press the target center for 800ms.', outputs: [commandPattern('longpress'), /300\s+500\s+800/i], - forbiddenOutputs: [/--hold-ms/i, commandPattern('click')], + forbiddenOutputs: [/--duration-ms/i, /--hold-ms/i, commandPattern('click')], }), makeCase({ id: 'gesture-pinch-zoom', @@ -604,10 +655,17 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ 'Current screen: image preview', 'Pinch is supported on Apple simulators', 'Need to zoom out around x=200 y=400', + 'Zoom-out scale: 0.5', ], task: 'Plan the gesture command to pinch zoom out at the specified center.', outputs: [commandPattern('pinch'), /0\.5/i, /200\s+400/i], - forbiddenOutputs: [commandPattern('scroll'), commandPattern('swipe')], + forbiddenOutputs: [ + /--scale/i, + /--x/i, + /--y/i, + commandPattern('scroll'), + 
commandPattern('swipe'), + ], }), makeCase({ id: 'settings-animation-stabilizer', @@ -619,13 +677,19 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ ], task: 'Plan the commands to disable platform animations before the app check, run a snapshot, then restore animations.', outputs: [/settings animations off/i, commandPattern('snapshot'), /settings animations on/i], - forbiddenOutputs: [/--platform macos/i, /settings appearance/i], + forbiddenOutputs: [ + /--platform macos/i, + /settings appearance/i, + /animations disable/i, + /animations restore/i, + ], }), makeCase({ id: 'trace-capture-session', contract: [ 'App name: Agent Device Tester', 'An app session is already open', + 'Repro button selector: id="load-diagnostics"', 'Need low-level session diagnostics for one diagnostics-button repro', 'Trace artifact path: ./traces/diagnostics.trace', ], @@ -635,7 +699,11 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ /load-diagnostics/i, /trace stop \.\/traces\/diagnostics\.trace/i, ], - forbiddenOutputs: [commandPattern('record'), /logs clear --restart/i], + forbiddenOutputs: [ + commandPattern('record'), + /logs clear --restart/i, + /trace (?:start|stop) --path/i, + ], }), makeCase({ id: 'alert-visible-ui-fallback', @@ -647,7 +715,7 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ ], task: 'Plan the fallback commands to handle the visible sheet as normal tappable UI instead of looping on alert accept.', outputs: [ - /(?:^|\n)(?:agent-device\s+)?(?:find\b.*\bpress\b|press\b.*Allow|snapshot -i)/is, + /(?:^|\n)(?:agent-device\s+)?(?:find\b.*\bpress\b|(?:press|click)\b.*Allow|snapshot -i)/is, /Allow/i, ], forbiddenOutputs: [/alert accept.*\n.*alert accept/is, RAW_COORDINATE_TARGET], @@ -678,7 +746,12 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ commandPattern('snapshot'), commandPattern('disconnect'), ], - forbiddenOutputs: [/--session\s+\w+/i, /--daemon-base-url/i, /--tenant/i, /--run-id/i], + forbiddenOutputs: [ + /--daemon-base-url/i, + /--tenant/i, + /--run-id/i, + 
commandPattern('screenshot'), + ], }), makeCase({ id: 'macos-menubar-surface', @@ -688,8 +761,8 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ 'The app lives entirely as a menu bar extra', 'Normal app snapshots can be sparse or empty', ], - task: 'Plan the commands to inspect the menu bar app surface and capture interactive refs.', - outputs: [/--platform macos/i, /--surface menubar/i, /snapshot -i/i], + task: 'Plan the commands to inspect the menu bar app surface and capture interactive refs with snapshot -i.', + outputs: [/--platform macos/i, /--surface menubar/i, /snapshot\b.*(?:-i\b|\s-i\b)/i], forbiddenOutputs: [/--surface app/i, /snapshot --raw/i], }), makeCase({ @@ -708,15 +781,16 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ contract: [ 'App name: Agent Device Tester', 'The full checkout flow is already known and stable', + 'Known batch steps file: ./checkout-steps.json', 'Need fewer round trips while recording evidence', ], - task: 'Plan the commands to start a recording, execute the known checkout steps as one batch, and stop the recording.', + task: 'Plan the commands to start a recording, execute the known checkout steps from the provided steps file as one batch, and stop the recording.', outputs: [ /(?:^|\n)(?:agent-device\s+)?record\s+start/i, commandPattern('batch'), /(?:^|\n)(?:agent-device\s+)?record\s+stop/i, ], - forbiddenOutputs: [PSEUDO_ASSERTION_COMMAND], + forbiddenOutputs: [PSEUDO_ASSERTION_COMMAND, /workflow batch/i, commandPattern('trace')], }), ]; diff --git a/website/docs/docs/commands.md b/website/docs/docs/commands.md index 634c9c38b..5cf69754c 100644 --- a/website/docs/docs/commands.md +++ b/website/docs/docs/commands.md @@ -8,6 +8,17 @@ This page summarizes the primary command groups. For persistent defaults and project-scoped CLI settings, see [Configuration](/docs/configuration). 
+For agent workflow guidance that is matched to the installed CLI, run: + +```bash +agent-device help workflow +agent-device help debugging +agent-device help react-devtools +agent-device help remote +agent-device help macos +agent-device help dogfood +``` + ## Navigation ```bash diff --git a/website/docs/docs/introduction.md b/website/docs/docs/introduction.md index b54f610f5..08a1d78ba 100644 --- a/website/docs/docs/introduction.md +++ b/website/docs/docs/introduction.md @@ -13,8 +13,7 @@ title: Introduction - Performance snapshots with `perf`/`metrics`, including CPU and memory data where supported If you know `agent-browser`, this is the mobile-native counterpart for iOS/Android UI automation and app-level observability. -For exploratory QA and bug-hunting workflows, see `skills/dogfood/SKILL.md` in this repository. -For React Native component trees, props/state/hooks, and render profiling, use `agent-device react-devtools`, which dynamically runs pinned `agent-react-devtools` commands. +For agent-oriented operating guidance, start with `agent-device help workflow`. For exploratory QA, use `agent-device help dogfood`. For React Native component trees, props/state/hooks, and render profiling, use `agent-device help react-devtools` and the `agent-device react-devtools` passthrough. ## What it’s good at diff --git a/website/docs/docs/skillgym.md b/website/docs/docs/skillgym.md index 1d6ab109e..7c8f1b413 100644 --- a/website/docs/docs/skillgym.md +++ b/website/docs/docs/skillgym.md @@ -1,6 +1,6 @@ # Skillgym -`agent-device` works well with [`skillgym`](https://github.com/callstackincubator/skillgym) when you want to benchmark skill routing and workflow quality before paying the cost of full live-device runs. +`agent-device` works well with [`skillgym`](https://github.com/callstackincubator/skillgym) when you want to benchmark help-guided command planning and workflow quality before paying the cost of full live-device runs. 
## What `skillgym` gives us @@ -10,9 +10,9 @@ For `agent-device`, that makes it a strong fit for: -- verifying that the `agent-device` skill is selected for simulator and device tasks -- verifying that the skill loads its mandatory references before normal interactions -- checking that planning guidance mentions the right `agent-device` loop for a known fixture app +- verifying that agents use version-matched `agent-device help workflow` guidance instead of stale priors +- checking that planning guidance produces valid `agent-device` command shapes for a known fixture app +- keeping optional skill telemetry visible without making it a hard dependency for every runner ## Included starter @@ -24,8 +24,8 @@ This repo now includes a starter setup under `test/skillgym` plus a fixture app ## Recommended rollout -1. Start with skill-routing suites that assert `agent-device` is loaded in the right prompts. -2. Add fixture-aware planning suites against `Agent Device Tester` to keep prompts concrete. +1. Start with fixture-aware planning suites against `Agent Device Tester` to keep prompts concrete. +2. Add targeted cases when new help guidance or command surfaces are introduced. 3. Add local-only cases that expect real `agent-device` command usage when a simulator or device is available. 
## Fixture app coverage @@ -73,9 +73,10 @@ pnpm install pnpm test:skillgym ``` -Equivalent direct command: +Equivalent direct command after building the CLI: ```bash +pnpm build pnpm exec skillgym run \ ./test/skillgym/suites/agent-device-smoke-suite.ts \ --config ./test/skillgym/skillgym.config.ts From 32dc12ae0e816593e4b2daa272a745b918e2ce85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Mon, 27 Apr 2026 11:07:35 -0400 Subject: [PATCH 02/22] test: address skill help review feedback --- src/__tests__/cli-help.test.ts | 2 +- src/utils/__tests__/args.test.ts | 10 ++++++++++ src/utils/command-schema.ts | 5 +++-- test/skillgym/suites/agent-device-smoke-suite.ts | 15 ++++++++------- 4 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/__tests__/cli-help.test.ts b/src/__tests__/cli-help.test.ts index 337a70c26..cf9b38eb0 100644 --- a/src/__tests__/cli-help.test.ts +++ b/src/__tests__/cli-help.test.ts @@ -43,7 +43,7 @@ test('connect help documents cloud auth environment origins', async () => { assert.match(result.stdout, /AGENT_DEVICE_DAEMON_AUTH_TOKEN/); }); -test('help react-devtools prints passthrough command help and skips daemon dispatch', async () => { +test('help react-devtools prints agent workflow topic and skips daemon dispatch', async () => { const result = await runCliCapture(['help', 'react-devtools']); assert.equal(result.code, 0); assert.equal(result.calls.length, 0); diff --git a/src/utils/__tests__/args.test.ts b/src/utils/__tests__/args.test.ts index c447ddac7..2864a070f 100644 --- a/src/utils/__tests__/args.test.ts +++ b/src/utils/__tests__/args.test.ts @@ -820,6 +820,16 @@ test('usageForCommand resolves workflow help topic', () => { assert.match(help, /help react-devtools/); }); +test('workflow help keeps common copyable command forms', () => { + const help = usageForCommand('workflow'); + if (help === null) throw new Error('Expected workflow help text'); + assert.match(help, /network dump --include 
headers/); + assert.match(help, /settings animations off/); + assert.match(help, /connect --remote-config/); + assert.match(help, /metro reload/); + assert.match(help, /screenshot --overlay-refs/); +}); + test('apps defaults to --all filter and allows overrides', () => { const defaultFilter = parseArgs(['apps'], { strictFlags: true }); assert.equal(defaultFilter.command, 'apps'); diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index d69275e86..429b51416 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -208,7 +208,7 @@ Core loop: devices/apps -> open -> snapshot or snapshot -i -> get/is/find/wait or press/fill/scroll/back -> verify -> close Command shape: - Final plans should use agent-device, not node bin/agent-device.mjs, pnpm ad, raw platform tools, or helper prose. + Reading help through node bin/agent-device.mjs help in local tests is fine; final command plans should use agent-device, not node bin/agent-device.mjs, pnpm ad, raw platform tools, or helper prose. Put subcommand first, then positionals, then flags: agent-device open com.example.app --session checkout --platform android --relaunch agent-device record start ./checkout.mp4 --session checkout @@ -264,7 +264,8 @@ Navigation and gestures: Validation and evidence: Nearby mutation diff: agent-device diff snapshot -i. - Expected text/selector verification should mention it via wait, is, get, or find; avoid bare screenshots/snapshots for expected text. + Expected text/selector verification must include the exact text or selector via wait, is, get, or find; bare screenshots/snapshots are insufficient for named expectations. + Prefer provided testIDs/ids/selectors for verification; use visible text when no durable selector is provided. If task says snapshot, use snapshot. If it asks visual evidence, use screenshot. Icon/tappable visual proof: screenshot --overlay-refs. Flag is --overlay-refs. Startup/CPU/memory: perf --json or metrics. 
Replay maintenance: replay -u ./flow.ad. diff --git a/test/skillgym/suites/agent-device-smoke-suite.ts b/test/skillgym/suites/agent-device-smoke-suite.ts index a4421eb15..6bbe7aea4 100644 --- a/test/skillgym/suites/agent-device-smoke-suite.ts +++ b/test/skillgym/suites/agent-device-smoke-suite.ts @@ -19,6 +19,7 @@ Follow the task wording exactly: snapshot != screenshot; record != trace; batch Screenshot evidence uses screenshot --overlay-refs when ref overlays are requested. Use current command shapes from help: @ref placeholder after snapshot -i, id="..."/label="..." selectors, fill replaces, type appends, press/click not tap. Use provided labels, ids, and selectors when known; reserve @ref for unknown current refs. +For verification, prefer provided testIDs/selectors over inferred visible text. If the task says by @ref, output press @ref or click @ref. App-owned navigation/back means the back command, not clicking a tab. Use direct gesture forms: swipe 320 500 40 500 --count 8 --pause-ms 30 --pattern ping-pong; longpress 300 500 800; pinch 0.5 200 400. @@ -26,7 +27,7 @@ Off-screen snapshot hints use scroll down/up then snapshot -i, not swipe or coor If discovery is needed, include devices, apps, and open ; if debounced wait has no result selector, use wait 1000. Use precise helpers when relevant: diff snapshot -i; logs clear --restart -> logs mark -> reproduce -> logs path; network dump --include headers; connect --remote-config -> open -> snapshot -> disconnect; settings animations off/on; macOS menubar uses --platform macos --surface menubar. React DevTools loop keeps the react-devtools prefix on every profile command: status -> wait --connected -> profile start -> interact -> profile stop -> profile slow -> profile rerenders. -When expected text is provided, verify it with wait/is/get/find instead of a bare screenshot. +When expected text is provided, include that exact text in a wait/is/get/find command; a bare snapshot or screenshot fails. 
`.trim(); function buildPrompt(options: { contract: string[]; task: string }) { @@ -53,7 +54,7 @@ function assertNoProjectSourceReads(report: SessionReport) { function commandPattern(command: string) { // The suite asks agents for one command per line, so command-name assertions stay line anchored. return new RegExp( - `(?:^|\\n)(?:agent-device(?:\\s+--[^\\s]+(?:\\s+(?!-)[^\\s]+)?)?\\s+)?${command}(?:\\s|$)`, + `(?:^|\\n)(?:agent-device(?:\\s+--[^\\s]+(?:\\s+(?!-)[^\\s]+)?)*\\s+)?${command}(?:\\s|$)`, 'i', ); } @@ -61,7 +62,7 @@ function commandPattern(command: string) { function commandAlternativesPattern(commands: string[]) { const alternatives = commands.join('|'); return new RegExp( - `(?:^|\\n)(?:agent-device(?:\\s+--[^\\s]+(?:\\s+(?!-)[^\\s]+)?)?\\s+)?(?:${alternatives})(?:\\s|$)`, + `(?:^|\\n)(?:agent-device(?:\\s+--[^\\s]+(?:\\s+(?!-)[^\\s]+)?)*\\s+)?(?:${alternatives})(?:\\s|$)`, 'i', ); } @@ -570,7 +571,7 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ outputs: [ /load-diagnostics/i, commandPattern('network'), - /(?:dump|log)/i, + /dump/i, /(?:--include\s+headers|\bheaders\b)/i, ], forbiddenOutputs: [/logs path/i, /cat .*log/i], @@ -660,9 +661,9 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ task: 'Plan the gesture command to pinch zoom out at the specified center.', outputs: [commandPattern('pinch'), /0\.5/i, /200\s+400/i], forbiddenOutputs: [ - /--scale/i, - /--x/i, - /--y/i, + /(?:^|\s)--scale(?!\w)/i, + /(?:^|\s)--x(?!\w)/i, + /(?:^|\s)--y(?!\w)/i, commandPattern('scroll'), commandPattern('swipe'), ], From 3784f99b9486138bb5c7d73fb18225691b2ae8e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Mon, 27 Apr 2026 12:12:00 -0400 Subject: [PATCH 03/22] docs: surface agent workflow in top-level help --- src/utils/__tests__/args.test.ts | 4 ++++ src/utils/command-schema.ts | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/src/utils/__tests__/args.test.ts b/src/utils/__tests__/args.test.ts index 2864a070f..8ae3cd0da 
100644 --- a/src/utils/__tests__/args.test.ts +++ b/src/utils/__tests__/args.test.ts @@ -785,6 +785,10 @@ test('usage includes only global flags in the top-level flags section', () => { test('usage includes agent workflows, config, environment, and examples footers', () => { const usageText = usage(); + assert.match(usageText, /Agent Quickstart:/); + assert.match(usageText, /Default loop: devices\/apps -> open -> snapshot -i/); + assert.match(usageText, /Use selectors or refs as positional targets/); + assert.match(usageText, /Full operating guide: agent-device help workflow/); assert.match(usageText, /Agent Workflows:/); assert.match(usageText, /help workflow\s+Normal bootstrap, exploration, and validation loop/); assert.match(usageText, /help debugging\s+Logs, network, alerts, diagnostics, and traces/); diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index 429b51416..8ecdc641f 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -167,6 +167,14 @@ const AGENT_WORKFLOWS = [ { label: 'help dogfood', description: 'Exploratory QA report workflow' }, ] as const; +const AGENT_QUICKSTART_LINES = [ + 'Default loop: devices/apps -> open -> snapshot -i -> press/fill/get/is/wait/find -> verify -> close.', + 'Use selectors or refs as positional targets: id="submit", label="Allow", or @ref after snapshot -i.', + 'Verification commands must name the expected text/selector; bare screenshots/snapshots are not enough.', + 'Use agent-device commands in final plans; raw platform tools, pseudo commands, and helper prose are wrong.', + 'Full operating guide: agent-device help workflow.', +] as const; + const CONFIGURATION_LINES = [ 'Default config files: ~/.agent-device/config.json, ./agent-device.json', 'Use --config or AGENT_DEVICE_CONFIG to load one explicit config file.', @@ -1821,6 +1829,7 @@ CLI to control iOS and Android devices for AI agents. 
const helpFlags = listHelpFlags(GLOBAL_FLAG_KEYS); const flagsSection = renderFlagSection('Flags:', helpFlags); + const quickstartSection = renderTextSection('Agent Quickstart:', AGENT_QUICKSTART_LINES); const workflowsSection = renderAlignedSection('Agent Workflows:', AGENT_WORKFLOWS); const configSection = renderTextSection('Configuration:', CONFIGURATION_LINES); const environmentSection = renderAlignedSection('Environment:', ENVIRONMENT_LINES); @@ -1831,6 +1840,8 @@ ${commandLines} ${flagsSection} +${quickstartSection} + ${workflowsSection} ${configSection} From 6e432b3d9ed5f3859ecb7124e02572dd165a9112 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Mon, 27 Apr 2026 12:23:16 -0400 Subject: [PATCH 04/22] docs: strengthen dogfood help workflow --- src/utils/__tests__/args.test.ts | 11 ++++++++++ src/utils/command-schema.ts | 37 ++++++++++++++++++++++++-------- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/src/utils/__tests__/args.test.ts b/src/utils/__tests__/args.test.ts index 8ae3cd0da..dc1b2b9b3 100644 --- a/src/utils/__tests__/args.test.ts +++ b/src/utils/__tests__/args.test.ts @@ -789,6 +789,7 @@ test('usage includes agent workflows, config, environment, and examples footers' assert.match(usageText, /Default loop: devices\/apps -> open -> snapshot -i/); assert.match(usageText, /Use selectors or refs as positional targets/); assert.match(usageText, /Full operating guide: agent-device help workflow/); + assert.match(usageText, /Exploratory QA: agent-device help dogfood/); assert.match(usageText, /Agent Workflows:/); assert.match(usageText, /help workflow\s+Normal bootstrap, exploration, and validation loop/); assert.match(usageText, /help debugging\s+Logs, network, alerts, diagnostics, and traces/); @@ -834,6 +835,16 @@ test('workflow help keeps common copyable command forms', () => { assert.match(help, /screenshot --overlay-refs/); }); +test('usageForCommand resolves dogfood help topic', () => { + const help = 
usageForCommand('dogfood'); + if (help === null) throw new Error('Expected dogfood help text'); + assert.match(help, /agent-device help dogfood/); + assert.match(help, /Find user-visible issues from runtime behavior/); + assert.match(help, /dogfood-output\/report\.md/); + assert.match(help, /severity, title, affected flow, repro commands/); + assert.match(help, /screenshot \.\/dogfood-output\/screenshots\/issue-001\.png --overlay-refs/); +}); + test('apps defaults to --all filter and allows overrides', () => { const defaultFilter = parseArgs(['apps'], { strictFlags: true }); assert.equal(defaultFilter.command, 'apps'); diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index 8ecdc641f..b14a443fc 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -172,7 +172,7 @@ const AGENT_QUICKSTART_LINES = [ 'Use selectors or refs as positional targets: id="submit", label="Allow", or @ref after snapshot -i.', 'Verification commands must name the expected text/selector; bare screenshots/snapshots are not enough.', 'Use agent-device commands in final plans; raw platform tools, pseudo commands, and helper prose are wrong.', - 'Full operating guide: agent-device help workflow.', + 'Full operating guide: agent-device help workflow. Exploratory QA: agent-device help dogfood.', ] as const; const CONFIGURATION_LINES = [ @@ -434,27 +434,46 @@ Rules: Use this when asked to dogfood, exploratory test, bug hunt, QA, or find issues in an app. +Goal: + Find user-visible issues from runtime behavior. Do not read app source or invent findings from code. + Produce a concise report with severity, repro commands, expected/actual behavior, and evidence paths. + Loop: - 1. Open a named session for the target app and platform. - 2. Capture initial snapshot -i and screenshot. - 3. Map top-level navigation. - 4. Explore major flows, edge states, loading, errors, offline, permissions, and settings. - 5. 
For each issue, stop and capture evidence before continuing. - 6. Close the session and summarize findings. + 1. Identify target app/platform; ask only if missing. + 2. Create output dirs and open a named session. + 3. Capture baseline snapshot -i and screenshot. + 4. Map top-level navigation, then exercise primary flows and edge states. + 5. For each issue, capture evidence immediately, then continue. + 6. Close the session and write the report. + +Coverage: + Navigation, forms, empty/error/loading states, offline or retry behavior, permissions, settings, accessibility labels, orientation/keyboard, and obvious performance stalls. Evidence commands: + mkdir -p ./dogfood-output/screenshots ./dogfood-output/videos ./dogfood-output/traces agent-device --session qa open --platform ios agent-device --session qa snapshot -i agent-device --session qa screenshot ./dogfood-output/screenshots/initial.png + agent-device --session qa screenshot ./dogfood-output/screenshots/issue-001.png --overlay-refs + agent-device --session qa logs clear --restart + agent-device --session qa logs mark "issue-001 repro" + agent-device --session qa logs path agent-device --session qa record start ./dogfood-output/videos/issue-001.mp4 agent-device --session qa record stop agent-device --session qa close +Report shape: + ./dogfood-output/report.md + For each finding: severity, title, affected flow, repro commands, expected, actual, evidence files, notes. + If no issues are found, report coverage completed and residual risk instead of claiming the app is bug-free. + Rules: - Never read app source to invent findings. + Findings must come from observed runtime behavior, not source reads. Re-snapshot after each mutation. + Keep commands in the report reproducible; use selectors or refs from fresh snapshots, not guessed coordinates. Prefer refs for exploration and selectors for deterministic replay. 
- Use logs, network, screenshot --overlay-refs, trace, perf, or react-devtools only when they add evidence to a specific issue.`, + Use logs, network, screenshot --overlay-refs, trace, perf, or react-devtools only when they add evidence to a specific issue. + Escalate to help debugging or help react-devtools when runtime symptoms require those tools.`, }, } as const satisfies Record; From a50df9d25f93b3dc2e4552171710268b79f9912c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Mon, 27 Apr 2026 12:25:52 -0400 Subject: [PATCH 05/22] docs: improve react devtools performance routing --- skills/react-devtools/SKILL.md | 4 ++-- src/__tests__/cli-help.test.ts | 2 +- src/utils/__tests__/args.test.ts | 5 ++++- src/utils/command-schema.ts | 17 ++++++++++------- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/skills/react-devtools/SKILL.md b/skills/react-devtools/SKILL.md index 4cdaa7557..2cd517722 100644 --- a/skills/react-devtools/SKILL.md +++ b/skills/react-devtools/SKILL.md @@ -1,6 +1,6 @@ --- name: react-devtools -description: Inspect and profile React Native component trees from agent-device. Use when debugging React Native props, state, hooks, render causes, slow components, excessive rerenders, or questions like why a component rerendered. +description: Inspect and profile React Native component trees from agent-device. Use for React Native performance, profiling, props, state, hooks, render causes, slow components, excessive rerenders, or questions like why a component rerendered. --- # react-devtools @@ -11,7 +11,7 @@ Router for React Native internals. Read current CLI guidance: agent-device help react-devtools ``` -Use `agent-device react-devtools ...` for component tree, props, state, hooks, render ownership, slow components, or rerenders. It dynamically runs pinned `agent-react-devtools@0.4.0`. Use normal `agent-device` commands for visible UI, refs, screenshots, logs, network, or perf. 
+Use `agent-device react-devtools ...` for component tree, props, state, hooks, render ownership, performance profiling, slow components, or rerenders. It dynamically runs pinned `agent-react-devtools@0.4.0`. Use normal `agent-device` commands for visible UI, refs, screenshots, logs, network, or device-level perf. Core loop: diff --git a/src/__tests__/cli-help.test.ts b/src/__tests__/cli-help.test.ts index cf9b38eb0..c6a1ea60b 100644 --- a/src/__tests__/cli-help.test.ts +++ b/src/__tests__/cli-help.test.ts @@ -48,7 +48,7 @@ test('help react-devtools prints agent workflow topic and skips daemon dispatch' assert.equal(result.code, 0); assert.equal(result.calls.length, 0); assert.match(result.stdout, /agent-device help react-devtools/); - assert.match(result.stdout, /React Native internals/); + assert.match(result.stdout, /React Native performance\/profiling/); assert.match(result.stdout, /agent-device react-devtools status/); }); diff --git a/src/utils/__tests__/args.test.ts b/src/utils/__tests__/args.test.ts index dc1b2b9b3..ed6817273 100644 --- a/src/utils/__tests__/args.test.ts +++ b/src/utils/__tests__/args.test.ts @@ -793,7 +793,10 @@ test('usage includes agent workflows, config, environment, and examples footers' assert.match(usageText, /Agent Workflows:/); assert.match(usageText, /help workflow\s+Normal bootstrap, exploration, and validation loop/); assert.match(usageText, /help debugging\s+Logs, network, alerts, diagnostics, and traces/); - assert.match(usageText, /help react-devtools\s+React Native component tree and render profiling/); + assert.match( + usageText, + /help react-devtools\s+React Native performance, profiling, component tree, and renders/, + ); assert.match(usageText, /Configuration:/); assert.match( usageText, diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index b14a443fc..97288dfe3 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -161,7 +161,10 @@ const FIND_SNAPSHOT_FLAGS = 
['snapshotDepth', 'snapshotRaw'] as const satisfies const AGENT_WORKFLOWS = [ { label: 'help workflow', description: 'Normal bootstrap, exploration, and validation loop' }, { label: 'help debugging', description: 'Logs, network, alerts, diagnostics, and traces' }, - { label: 'help react-devtools', description: 'React Native component tree and render profiling' }, + { + label: 'help react-devtools', + description: 'React Native performance, profiling, component tree, and renders', + }, { label: 'help remote', description: 'Remote config, tenants, leases, and companion tunnels' }, { label: 'help macos', description: 'Desktop, frontmost-app, and menu bar surfaces' }, { label: 'help dogfood', description: 'Exploratory QA report workflow' }, @@ -295,7 +298,7 @@ React DevTools minimum loop: Escalate: help debugging logs, network, alerts, traces, flaky runtime failures - help react-devtools props, state, hooks, component tree, slow renders, rerenders + help react-devtools React Native performance, profiling, props/state/hooks, slow renders, rerenders help remote remote-config, tenant, lease, remote Android companion tunnel help macos desktop, frontmost-app, menu bar surfaces help dogfood exploratory QA report workflow`, @@ -346,13 +349,13 @@ Stabilizers: Re-enable settings you changed before finishing. 
React Native internals: - If the question is about props, state, hooks, render causes, slow components, or rerenders, use help react-devtools instead of inferring from screenshots or logs.`, + If the question is about React Native performance, profiling, props, state, hooks, render causes, slow components, or rerenders, use help react-devtools instead of inferring from screenshots or logs.`, }, 'react-devtools': { - summary: 'React Native component tree and profiling workflow', + summary: 'React Native performance, profiling, and component internals', body: `agent-device help react-devtools -Use this for React Native internals that the accessibility tree cannot expose: components, props, state, hooks, ownership, slow renders, and rerenders. +Use this for React Native performance/profiling and internals that the accessibility tree cannot expose: components, props, state, hooks, ownership, slow renders, and rerenders. Core commands: agent-device react-devtools status @@ -1507,8 +1510,8 @@ const COMMAND_SCHEMAS: Record = { usageOverride: 'react-devtools [...args]', listUsageOverride: 'react-devtools [...args]', helpDescription: - 'Run pinned agent-react-devtools commands for React Native component trees, props/state/hooks, and render profiling', - summary: 'Inspect and profile React Native component trees', + 'Run pinned agent-react-devtools commands for React Native performance profiling, component trees, props/state/hooks, and render analysis', + summary: 'Profile React Native performance and component renders', positionalArgs: ['args?'], allowsExtraPositionals: true, allowedFlags: [], From 6fc9774a4690c426e8975a7e5ce58fb968be58f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Mon, 27 Apr 2026 12:47:00 -0400 Subject: [PATCH 06/22] test: thin skillgym prompt --- src/utils/__tests__/args.test.ts | 1 + src/utils/command-schema.ts | 11 +++++++- test/skillgym/bin/agent-device | 2 ++ test/skillgym/skillgym.config.ts | 9 ++++++ 
.../suites/agent-device-smoke-suite.ts | 28 ++++++++----------- 5 files changed, 34 insertions(+), 17 deletions(-) create mode 100755 test/skillgym/bin/agent-device diff --git a/src/utils/__tests__/args.test.ts b/src/utils/__tests__/args.test.ts index ed6817273..fec9a5621 100644 --- a/src/utils/__tests__/args.test.ts +++ b/src/utils/__tests__/args.test.ts @@ -788,6 +788,7 @@ test('usage includes agent workflows, config, environment, and examples footers' assert.match(usageText, /Agent Quickstart:/); assert.match(usageText, /Default loop: devices\/apps -> open -> snapshot -i/); assert.match(usageText, /Use selectors or refs as positional targets/); + assert.match(usageText, /network headers: network dump --include headers/); assert.match(usageText, /Full operating guide: agent-device help workflow/); assert.match(usageText, /Exploratory QA: agent-device help dogfood/); assert.match(usageText, /Agent Workflows:/); diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index 97288dfe3..f1304c83d 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -174,6 +174,7 @@ const AGENT_QUICKSTART_LINES = [ 'Default loop: devices/apps -> open -> snapshot -i -> press/fill/get/is/wait/find -> verify -> close.', 'Use selectors or refs as positional targets: id="submit", label="Allow", or @ref after snapshot -i.', 'Verification commands must name the expected text/selector; bare screenshots/snapshots are not enough.', + 'Debug evidence: logs clear/mark/path; network headers: network dump --include headers.', 'Use agent-device commands in final plans; raw platform tools, pseudo commands, and helper prose are wrong.', 'Full operating guide: agent-device help workflow. Exploratory QA: agent-device help dogfood.', ] as const; @@ -225,6 +226,7 @@ Command shape: agent-device record start ./checkout.mp4 --session checkout Unknown current ref placeholder: @ref. Use provided labels/ids/selectors when known. Never invent @e#. 
After snapshot -i, use @ref in plans when the exact @e number is unknown. + If a task explicitly says to act by @ref, output press @ref or click @ref after refreshing refs. Close means agent-device close. App-owned back means back; system back means back --system. Taps are press or click. Gestures are direct commands: swipe, longpress, pinch. @@ -294,7 +296,14 @@ React Native dev loop: Do not use agent-device reload. Use open --relaunch for native startup reset. React DevTools minimum loop: - react-devtools status -> react-devtools wait --connected -> react-devtools profile start -> interact -> react-devtools profile stop -> react-devtools profile slow -> react-devtools profile rerenders. + Keep the agent-device react-devtools prefix on every React DevTools command: + agent-device react-devtools status + agent-device react-devtools wait --connected + agent-device react-devtools profile start + interact with normal agent-device commands + agent-device react-devtools profile stop + agent-device react-devtools profile slow --limit 5 + agent-device react-devtools profile rerenders --limit 5 Escalate: help debugging logs, network, alerts, traces, flaky runtime failures diff --git a/test/skillgym/bin/agent-device b/test/skillgym/bin/agent-device new file mode 100755 index 000000000..07f567f4b --- /dev/null +++ b/test/skillgym/bin/agent-device @@ -0,0 +1,2 @@ +#!/bin/sh +exec node "$(dirname "$0")/../../../bin/agent-device.mjs" "$@" diff --git a/test/skillgym/skillgym.config.ts b/test/skillgym/skillgym.config.ts index 2736d87f2..96a7ce13b 100644 --- a/test/skillgym/skillgym.config.ts +++ b/test/skillgym/skillgym.config.ts @@ -1,4 +1,11 @@ import type { SkillGymConfig } from 'skillgym'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const localBinDir = fileURLToPath(new URL('./bin', import.meta.url)); +const runnerEnv = { + PATH: [localBinDir, process.env.PATH].filter(Boolean).join(path.delimiter), +}; const config: SkillGymConfig = { run: { 
@@ -16,12 +23,14 @@ const config: SkillGymConfig = { agent: { type: 'codex', model: 'gpt-5.4-mini', + env: runnerEnv, }, }, 'claude-haiku': { agent: { type: 'claude-code', model: 'haiku', + env: runnerEnv, }, }, }, diff --git a/test/skillgym/suites/agent-device-smoke-suite.ts b/test/skillgym/suites/agent-device-smoke-suite.ts index 6bbe7aea4..3eec26aad 100644 --- a/test/skillgym/suites/agent-device-smoke-suite.ts +++ b/test/skillgym/suites/agent-device-smoke-suite.ts @@ -13,21 +13,8 @@ You are benchmarking agent-device command planning for a known fixture app. Do not read project source files or project docs. Do not inspect examples/test-app, src/, README.md, or website/docs. Do not browse the web. -Use only this prompt plus local CLI help: run node bin/agent-device.mjs help workflow once as private reference. +Use only this prompt plus local CLI help as private reference. Final output: only agent-device commands, one per line. Any prose or Markdown fails. -Follow the task wording exactly: snapshot != screenshot; record != trace; batch is direct; Metro reload is metro reload. -Screenshot evidence uses screenshot --overlay-refs when ref overlays are requested. -Use current command shapes from help: @ref placeholder after snapshot -i, id="..."/label="..." selectors, fill replaces, type appends, press/click not tap. -Use provided labels, ids, and selectors when known; reserve @ref for unknown current refs. -For verification, prefer provided testIDs/selectors over inferred visible text. -If the task says by @ref, output press @ref or click @ref. -App-owned navigation/back means the back command, not clicking a tab. -Use direct gesture forms: swipe 320 500 40 500 --count 8 --pause-ms 30 --pattern ping-pong; longpress 300 500 800; pinch 0.5 200 400. -Off-screen snapshot hints use scroll down/up then snapshot -i, not swipe or coordinate scroll. -If discovery is needed, include devices, apps, and open ; if debounced wait has no result selector, use wait 1000. 
-Use precise helpers when relevant: diff snapshot -i; logs clear --restart -> logs mark -> reproduce -> logs path; network dump --include headers; connect --remote-config -> open -> snapshot -> disconnect; settings animations off/on; macOS menubar uses --platform macos --surface menubar. -React DevTools loop keeps the react-devtools prefix on every profile command: status -> wait --connected -> profile start -> interact -> profile stop -> profile slow -> profile rerenders. -When expected text is provided, include that exact text in a wait/is/get/find command; a bare snapshot or screenshot fails. `.trim(); function buildPrompt(options: { contract: string[]; task: string }) { @@ -36,12 +23,21 @@ function buildPrompt(options: { contract: string[]; task: string }) { } function assertAgentDeviceEvidence(report: SessionReport) { - const hasDetectedSkills = (report.detectedSkills?.length ?? 0) > 0; + const detectedSkills = report.detectedSkills ?? []; + const hasDetectedSkills = detectedSkills.length > 0; + const hasBundledDeviceSkill = detectedSkills.some((skill) => + ['agent-device', 'react-devtools', 'dogfood'].includes(skill.skill), + ); // Some SkillGym runners do not expose skill telemetry. Keep this as a conditional routing // assertion instead of failing otherwise valid command-planning runs on missing metadata. if (hasDetectedSkills) { - assert.skills.has(report, 'agent-device'); + assert.ok( + hasBundledDeviceSkill, + `Expected detectedSkills to include an agent-device bundled skill. 
Observed detectedSkills: ${detectedSkills + .map((skill) => `${skill.skill} (${skill.confidence})`) + .join(', ')}`, + ); } } From 39905135bd3b53fe7e0b6260deeea8d8164be414 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Mon, 27 Apr 2026 12:52:25 -0400 Subject: [PATCH 07/22] docs: gate skills on help topic version --- package.json | 2 +- skills/agent-device/SKILL.md | 10 +++++++++- skills/dogfood/SKILL.md | 10 +++++++++- skills/react-devtools/SKILL.md | 10 +++++++++- 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/package.json b/package.json index 50726ad21..5cf221241 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "agent-device", - "version": "0.13.3", + "version": "0.13.4", "description": "Agent-driven CLI for mobile UI automation, network inspection, and performance diagnostics across iOS, Android, tvOS, and macOS.", "license": "MIT", "author": "Callstack", diff --git a/skills/agent-device/SKILL.md b/skills/agent-device/SKILL.md index dd5ea82f3..8172a2bac 100644 --- a/skills/agent-device/SKILL.md +++ b/skills/agent-device/SKILL.md @@ -5,7 +5,15 @@ description: Automates Apple-platform apps (iOS, tvOS, macOS) and Android device # agent-device -Router only. Before your first agent-device command or plan, read the version-matched CLI guide: +Router only. Private setup before using this skill: + +```bash +agent-device --version +``` + +Require `agent-device >= 0.13.4`; older CLIs lack these help topics. If older, run `npm install -g agent-device@latest`, recheck, then continue. If you cannot upgrade, stop and tell the user. Do not include version/upgrade commands in final plans. 
+ +Before your first agent-device command or plan, read the version-matched CLI guide: ```bash agent-device help workflow diff --git a/skills/dogfood/SKILL.md b/skills/dogfood/SKILL.md index 458d8bffa..af549c5ae 100644 --- a/skills/dogfood/SKILL.md +++ b/skills/dogfood/SKILL.md @@ -6,7 +6,15 @@ allowed-tools: Bash(agent-device:*), Bash(npx agent-device:*) # Dogfood -Router for exploratory QA. Read current CLI guidance: +Router for exploratory QA. Private setup before using this skill: + +```bash +agent-device --version +``` + +Require `agent-device >= 0.13.4`; older CLIs lack these help topics. If older, run `npm install -g agent-device@latest`, recheck, then continue. If you cannot upgrade, stop and tell the user. Do not include version/upgrade commands in final plans. + +Read current CLI guidance: ```bash agent-device help dogfood diff --git a/skills/react-devtools/SKILL.md b/skills/react-devtools/SKILL.md index 2cd517722..c32c4ff14 100644 --- a/skills/react-devtools/SKILL.md +++ b/skills/react-devtools/SKILL.md @@ -5,7 +5,15 @@ description: Inspect and profile React Native component trees from agent-device. # react-devtools -Router for React Native internals. Read current CLI guidance: +Router for React Native internals. Private setup before using this skill: + +```bash +agent-device --version +``` + +Require `agent-device >= 0.13.4`; older CLIs lack these help topics. If older, run `npm install -g agent-device@latest`, recheck, then continue. If you cannot upgrade, stop and tell the user. Do not include version/upgrade commands in final plans. 
+ +Read current CLI guidance: ```bash agent-device help react-devtools From cf618e540f469f1766bc15b4fe5ff1dc2b3b79f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Mon, 27 Apr 2026 13:31:25 -0400 Subject: [PATCH 08/22] test: stabilize thin help skillgym --- src/utils/__tests__/args.test.ts | 9 +++++++-- src/utils/command-schema.ts | 13 +++++++++---- test/skillgym/skillgym.config.ts | 2 +- test/skillgym/suites/agent-device-smoke-suite.ts | 1 + 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/src/utils/__tests__/args.test.ts b/src/utils/__tests__/args.test.ts index fec9a5621..5c1e507e1 100644 --- a/src/utils/__tests__/args.test.ts +++ b/src/utils/__tests__/args.test.ts @@ -758,7 +758,7 @@ test('usage includes concise top-level commands', () => { assert.match(usageText, /pinch \[x\] \[y\]/); assert.match(usageText, /rotate /); assert.match(usageText, /record start \[path\] \| record stop/); - assert.match(usageText, /trace start \[path\] \| trace stop/); + assert.match(usageText, /trace start \| trace stop /); }); test('usage includes only global flags in the top-level flags section', () => { @@ -788,7 +788,12 @@ test('usage includes agent workflows, config, environment, and examples footers' assert.match(usageText, /Agent Quickstart:/); assert.match(usageText, /Default loop: devices\/apps -> open -> snapshot -i/); assert.match(usageText, /Use selectors or refs as positional targets/); - assert.match(usageText, /network headers: network dump --include headers/); + assert.match(usageText, /Plain snapshot reads state; snapshot -i is required/); + assert.match(usageText, /fill 'id="field-email"' "qa@example\.com" replaces/); + assert.match(usageText, /After mutation: diff snapshot -i/); + assert.match(usageText, /app-owned back uses back/); + assert.match(usageText, /trace start \.\/path; trace stop \.\/path/); + assert.match(usageText, /network dump --include headers/); assert.match(usageText, /Full operating guide: 
agent-device help workflow/); assert.match(usageText, /Exploratory QA: agent-device help dogfood/); assert.match(usageText, /Agent Workflows:/); diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index f1304c83d..39f8e9ea1 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -173,8 +173,12 @@ const AGENT_WORKFLOWS = [ const AGENT_QUICKSTART_LINES = [ 'Default loop: devices/apps -> open -> snapshot -i -> press/fill/get/is/wait/find -> verify -> close.', 'Use selectors or refs as positional targets: id="submit", label="Allow", or @ref after snapshot -i.', + 'Plain snapshot reads state; snapshot -i is required to refresh interactive refs.', + 'Text: fill \'id="field-email"\' "qa@example.com" replaces; type appends after press.', + 'After mutation: diff snapshot -i. Off-screen hints: scroll, then snapshot -i.', + 'Navigation: app-owned back uses back; system back uses back --system.', 'Verification commands must name the expected text/selector; bare screenshots/snapshots are not enough.', - 'Debug evidence: logs clear/mark/path; network headers: network dump --include headers.', + 'Debug evidence: logs clear/mark/path; trace start ./path; trace stop ./path; network dump --include headers.', 'Use agent-device commands in final plans; raw platform tools, pseudo commands, and helper prose are wrong.', 'Full operating guide: agent-device help workflow. 
Exploratory QA: agent-device help dogfood.', ] as const; @@ -1704,9 +1708,10 @@ const COMMAND_SCHEMAS: Record = { allowedFlags: ['fps', 'quality', 'hideTouches'], }, trace: { - usageOverride: 'trace start [path] | trace stop [path]', - listUsageOverride: 'trace start [path] | trace stop', - helpDescription: 'Start/stop trace log capture', + usageOverride: 'trace start | trace stop ', + listUsageOverride: 'trace start | trace stop ', + helpDescription: + 'Start/stop trace log capture; when an artifact path is requested, pass the same positional path to start and stop', summary: 'Start or stop trace capture', positionalArgs: ['start|stop', 'path?'], allowedFlags: [], diff --git a/test/skillgym/skillgym.config.ts b/test/skillgym/skillgym.config.ts index 96a7ce13b..f9fe03818 100644 --- a/test/skillgym/skillgym.config.ts +++ b/test/skillgym/skillgym.config.ts @@ -16,7 +16,7 @@ const config: SkillGymConfig = { schedule: 'parallel', }, defaults: { - timeoutMs: 120_000, + timeoutMs: 600_000, }, runners: { 'codex-mini': { diff --git a/test/skillgym/suites/agent-device-smoke-suite.ts b/test/skillgym/suites/agent-device-smoke-suite.ts index 3eec26aad..4886d3d56 100644 --- a/test/skillgym/suites/agent-device-smoke-suite.ts +++ b/test/skillgym/suites/agent-device-smoke-suite.ts @@ -14,6 +14,7 @@ Do not read project source files or project docs. Do not inspect examples/test-app, src/, README.md, or website/docs. Do not browse the web. Use only this prompt plus local CLI help as private reference. +For local CLI help in this repo, use node bin/agent-device.mjs help or --help; final commands still use agent-device. Final output: only agent-device commands, one per line. Any prose or Markdown fails. 
`.trim(); From 6b1d1c204668012362363d238d342be27113efac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Mon, 27 Apr 2026 13:34:56 -0400 Subject: [PATCH 09/22] docs: restore dogfood QA guidance --- src/utils/__tests__/args.test.ts | 6 +++++- src/utils/command-schema.ts | 19 ++++++++++++++----- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/src/utils/__tests__/args.test.ts b/src/utils/__tests__/args.test.ts index 5c1e507e1..da250ed7b 100644 --- a/src/utils/__tests__/args.test.ts +++ b/src/utils/__tests__/args.test.ts @@ -849,8 +849,12 @@ test('usageForCommand resolves dogfood help topic', () => { if (help === null) throw new Error('Expected dogfood help text'); assert.match(help, /agent-device help dogfood/); assert.match(help, /Find user-visible issues from runtime behavior/); + assert.match(help, /Severity: critical blocks a core flow\/data\/crashes/); + assert.match(help, /Interactive\/behavioral issues need step screenshots/); + assert.match(help, /Static\/on-load issues can use one screenshot/); assert.match(help, /dogfood-output\/report\.md/); - assert.match(help, /severity, title, affected flow, repro commands/); + assert.match(help, /ID, severity, category, title, affected flow\/screen/); + assert.match(help, /Never delete screenshots, videos, traces, or report artifacts/); assert.match(help, /screenshot \.\/dogfood-output\/screenshots\/issue-001\.png --overlay-refs/); }); diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index 39f8e9ea1..987c9003b 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -456,14 +456,16 @@ Goal: Loop: 1. Identify target app/platform; ask only if missing. - 2. Create output dirs and open a named session. + 2. Create output dirs and open a named session. If auth or OTP is required, sign in or ask the user for the code. 3. Capture baseline snapshot -i and screenshot. 4. Map top-level navigation, then exercise primary flows and edge states. 
- 5. For each issue, capture evidence immediately, then continue. - 6. Close the session and write the report. + 5. For each issue, capture evidence and write the finding immediately, then continue. + 6. Close the session and reconcile the report summary. Coverage: Navigation, forms, empty/error/loading states, offline or retry behavior, permissions, settings, accessibility labels, orientation/keyboard, and obvious performance stalls. + Categories: visual, functional, UX, content, performance, diagnostics, permissions, accessibility. + Severity: critical blocks a core flow/data/crashes; high breaks a major feature; medium has friction or workaround; low is polish. Evidence commands: mkdir -p ./dogfood-output/screenshots ./dogfood-output/videos ./dogfood-output/traces @@ -478,10 +480,16 @@ Evidence commands: agent-device --session qa record stop agent-device --session qa close +Evidence rules: + Interactive/behavioral issues need step screenshots and usually a repro video. + Static/on-load issues can use one screenshot; set repro video to N/A. + Use screenshot --overlay-refs when showing the tappable target or broken state helps repro. + Report shape: ./dogfood-output/report.md - For each finding: severity, title, affected flow, repro commands, expected, actual, evidence files, notes. - If no issues are found, report coverage completed and residual risk instead of claiming the app is bug-free. + Include date, platform, target app, session, scope, severity counts, and issues. + For each finding: ID, severity, category, title, affected flow/screen, repro commands, expected, actual, evidence files, notes. + Target 5-10 well-evidenced issues when available. If no issues are found, report coverage completed and residual risk instead of claiming the app is bug-free. Rules: Findings must come from observed runtime behavior, not source reads. 
@@ -489,6 +497,7 @@ Rules: Keep commands in the report reproducible; use selectors or refs from fresh snapshots, not guessed coordinates. Prefer refs for exploration and selectors for deterministic replay. Use logs, network, screenshot --overlay-refs, trace, perf, or react-devtools only when they add evidence to a specific issue. + Never delete screenshots, videos, traces, or report artifacts during a session. Escalate to help debugging or help react-devtools when runtime symptoms require those tools.`, }, } as const satisfies Record; From 41fc234e4672ffe2b906c9cfab015413411affa0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Mon, 27 Apr 2026 13:40:38 -0400 Subject: [PATCH 10/22] test: cover truncated text scoped snapshots --- src/utils/__tests__/args.test.ts | 4 ++++ src/utils/command-schema.ts | 7 ++++-- .../suites/agent-device-smoke-suite.ts | 22 +++++++++++++++++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/src/utils/__tests__/args.test.ts b/src/utils/__tests__/args.test.ts index da250ed7b..05db8fb14 100644 --- a/src/utils/__tests__/args.test.ts +++ b/src/utils/__tests__/args.test.ts @@ -789,6 +789,7 @@ test('usage includes agent workflows, config, environment, and examples footers' assert.match(usageText, /Default loop: devices\/apps -> open -> snapshot -i/); assert.match(usageText, /Use selectors or refs as positional targets/); assert.match(usageText, /Plain snapshot reads state; snapshot -i is required/); + assert.match(usageText, /Truncated text\/input preview: expand first with snapshot -s @ref/); assert.match(usageText, /fill 'id="field-email"' "qa@example\.com" replaces/); assert.match(usageText, /After mutation: diff snapshot -i/); assert.match(usageText, /app-owned back uses back/); @@ -831,6 +832,8 @@ test('usageForCommand resolves workflow help topic', () => { assert.match(help, /agent-device help workflow/); assert.match(help, /Use selectors as positional targets/); assert.match(help, /Do not use 
CSS selectors/); + assert.match(help, /Truncated text\/input previews: do not use get text first/); + assert.match(help, /snapshot -s @e7/); assert.match(help, /help react-devtools/); }); @@ -842,6 +845,7 @@ test('workflow help keeps common copyable command forms', () => { assert.match(help, /connect --remote-config/); assert.match(help, /metro reload/); assert.match(help, /screenshot --overlay-refs/); + assert.match(help, /snapshot -s @ref/); }); test('usageForCommand resolves dogfood help topic', () => { diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index 987c9003b..2a6869b6a 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -174,6 +174,7 @@ const AGENT_QUICKSTART_LINES = [ 'Default loop: devices/apps -> open -> snapshot -i -> press/fill/get/is/wait/find -> verify -> close.', 'Use selectors or refs as positional targets: id="submit", label="Allow", or @ref after snapshot -i.', 'Plain snapshot reads state; snapshot -i is required to refresh interactive refs.', + 'Truncated text/input preview: expand first with snapshot -s @ref, not get text.', 'Text: fill \'id="field-email"\' "qa@example.com" replaces; type appends after press.', 'After mutation: diff snapshot -i. Off-screen hints: scroll, then snapshot -i.', 'Navigation: app-owned back uses back; system back uses back --system.', @@ -247,6 +248,7 @@ Snapshots and refs: snapshot reads visible state. snapshot -i gets current interactive refs. Re-snapshot after navigation, submit, modal/list/reload/dynamic changes. Off-screen summaries are scroll hints; use scroll, not swipe, then snapshot -i. + Truncated text/input previews: do not use get text first; expand with snapshot -s @ref (for example snapshot -s @e7), then read the scoped output. Selectors: Use selectors as positional targets: id="field-email" or label="Allow". 
@@ -1598,8 +1600,9 @@ const COMMAND_SCHEMAS: Record = { }, get: { usageOverride: 'get text|attrs <@ref|selector>', - helpDescription: 'Return element text/attributes by ref or selector', - summary: 'Get text or attrs by ref or selector', + helpDescription: + 'Return exposed element text/attributes by ref or selector; use snapshot -s @ref for truncated previews', + summary: 'Get exposed text or attrs by ref or selector', positionalArgs: ['subcommand', 'target'], allowedFlags: [...SELECTOR_SNAPSHOT_FLAGS], }, diff --git a/test/skillgym/suites/agent-device-smoke-suite.ts b/test/skillgym/suites/agent-device-smoke-suite.ts index 4886d3d56..624c96bea 100644 --- a/test/skillgym/suites/agent-device-smoke-suite.ts +++ b/test/skillgym/suites/agent-device-smoke-suite.ts @@ -430,6 +430,28 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ ], forbiddenOutputs: [RAW_COORDINATE_TARGET, /\btestID=/i], }), + makeCase({ + id: 'truncated-text-input-scope-ref', + contract: [ + 'App name: Agent Device Tester', + 'Current screen: Checkout form tab', + 'Fresh interactive snapshot already shows @e7 [textinput] "Delivery instructions" [preview:"Leave at side gate..." 
truncated]', + 'Need the full text value of that truncated input before deciding whether to edit it', + ], + task: 'Plan the command to expand the truncated Delivery instructions text input using the current @e7 ref.', + outputs: [ + commandPattern('snapshot'), + /(?:^|\n)(?:agent-device\s+)?snapshot\b.*(?:-s|--scope)\s+@e7\b/i, + ], + forbiddenOutputs: [ + /snapshot --raw/i, + commandPattern('get'), + commandPattern('fill'), + commandPattern('type'), + commandPattern('press'), + RAW_COORDINATE_TARGET, + ], + }), makeCase({ id: 'target-selector-for-durable-field', contract: [ From 53174b1d06cf776295d7fcb90fcff2eb9a884331 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Mon, 27 Apr 2026 13:42:52 -0400 Subject: [PATCH 11/22] docs: clarify remote help wording --- src/utils/command-schema.ts | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index 2a6869b6a..f3e827ca9 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -165,7 +165,10 @@ const AGENT_WORKFLOWS = [ label: 'help react-devtools', description: 'React Native performance, profiling, component tree, and renders', }, - { label: 'help remote', description: 'Remote config, tenants, leases, and companion tunnels' }, + { + label: 'help remote', + description: 'Remote/cloud config, tenants, leases, and local service tunnels', + }, { label: 'help macos', description: 'Desktop, frontmost-app, and menu bar surfaces' }, { label: 'help dogfood', description: 'Exploratory QA report workflow' }, ] as const; @@ -314,7 +317,7 @@ React DevTools minimum loop: Escalate: help debugging logs, network, alerts, traces, flaky runtime failures help react-devtools React Native performance, profiling, props/state/hooks, slow renders, rerenders - help remote remote-config, tenant, lease, remote Android companion tunnel + help remote remote/cloud config, tenant, lease, local service tunnels help macos 
desktop, frontmost-app, menu bar surfaces help dogfood exploratory QA report workflow`, }, @@ -417,7 +420,7 @@ Rules: connect and disconnect are top-level commands. Do not write agent-device remote connect or agent-device remote disconnect. Prefer --remote-config over --daemon-base-url, --tenant, --run-id, and --lease-id in ordinary remote flows. After connect, let the active remote connection supply runtime hints. - For remote Android React DevTools, run agent-device react-devtools normally. The CLI opens the companion tunnel for the local DevTools daemon and cleans it up when the command exits. + For remote Android React DevTools, run agent-device react-devtools normally. The CLI opens the needed local service tunnel for the DevTools daemon and cleans it up when the command exits. Use --debug when remote connection or transport errors need diagnostic ids and remote log hints.`, }, macos: { From b7c84586c3627585f3af1ab3577d8fe95713335d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Mon, 27 Apr 2026 14:24:01 -0400 Subject: [PATCH 12/22] test: cover rn overlays and expo launch guidance --- src/utils/__tests__/args.test.ts | 9 ++++ src/utils/command-schema.ts | 11 +++++ .../suites/agent-device-smoke-suite.ts | 49 +++++++++++++++++++ 3 files changed, 69 insertions(+) diff --git a/src/utils/__tests__/args.test.ts b/src/utils/__tests__/args.test.ts index 05db8fb14..fb4085915 100644 --- a/src/utils/__tests__/args.test.ts +++ b/src/utils/__tests__/args.test.ts @@ -790,6 +790,8 @@ test('usage includes agent workflows, config, environment, and examples footers' assert.match(usageText, /Use selectors or refs as positional targets/); assert.match(usageText, /Plain snapshot reads state; snapshot -i is required/); assert.match(usageText, /Truncated text\/input preview: expand first with snapshot -s @ref/); + assert.match(usageText, /RN warning\/error overlays can block taps/); + assert.match(usageText, /Expo Go\/dev clients need their provided 
exp:\/\//); assert.match(usageText, /fill 'id="field-email"' "qa@example\.com" replaces/); assert.match(usageText, /After mutation: diff snapshot -i/); assert.match(usageText, /app-owned back uses back/); @@ -834,6 +836,11 @@ test('usageForCommand resolves workflow help topic', () => { assert.match(help, /Do not use CSS selectors/); assert.match(help, /Truncated text\/input previews: do not use get text first/); assert.match(help, /snapshot -s @e7/); + assert.match(help, /Warning\/error overlays can obscure UI and intercept taps/); + assert.match(help, /agent-device open exp:\/\/127\.0\.0\.1:8081 --platform ios/); + assert.match(help, /agent-device open "Expo Go" exp:\/\/127\.0\.0\.1:8081 --platform ios/); + assert.match(help, /agent-device open exp:\/\/127\.0\.0\.1:8081 --platform android/); + assert.match(help, /metro prepare --kind expo/); assert.match(help, /help react-devtools/); }); @@ -856,6 +863,8 @@ test('usageForCommand resolves dogfood help topic', () => { assert.match(help, /Severity: critical blocks a core flow\/data\/crashes/); assert.match(help, /Interactive\/behavioral issues need step screenshots/); assert.match(help, /Static\/on-load issues can use one screenshot/); + assert.match(help, /React Native warning\/error overlays can be real findings/); + assert.match(help, /Expo Go\/dev-client shells/); assert.match(help, /dogfood-output\/report\.md/); assert.match(help, /ID, severity, category, title, affected flow\/screen/); assert.match(help, /Never delete screenshots, videos, traces, or report artifacts/); diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index f3e827ca9..ebc5eadef 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -178,6 +178,8 @@ const AGENT_QUICKSTART_LINES = [ 'Use selectors or refs as positional targets: id="submit", label="Allow", or @ref after snapshot -i.', 'Plain snapshot reads state; snapshot -i is required to refresh interactive refs.', 'Truncated text/input preview: expand 
first with snapshot -s @ref, not get text.', + 'RN warning/error overlays can block taps: screenshot, dismiss/close, then snapshot -i.', + 'Expo Go/dev clients need their provided exp:// or dev-client URL; do not invent app ids.', 'Text: fill \'id="field-email"\' "qa@example.com" replaces; type appends after press.', 'After mutation: diff snapshot -i. Off-screen hints: scroll, then snapshot -i.', 'Navigation: app-owned back uses back; system back uses back --system.', @@ -303,6 +305,13 @@ React Native dev loop: agent-device metro reload agent-device find "Home" Do not use agent-device reload. Use open --relaunch for native startup reset. + Warning/error overlays can obscure UI and intercept taps. If visible: screenshot it, dismiss/close it if it is not the task target, then snapshot -i before tapping the real UI. + Expo Go is a host shell; use the provided project URL instead of inventing a bundle id. iOS simulators can open the URL directly; use host + URL when targeting a specific host shell: + agent-device open exp://127.0.0.1:8081 --platform ios + agent-device open "Expo Go" exp://127.0.0.1:8081 --platform ios + Android uses the URL target directly; do not write open there: + agent-device open exp://127.0.0.1:8081 --platform android + Expo Dev Client/development builds: open the installed dev-client app id/name; if a dev-client URL is provided, open that URL next. For Metro setup use metro prepare --kind expo. React DevTools minimum loop: Keep the agent-device react-devtools prefix on every React DevTools command: @@ -469,6 +478,8 @@ Loop: Coverage: Navigation, forms, empty/error/loading states, offline or retry behavior, permissions, settings, accessibility labels, orientation/keyboard, and obvious performance stalls. + React Native warning/error overlays can be real findings or test blockers. Capture them, dismiss if unrelated, re-snapshot, and report them. 
+ Expo Go/dev-client shells: use the provided exp:// or dev-client URL and record whether the shell, project load, or app UI is being tested. Categories: visual, functional, UX, content, performance, diagnostics, permissions, accessibility. Severity: critical blocks a core flow/data/crashes; high breaks a major feature; medium has friction or workaround; low is polish. diff --git a/test/skillgym/suites/agent-device-smoke-suite.ts b/test/skillgym/suites/agent-device-smoke-suite.ts index 624c96bea..f18db04f8 100644 --- a/test/skillgym/suites/agent-device-smoke-suite.ts +++ b/test/skillgym/suites/agent-device-smoke-suite.ts @@ -565,6 +565,55 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ ], forbiddenOutputs: [/open\b.*--relaunch/i, /(?:^|\n)(?:agent-device\s+)?screenshot\b/i], }), + makeCase({ + id: 'rn-warning-overlay-dismiss-before-tap', + contract: [ + 'App name: Agent Device Tester', + 'React Native dev warning overlay is visible after open', + 'Overlay button label: Dismiss', + 'The overlay covers the intended submit target and can intercept taps', + 'Target selector after dismissing overlay: id="submit-order"', + ], + task: 'Plan commands to preserve evidence of the warning overlay, dismiss it, refresh interactive refs, then press the submit target.', + outputs: [commandPattern('screenshot'), /(?:Dismiss|Close)/i, /snapshot -i/i, /submit-order/i], + forbiddenOutputs: [ + RAW_COORDINATE_TARGET, + /(?:^|\n)(?:agent-device\s+)?(?:press|click)\b[^\n]*submit-order[\s\S]*(?:Dismiss|Close)/i, + /alert accept/i, + ], + }), + makeCase({ + id: 'expo-go-ios-project-url', + contract: [ + 'Platform: iOS simulator', + 'Launch context: Expo Go', + 'Project URL: exp://127.0.0.1:8081', + 'The native bundle id for the project is not installed separately', + ], + task: 'Plan the command to launch the Expo project in Expo Go without inventing a native bundle id.', + outputs: [commandPattern('open'), /exp:\/\/127\.0\.0\.1:8081/i, /--platform ios/i], + forbiddenOutputs: [ + 
/open\s+Agent Device Tester/i, + /host\.exp\.Exponent/i, + /com\.(?:callstack|example|agent)/i, + ], + }), + makeCase({ + id: 'expo-go-android-url-only', + contract: [ + 'Platform: Android', + 'Launch context: Expo Go', + 'Project URL: exp://10.0.2.2:8081', + 'Android does not support open <app> <url>; use a URL target for deep links', + ], + task: 'Plan the command to launch the Expo project on Android using the project URL.', + outputs: [commandPattern('open'), /exp:\/\/10\.0\.2\.2:8081/i, /--platform android/i], + forbiddenOutputs: [ + /open\s+(?:"Expo Go"|Expo\s+Go)\s+exp:\/\//i, + /--activity/i, + /host\.exp\.exponent/i, + ], + }), makeCase({ id: 'debug-logs-short-window', contract: [ From 4cddc557b8c331961385651275e67e65663be936 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Mon, 27 Apr 2026 14:40:19 -0400 Subject: [PATCH 13/22] test: cover expo launch after app lookup miss --- src/utils/__tests__/args.test.ts | 1 + src/utils/command-schema.ts | 1 + .../skillgym/suites/agent-device-smoke-suite.ts | 17 +++++++++++++++++ 3 files changed, 19 insertions(+) diff --git a/src/utils/__tests__/args.test.ts b/src/utils/__tests__/args.test.ts index fb4085915..4f1bd3d4e 100644 --- a/src/utils/__tests__/args.test.ts +++ b/src/utils/__tests__/args.test.ts @@ -840,6 +840,7 @@ test('usageForCommand resolves workflow help topic', () => { assert.match(help, /agent-device open exp:\/\/127\.0\.0\.1:8081 --platform ios/); assert.match(help, /agent-device open "Expo Go" exp:\/\/127\.0\.0\.1:8081 --platform ios/); assert.match(help, /agent-device open exp:\/\/127\.0\.0\.1:8081 --platform android/); + assert.match(help, /apps lookup misses the project but shows Expo Go\/dev-client/); assert.match(help, /metro prepare --kind expo/); assert.match(help, /help react-devtools/); }); diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index ebc5eadef..25d26df4e 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -311,6
+311,7 @@ React Native dev loop: agent-device open "Expo Go" exp://127.0.0.1:8081 --platform ios Android uses the URL target directly; do not write open there: agent-device open exp://127.0.0.1:8081 --platform android + If apps lookup misses the project but shows Expo Go/dev-client and a project URL is available, open the URL/host shell; if no URL is available, ask instead of inventing an app id. Expo Dev Client/development builds: open the installed dev-client app id/name; if a dev-client URL is provided, open that URL next. For Metro setup use metro prepare --kind expo. React DevTools minimum loop: diff --git a/test/skillgym/suites/agent-device-smoke-suite.ts b/test/skillgym/suites/agent-device-smoke-suite.ts index f18db04f8..99fb0f117 100644 --- a/test/skillgym/suites/agent-device-smoke-suite.ts +++ b/test/skillgym/suites/agent-device-smoke-suite.ts @@ -598,6 +598,23 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ /com\.(?:callstack|example|agent)/i, ], }), + makeCase({ + id: 'expo-go-ios-after-app-id-miss', + contract: [ + 'Platform: iOS simulator', + 'Target app display name: Agent Device Tester', + 'Previous apps lookup did not list Agent Device Tester', + 'Previous apps lookup did list Expo Go', + 'Project URL: exp://127.0.0.1:8081', + ], + task: 'Plan the next command to launch the project after the app-id lookup miss without inventing a native bundle id.', + outputs: [commandPattern('open'), /exp:\/\/127\.0\.0\.1:8081/i], + forbiddenOutputs: [ + /open\s+Agent Device Tester/i, + /com\.(?:callstack|example|agent)/i, + /host\.exp\.Exponent/i, + ], + }), makeCase({ id: 'expo-go-android-url-only', contract: [ From 86513c370e23b31cf879e011a39a681955f41557 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Mon, 27 Apr 2026 14:44:10 -0400 Subject: [PATCH 14/22] test: refine rn warning overlay workflow --- src/utils/__tests__/args.test.ts | 4 ++-- src/utils/command-schema.ts | 4 ++-- test/skillgym/suites/agent-device-smoke-suite.ts | 15 
++++++++++----- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/utils/__tests__/args.test.ts b/src/utils/__tests__/args.test.ts index 4f1bd3d4e..20c527842 100644 --- a/src/utils/__tests__/args.test.ts +++ b/src/utils/__tests__/args.test.ts @@ -790,7 +790,7 @@ test('usage includes agent workflows, config, environment, and examples footers' assert.match(usageText, /Use selectors or refs as positional targets/); assert.match(usageText, /Plain snapshot reads state; snapshot -i is required/); assert.match(usageText, /Truncated text\/input preview: expand first with snapshot -s @ref/); - assert.match(usageText, /RN warning\/error overlays can block taps/); + assert.match(usageText, /RN warning\/error overlays can block taps: snapshot -i/); assert.match(usageText, /Expo Go\/dev clients need their provided exp:\/\//); assert.match(usageText, /fill 'id="field-email"' "qa@example\.com" replaces/); assert.match(usageText, /After mutation: diff snapshot -i/); @@ -836,7 +836,7 @@ test('usageForCommand resolves workflow help topic', () => { assert.match(help, /Do not use CSS selectors/); assert.match(help, /Truncated text\/input previews: do not use get text first/); assert.match(help, /snapshot -s @e7/); - assert.match(help, /Warning\/error overlays can obscure UI and intercept taps/); + assert.match(help, /If snapshot -i shows one, dismiss\/close its visible control/); assert.match(help, /agent-device open exp:\/\/127\.0\.0\.1:8081 --platform ios/); assert.match(help, /agent-device open "Expo Go" exp:\/\/127\.0\.0\.1:8081 --platform ios/); assert.match(help, /agent-device open exp:\/\/127\.0\.0\.1:8081 --platform android/); diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index 25d26df4e..0b1ac05ff 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -178,7 +178,7 @@ const AGENT_QUICKSTART_LINES = [ 'Use selectors or refs as positional targets: id="submit", label="Allow", or @ref after snapshot -i.', 'Plain 
snapshot reads state; snapshot -i is required to refresh interactive refs.', 'Truncated text/input preview: expand first with snapshot -s @ref, not get text.', - 'RN warning/error overlays can block taps: screenshot, dismiss/close, then snapshot -i.', + 'RN warning/error overlays can block taps: snapshot -i, dismiss/close, then diff snapshot -i.', 'Expo Go/dev clients need their provided exp:// or dev-client URL; do not invent app ids.', 'Text: fill \'id="field-email"\' "qa@example.com" replaces; type appends after press.', 'After mutation: diff snapshot -i. Off-screen hints: scroll, then snapshot -i.', @@ -305,7 +305,7 @@ React Native dev loop: agent-device metro reload agent-device find "Home" Do not use agent-device reload. Use open --relaunch for native startup reset. - Warning/error overlays can obscure UI and intercept taps. If visible: screenshot it, dismiss/close it if it is not the task target, then snapshot -i before tapping the real UI. + Warning/error overlays can obscure UI and intercept taps. If snapshot -i shows one, dismiss/close its visible control (for example Dismiss or Close) if it is not the task target, then diff snapshot -i or snapshot -i before tapping the real UI. Expo Go is a host shell; use the provided project URL instead of inventing a bundle id. 
iOS simulators can open the URL directly; use host + URL when targeting a specific host shell: agent-device open exp://127.0.0.1:8081 --platform ios agent-device open "Expo Go" exp://127.0.0.1:8081 --platform ios diff --git a/test/skillgym/suites/agent-device-smoke-suite.ts b/test/skillgym/suites/agent-device-smoke-suite.ts index 99fb0f117..db0f75e14 100644 --- a/test/skillgym/suites/agent-device-smoke-suite.ts +++ b/test/skillgym/suites/agent-device-smoke-suite.ts @@ -569,14 +569,19 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ id: 'rn-warning-overlay-dismiss-before-tap', contract: [ 'App name: Agent Device Tester', - 'React Native dev warning overlay is visible after open', - 'Overlay button label: Dismiss', - 'The overlay covers the intended submit target and can intercept taps', + 'Current screen after opening will trigger console.warn', + 'Fresh interactive snapshot should show the minimized React Native warning overlay', + 'Overlay close control label: Dismiss', + 'The warning overlay can obscure UI and intercept taps', 'Target selector after dismissing overlay: id="submit-order"', ], - task: 'Plan commands to preserve evidence of the warning overlay, dismiss it, refresh interactive refs, then press the submit target.', - outputs: [commandPattern('screenshot'), /(?:Dismiss|Close)/i, /snapshot -i/i, /submit-order/i], + task: 'Plan commands to identify the warning overlay in snapshot -i, dismiss it, verify the overlay is gone with diff snapshot -i or a fresh snapshot -i, then press the submit target.', + outputs: [ + /snapshot -i[\s\S]*(?:press|click)\b[^\n]*(?:Dismiss|Close|warning)[\s\S]*(?:diff snapshot -i|snapshot\b.*-i)/i, + /submit-order/i, + ], forbiddenOutputs: [ + commandPattern('screenshot'), RAW_COORDINATE_TARGET, /(?:^|\n)(?:agent-device\s+)?(?:press|click)\b[^\n]*submit-order[\s\S]*(?:Dismiss|Close)/i, /alert accept/i, From b3b47674a8b325b89a40851240c735df7b1cdeb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Mon, 
27 Apr 2026 14:49:39 -0400 Subject: [PATCH 15/22] docs: restore react devtools advanced guidance --- skills/react-devtools/SKILL.md | 2 +- src/utils/__tests__/args.test.ts | 20 +++++++++++ src/utils/command-schema.ts | 19 +++++++++++ .../suites/agent-device-smoke-suite.ts | 34 +++++++++++++++++++ 4 files changed, 74 insertions(+), 1 deletion(-) diff --git a/skills/react-devtools/SKILL.md b/skills/react-devtools/SKILL.md index c32c4ff14..5bebf10f0 100644 --- a/skills/react-devtools/SKILL.md +++ b/skills/react-devtools/SKILL.md @@ -36,4 +36,4 @@ agent-device react-devtools profile rerenders --limit 5 Rules: -Keep reads bounded with `--depth`/`find`, treat `@c` refs as reload-local, profile only the investigated interaction, and run the same command in remote Android sessions; the CLI manages the companion tunnel. +Keep reads bounded with `--depth`/`find`, treat `@c` refs as reload-local, profile only the investigated interaction, and run the same command in remote Android sessions; the CLI manages the needed local service tunnel. 
diff --git a/src/utils/__tests__/args.test.ts b/src/utils/__tests__/args.test.ts index 20c527842..daac07ca3 100644 --- a/src/utils/__tests__/args.test.ts +++ b/src/utils/__tests__/args.test.ts @@ -872,6 +872,26 @@ test('usageForCommand resolves dogfood help topic', () => { assert.match(help, /screenshot \.\/dogfood-output\/screenshots\/issue-001\.png --overlay-refs/); }); +test('usageForCommand resolves react-devtools help topic', () => { + const help = usageForCommand('react-devtools'); + if (help === null) throw new Error('Expected react-devtools help text'); + assert.match(help, /agent-device react-devtools start/); + assert.match(help, /agent-device react-devtools wait --component /); + assert.match(help, /agent-device react-devtools find --exact/); + assert.match(help, /agent-device react-devtools errors/); + assert.match(help, /agent-device react-devtools profile report @c5/); + assert.match(help, /agent-device react-devtools profile timeline --limit 20/); + assert.match(help, /agent-device react-devtools profile export profile\.json/); + assert.match( + help, + /agent-device react-devtools profile diff before\.json after\.json --limit 10/, + ); + assert.match(help, /render causes and changed props\/state\/hooks/); + assert.match(help, /@c refs reset after reload\/remount/); + assert.match(help, /isolated --state-dir/); + assert.match(help, /local service tunnel/); +}); + test('apps defaults to --all filter and allows overrides', () => { const defaultFilter = parseArgs(['apps'], { strictFlags: true }); assert.equal(defaultFilter.command, 'apps'); diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index 0b1ac05ff..9c44ce761 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -386,15 +386,25 @@ React Native internals: Use this for React Native performance/profiling and internals that the accessibility tree cannot expose: components, props, state, hooks, ownership, slow renders, and rerenders. 
Core commands: + agent-device react-devtools start + agent-device react-devtools stop agent-device react-devtools status agent-device react-devtools wait --connected + agent-device react-devtools wait --component + agent-device react-devtools count agent-device react-devtools get tree --depth 3 agent-device react-devtools find + agent-device react-devtools find --exact agent-device react-devtools get component @c5 + agent-device react-devtools errors agent-device react-devtools profile start agent-device react-devtools profile stop agent-device react-devtools profile slow --limit 5 agent-device react-devtools profile rerenders --limit 5 + agent-device react-devtools profile report @c5 + agent-device react-devtools profile timeline --limit 20 + agent-device react-devtools profile export profile.json + agent-device react-devtools profile diff before.json after.json --limit 10 Profiling loop: 1. Verify the app is connected: react-devtools status, then wait --connected if needed. @@ -402,6 +412,14 @@ Profiling loop: 3. Drive the interaction with normal agent-device commands. 4. Stop profiling. 5. Inspect slow components and rerenders. + 6. Use profile report @cN for render causes and changed props/state/hooks; use get component @cN for current props/state/hooks. + +Rules: + Start with get tree --depth 3 or find ; use find --exact when fuzzy results are noisy. + @c refs reset after reload/remount. After reload, wait --connected and inspect again. + Keep the profile window narrow; unrelated navigation makes render data noisy. + For cross-platform validation with explicit device selectors, prefer isolated --state-dir and restart react-devtools between platforms. + Remote Android runs normally through agent-device react-devtools; the CLI manages the needed local service tunnel. Expo support depends on the SDK's bundled React Native runtime. 
Example: agent-device react-devtools status @@ -411,6 +429,7 @@ Example: agent-device react-devtools profile stop agent-device react-devtools profile slow --limit 5 agent-device react-devtools profile rerenders --limit 5 + agent-device react-devtools profile report @c5 Use snapshot, screenshot, logs, network, and perf for device/app runtime evidence. Use react-devtools only when component internals or React rendering behavior matters.`, }, diff --git a/test/skillgym/suites/agent-device-smoke-suite.ts b/test/skillgym/suites/agent-device-smoke-suite.ts index db0f75e14..f9b2c4312 100644 --- a/test/skillgym/suites/agent-device-smoke-suite.ts +++ b/test/skillgym/suites/agent-device-smoke-suite.ts @@ -710,6 +710,40 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ ], forbiddenOutputs: [commandPattern('snapshot'), commandPattern('perf')], }), + makeCase({ + id: 'react-devtools-exact-component-inspect', + contract: [ + 'App name: Agent Device Tester', + 'React Native DevTools is connected', + 'Need current props, state, and hooks for component SearchScreen', + 'Fuzzy component search returns noisy matches unless exact matching is used', + ], + task: 'Plan bounded React DevTools commands to find the exact SearchScreen component and inspect it.', + outputs: [ + commandPattern('react-devtools find'), + /SearchScreen/i, + /--exact/i, + commandPattern('react-devtools get component'), + ], + forbiddenOutputs: [commandPattern('snapshot'), commandPattern('perf')], + }), + makeCase({ + id: 'react-devtools-render-cause-report', + contract: [ + 'App name: Agent Device Tester', + 'React Native profile has already been stopped', + 'Rerender suspect from profile output: @c12', + 'Need render causes and changed props/state/hooks for that component', + ], + task: 'Plan the React DevTools command to inspect render causes for @c12.', + outputs: [commandPattern('react-devtools profile report'), /@c12/i], + forbiddenOutputs: [ + commandPattern('snapshot'), + commandPattern('perf'), + 
commandPattern('react-devtools profile slow'), + commandPattern('react-devtools profile rerenders'), + ], + }), makeCase({ id: 'gesture-swipe-carousel', contract: [ From 471bfc93af5c61a02c86de6a1809b9f5579f1aca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Mon, 27 Apr 2026 15:05:31 -0400 Subject: [PATCH 16/22] test: cover restored agent guidance gaps --- src/utils/__tests__/args.test.ts | 25 ++++++ src/utils/command-schema.ts | 19 +++++ .../suites/agent-device-smoke-suite.ts | 76 +++++++++++++++++++ 3 files changed, 120 insertions(+) diff --git a/src/utils/__tests__/args.test.ts b/src/utils/__tests__/args.test.ts index daac07ca3..44c802111 100644 --- a/src/utils/__tests__/args.test.ts +++ b/src/utils/__tests__/args.test.ts @@ -836,6 +836,13 @@ test('usageForCommand resolves workflow help topic', () => { assert.match(help, /Do not use CSS selectors/); assert.match(help, /Truncated text\/input previews: do not use get text first/); assert.match(help, /snapshot -s @e7/); + assert.match(help, /Read-only visible\/state question: use snapshot\/get\/is\/find/); + assert.match(help, /Use snapshot -i only when refs are needed/); + assert.match(help, /install-from-source --github-actions-artifact org\/repo:app-debug/); + assert.match(help, /Do not open artifact paths or invent package ids/); + assert.match(help, /agent-device get attrs @e4/); + assert.match(help, /Ambiguous find: add --first or --last/); + assert.match(help, /report that gap instead of typing\/searching\/navigating/); assert.match(help, /If snapshot -i shows one, dismiss\/close its visible control/); assert.match(help, /agent-device open exp:\/\/127\.0\.0\.1:8081 --platform ios/); assert.match(help, /agent-device open "Expo Go" exp:\/\/127\.0\.0\.1:8081 --platform ios/); @@ -856,6 +863,24 @@ test('workflow help keeps common copyable command forms', () => { assert.match(help, /snapshot -s @ref/); }); +test('usageForCommand resolves remote help topic', () => { + const help = 
usageForCommand('remote'); + if (help === null) throw new Error('Expected remote help text'); + assert.match(help, /agent-device open com\.example\.app --remote-config \.\/remote-config\.json/); + assert.match(help, /disconnect --remote-config \.\/remote-config\.json/); + assert.match(help, /Script flow, per-command config/); + assert.match(help, /same --remote-config to every operational command/); + assert.match(help, /install-from-source --github-actions-artifact org\/repo:artifact/); +}); + +test('usageForCommand resolves macos help topic', () => { + const help = usageForCommand('macos'); + if (help === null) throw new Error('Expected macos help text'); + assert.match(help, /agent-device click @e66 --button secondary --platform macos/); + assert.match(help, /Context menus are not ambient UI/); + assert.match(help, /menu-item refs/); +}); + test('usageForCommand resolves dogfood help topic', () => { const help = usageForCommand('dogfood'); if (help === null) throw new Error('Expected dogfood help text'); diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index 9c44ce761..e1de5209c 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -177,6 +177,7 @@ const AGENT_QUICKSTART_LINES = [ 'Default loop: devices/apps -> open -> snapshot -i -> press/fill/get/is/wait/find -> verify -> close.', 'Use selectors or refs as positional targets: id="submit", label="Allow", or @ref after snapshot -i.', 'Plain snapshot reads state; snapshot -i is required to refresh interactive refs.', + 'Read-only visible/state question: use snapshot/get/is/find; use snapshot -i only when refs are needed.', 'Truncated text/input preview: expand first with snapshot -s @ref, not get text.', 'RN warning/error overlays can block taps: snapshot -i, dismiss/close, then diff snapshot -i.', 'Expo Go/dev clients need their provided exp:// or dev-client URL; do not invent app ids.', @@ -247,7 +248,9 @@ Bootstrap: agent-device open --session checkout --platform 
android agent-device install com.example.app ./dist/app.apk --platform android agent-device reinstall com.example.app ./build/MyApp.app --platform ios + agent-device install-from-source --github-actions-artifact org/repo:app-debug --platform android If app id is unknown, plan devices, apps, then open . Install arguments are app/package id then artifact path. Fresh install state: open with --relaunch. + Do not open artifact paths or invent package ids. If apps lookup misses the target and no URL/artifact is provided, ask or stop. Snapshots and refs: snapshot reads visible state. snapshot -i gets current interactive refs. @@ -272,11 +275,15 @@ Text entry: Debounced field with no result selector: agent-device wait 1000. Keyboard read-only: keyboard status/get. Blocked control: keyboard dismiss. Read-only and waits: + Read-only visible/state question: use snapshot/get/is/find. agent-device snapshot agent-device get text 'id="product-title"' + agent-device get attrs @e4 agent-device is visible 'label="Online"' agent-device wait visible 'label="Refreshing metrics..."' 3000 agent-device find "Increment" press --json + Use snapshot -i only when refs are needed for an action or targeted query. + Ambiguous find: add --first or --last. If info is not visible/exposed, report that gap instead of typing/searching/navigating to reveal it. Navigation and gestures: Use scroll for lists; swipe for coordinate gestures/carousels. @@ -445,9 +452,16 @@ Normal flow: agent-device snapshot agent-device disconnect +Script flow, per-command config: + agent-device open com.example.app --remote-config ./remote-config.json + agent-device snapshot --remote-config ./remote-config.json + agent-device disconnect --remote-config ./remote-config.json + Rules: connect and disconnect are top-level commands. Do not write agent-device remote connect or agent-device remote disconnect. Prefer --remote-config over --daemon-base-url, --tenant, --run-id, and --lease-id in ordinary remote flows. 
+ For self-contained scripts, pass the same --remote-config to every operational command, including disconnect; a preceding connect is optional but not required. + For remote artifact installs, use install-from-source or install-from-source --github-actions-artifact org/repo:artifact; do not download CI artifacts locally first. After connect, let the active remote connection supply runtime hints. For remote Android React DevTools, run agent-device react-devtools normally. The CLI opens the needed local service tunnel for the DevTools daemon and cleans it up when the command exits. Use --debug when remote connection or transport errors need diagnostic ids and remote log hints.`, @@ -472,8 +486,13 @@ Menu bar app example: agent-device open "Agent Device Tester Menu" --platform macos --surface menubar agent-device snapshot -i --platform macos --surface menubar +Context menu example: + agent-device click @e66 --button secondary --platform macos + agent-device snapshot -i --platform macos + Rules: Use open and snapshot -i for menu bar inspection. Do not output inspect as a command. + Context menus are not ambient UI: secondary-click a visible target, then re-snapshot and use the new menu-item refs. Do not let iOS simulator-set scoping hide macOS desktop targets. Prefer refs/selectors over raw coordinates. 
macOS snapshot rects are window-space; use current refs or overlay refs instead of guessing coordinates.`, diff --git a/test/skillgym/suites/agent-device-smoke-suite.ts b/test/skillgym/suites/agent-device-smoke-suite.ts index f9b2c4312..62c402c47 100644 --- a/test/skillgym/suites/agent-device-smoke-suite.ts +++ b/test/skillgym/suites/agent-device-smoke-suite.ts @@ -551,6 +551,48 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ ], forbiddenOutputs: [/open\s+\.\/dist\/agent-device-tester\.apk/i], }), + makeCase({ + id: 'install-from-github-artifact-before-open', + contract: [ + 'Platform: Android', + 'Install source: GitHub Actions artifact callstackincubator/agent-device:agent-device-tester-apk', + 'Known package after install: com.callstack.agentdevicetester', + 'Remote daemon can resolve the artifact server-side', + ], + task: 'Plan commands to install from the GitHub Actions artifact, then open the installed package in fresh runtime state.', + outputs: [ + commandPattern('install-from-source'), + /--github-actions-artifact\s+callstackincubator\/agent-device:agent-device-tester-apk/i, + commandPattern('open'), + /com\.callstack\.agentdevicetester/i, + /--relaunch/i, + ], + forbiddenOutputs: [ + /curl\b/i, + /gh\s+(?:run|artifact|download)/i, + /open\s+.*agent-device-tester-apk/i, + ], + }), + makeCase({ + id: 'hidden-info-do-not-force-ui', + contract: [ + 'App name: Agent Device Tester', + 'Current screen: Home tab', + 'Question: what is the hidden promo code?', + 'The current screen does not expose any promo code text or selector', + 'No interaction was requested', + ], + task: 'Plan the minimal read-only command to inspect exposed UI without typing, navigating, or mutating the app to reveal hidden information.', + outputs: [commandPattern('snapshot')], + forbiddenOutputs: [ + /snapshot -i/i, + commandPattern('press'), + commandPattern('click'), + commandPattern('fill'), + commandPattern('type'), + commandPattern('open'), + ], + }), makeCase({ id: 
'metro-reload-dev-loop', contract: [ @@ -878,6 +920,22 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ commandPattern('screenshot'), ], }), + makeCase({ + id: 'remote-config-script-flow', + contract: [ + 'Remote config path: ./remote-config.json', + 'App package: com.callstack.agentdevicetester', + 'This is a self-contained script where every command must be explicit', + 'The remote profile owns tenant, run, lease, and Metro hints', + ], + task: 'Plan a self-contained remote script that opens the app, captures a snapshot, and disconnects using the remote config on every command.', + outputs: [ + /open\b[^\n]*--remote-config\s+\.\/remote-config\.json/i, + /snapshot\b[^\n]*--remote-config\s+\.\/remote-config\.json/i, + /disconnect\b[^\n]*--remote-config\s+\.\/remote-config\.json/i, + ], + forbiddenOutputs: [/--daemon-base-url/i, /--tenant/i, /--run-id/i], + }), makeCase({ id: 'macos-menubar-surface', contract: [ @@ -890,6 +948,24 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ outputs: [/--platform macos/i, /--surface menubar/i, /snapshot\b.*(?:-i\b|\s-i\b)/i], forbiddenOutputs: [/--surface app/i, /snapshot --raw/i], }), + makeCase({ + id: 'macos-context-menu-secondary-click', + contract: [ + 'Platform: macOS', + 'Current surface: app', + 'Target row current ref: @e66', + 'Need to open its native context menu and inspect menu item refs', + ], + task: 'Plan commands to open the context menu for @e66 and then refresh interactive refs for the menu items.', + outputs: [ + commandPattern('click'), + /@e66/i, + /--button\s+secondary/i, + /--platform\s+macos/i, + /snapshot\b.*-i/i, + ], + forbiddenOutputs: [commandPattern('longpress'), RAW_COORDINATE_TARGET, /--surface menubar/i], + }), makeCase({ id: 'replay-maintenance-update', contract: [ From e710350bb76181c951a839d2ade3a8641463db17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Mon, 27 Apr 2026 17:33:04 -0400 Subject: [PATCH 17/22] test: cover device workflow edge cases --- 
src/__tests__/cli-help.test.ts | 17 ++++ src/utils/__tests__/args.test.ts | 6 +- src/utils/command-schema.ts | 29 ++++-- test/skillgym/README.md | 8 +- test/skillgym/skillgym.config.ts | 2 +- .../suites/agent-device-smoke-suite.ts | 91 +++++++++++++++++-- 6 files changed, 132 insertions(+), 21 deletions(-) diff --git a/src/__tests__/cli-help.test.ts b/src/__tests__/cli-help.test.ts index c6a1ea60b..5e451cf37 100644 --- a/src/__tests__/cli-help.test.ts +++ b/src/__tests__/cli-help.test.ts @@ -61,6 +61,23 @@ test('help workflow prints agent workflow topic and skips daemon dispatch', asyn assert.match(result.stdout, /Do not use CSS selectors/); }); +test('help workflow preserves known device workaround guidance', async () => { + const result = await runCliCapture(['help', 'workflow']); + assert.equal(result.code, 0); + assert.equal(result.calls.length, 0); + assert.match(result.stdout, /disabled\/hittable:false/); + assert.match(result.stdout, /snapshot -i -c --json/); + assert.match(result.stdout, /@Label_Name/); + assert.match(result.stdout, /press @e12/); + assert.match(result.stdout, /Snapshot legend:/); + assert.match(result.stdout, /preview="Leave at side\.\.\." 
truncated/); + assert.match(result.stdout, /wait text/); + assert.match(result.stdout, /Never use args/); + assert.match(result.stdout, /Never use args, step/); + assert.match(result.stdout, /scrollintoview/); + assert.match(result.stdout, /--delay-ms/); +}); + test('help unknown command prints error plus global usage and skips daemon dispatch', async () => { const result = await runCliCapture(['help', 'not-a-command']); assert.equal(result.code, 1); diff --git a/src/utils/__tests__/args.test.ts b/src/utils/__tests__/args.test.ts index 44c802111..c0a0cc0f5 100644 --- a/src/utils/__tests__/args.test.ts +++ b/src/utils/__tests__/args.test.ts @@ -789,7 +789,7 @@ test('usage includes agent workflows, config, environment, and examples footers' assert.match(usageText, /Default loop: devices\/apps -> open -> snapshot -i/); assert.match(usageText, /Use selectors or refs as positional targets/); assert.match(usageText, /Plain snapshot reads state; snapshot -i is required/); - assert.match(usageText, /Truncated text\/input preview: expand first with snapshot -s @ref/); + assert.match(usageText, /Truncated text\/input preview: expand first with snapshot -s @e12/); assert.match(usageText, /RN warning\/error overlays can block taps: snapshot -i/); assert.match(usageText, /Expo Go\/dev clients need their provided exp:\/\//); assert.match(usageText, /fill 'id="field-email"' "qa@example\.com" replaces/); @@ -834,6 +834,8 @@ test('usageForCommand resolves workflow help topic', () => { assert.match(help, /agent-device help workflow/); assert.match(help, /Use selectors as positional targets/); assert.match(help, /Do not use CSS selectors/); + assert.match(help, /Snapshot legend:/); + assert.match(help, /@e12 \[button\] label="Add to cart"/); assert.match(help, /Truncated text\/input previews: do not use get text first/); assert.match(help, /snapshot -s @e7/); assert.match(help, /Read-only visible\/state question: use snapshot\/get\/is\/find/); @@ -860,7 +862,7 @@ test('workflow help 
keeps common copyable command forms', () => { assert.match(help, /connect --remote-config/); assert.match(help, /metro reload/); assert.match(help, /screenshot --overlay-refs/); - assert.match(help, /snapshot -s @ref/); + assert.match(help, /snapshot -s @e7/); }); test('usageForCommand resolves remote help topic', () => { diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index e1de5209c..c40ab3e1f 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -175,14 +175,16 @@ const AGENT_WORKFLOWS = [ const AGENT_QUICKSTART_LINES = [ 'Default loop: devices/apps -> open -> snapshot -i -> press/fill/get/is/wait/find -> verify -> close.', - 'Use selectors or refs as positional targets: id="submit", label="Allow", or @ref after snapshot -i.', + 'Use selectors or refs as positional targets: id="submit", label="Allow", or @e12 from snapshot -i.', 'Plain snapshot reads state; snapshot -i is required to refresh interactive refs.', 'Read-only visible/state question: use snapshot/get/is/find; use snapshot -i only when refs are needed.', - 'Truncated text/input preview: expand first with snapshot -s @ref, not get text.', + 'Truncated text/input preview: expand first with snapshot -s @e12, not get text.', 'RN warning/error overlays can block taps: snapshot -i, dismiss/close, then diff snapshot -i.', 'Expo Go/dev clients need their provided exp:// or dev-client URL; do not invent app ids.', 'Text: fill \'id="field-email"\' "qa@example.com" replaces; type appends after press.', 'After mutation: diff snapshot -i. 
Off-screen hints: scroll, then snapshot -i.', + 'Raw coordinates are fallback-only: use snapshot -i -c --json rects when iOS refs no-op or child refs are missing.', + 'Batch JSON steps use "command", "positionals", "flags"; never "args" or "step".', 'Navigation: app-owned back uses back; system back uses back --system.', 'Verification commands must name the expected text/selector; bare screenshots/snapshots are not enough.', 'Debug evidence: logs clear/mark/path; trace start ./path; trace stop ./path; network dump --include headers.', @@ -235,9 +237,8 @@ Command shape: Put subcommand first, then positionals, then flags: agent-device open com.example.app --session checkout --platform android --relaunch agent-device record start ./checkout.mp4 --session checkout - Unknown current ref placeholder: @ref. Use provided labels/ids/selectors when known. Never invent @e#. - After snapshot -i, use @ref in plans when the exact @e number is unknown. - If a task explicitly says to act by @ref, output press @ref or click @ref after refreshing refs. + Snapshot refs look like @e12. After snapshot -i, use the exact @eN ref from that output. + If the exact ref is not known yet, first output snapshot -i, then use a concrete example shape like press @e12 in the next command; do not write @, @ref, @Label_Name, or @eN placeholders. Close means agent-device close. App-owned back means back; system back means back --system. Taps are press or click. Gestures are direct commands: swipe, longpress, pinch. @@ -249,14 +250,21 @@ Bootstrap: agent-device install com.example.app ./dist/app.apk --platform android agent-device reinstall com.example.app ./build/MyApp.app --platform ios agent-device install-from-source --github-actions-artifact org/repo:app-debug --platform android - If app id is unknown, plan devices, apps, then open . Install arguments are app/package id then artifact path. Fresh install state: open with --relaunch. 
+ agent-device open com.example.app --platform android --relaunch + If app id is unknown, plan devices, apps, then open . Install arguments are app/package id then artifact path. After install, install-from-source, or reinstall, open the installed id with --relaunch for fresh runtime state. Do not open artifact paths or invent package ids. If apps lookup misses the target and no URL/artifact is provided, ask or stop. Snapshots and refs: snapshot reads visible state. snapshot -i gets current interactive refs. + Snapshot legend: + @e12 [button] label="Add to cart" id="add-cart" enabled hittable -> press @e12 or press 'id="add-cart"'. + @e13 [textinput] label="Notes" preview="Leave at side..." truncated -> snapshot -s @e13 before reading. + [off-screen below] 4 items: "Privacy", "About" -> scroll down, then snapshot -i; those are hints, not refs. Re-snapshot after navigation, submit, modal/list/reload/dynamic changes. Off-screen summaries are scroll hints; use scroll, not swipe, then snapshot -i. + Missing target in a long list: use a short manual scroll + snapshot loop with a max attempt count; do not rely on unbounded scrollintoview. Truncated text/input previews: do not use get text first; expand with snapshot -s @ref (for example snapshot -s @e7), then read the scoped output. + Rare iOS accessibility gaps: if a row ref is shown disabled/hittable:false and press @ref reports success but no UI change, or a horizontal tab/filter bar is collapsed into one composite/seekbar with no child refs, run agent-device snapshot -i -c --json to read rects, compute the target center, press x y, then diff snapshot -i. Coordinates are fallback-only; document why you used them. Selectors: Use selectors as positional targets: id="field-email" or label="Allow". @@ -273,6 +281,7 @@ Text entry: agent-device press 'id="product-note"' agent-device type "Handle with care" --delay-ms 80 Debounced field with no result selector: agent-device wait 1000. Keyboard read-only: keyboard status/get. 
Blocked control: keyboard dismiss. + Search-as-you-type fields on iOS can drop characters when driven too fast; use --delay-ms on fill/type before trying clipboard paste. Read-only and waits: Read-only visible/state question: use snapshot/get/is/find. @@ -280,13 +289,16 @@ Read-only and waits: agent-device get text 'id="product-title"' agent-device get attrs @e4 agent-device is visible 'label="Online"' - agent-device wait visible 'label="Refreshing metrics..."' 3000 + agent-device wait text "Refreshing metrics..." 3000 + agent-device wait 'label="Ready"' 3000 agent-device find "Increment" press --json + For async/list text presence, prefer wait text over is visible when no interaction is needed. Use snapshot -i only when refs are needed for an action or targeted query. Ambiguous find: add --first or --last. If info is not visible/exposed, report that gap instead of typing/searching/navigating to reveal it. Navigation and gestures: Use scroll for lists; swipe for coordinate gestures/carousels. + If app-owned back is ambiguous or has just misrouted, prefer a visible nav/back button ref, tab-bar ref, or deep link over repeated back/system back. Keep count/pause/pattern on one swipe; flags are --count, --pause-ms, --pattern ping-pong. longpress duration and pinch scale/center are positional: agent-device longpress 300 500 800 @@ -302,6 +314,9 @@ Validation and evidence: Startup/CPU/memory: perf --json or metrics. Replay maintenance: replay -u ./flow.ad. Recording: record start/stop. Tracing: trace start ./trace.log, trace stop ./trace.log. Paths are positional. Stable known flow: batch ./steps.json, not workflow batch. + Inline batch JSON example: + agent-device batch --steps '[{"command":"open","positionals":["settings"],"flags":{}},{"command":"wait","positionals":["100"],"flags":{}}]' + Batch step keys are command, positionals, flags, and optional runtime. Never use args, step, text, or target as batch step fields. 
Android animations: settings animations off/on, not animations disable/restore. Network headers: network dump --include headers. Remote config: connect --remote-config ./remote-config.json, open, snapshot, disconnect. diff --git a/test/skillgym/README.md b/test/skillgym/README.md index 4c669d9b9..0ac741998 100644 --- a/test/skillgym/README.md +++ b/test/skillgym/README.md @@ -16,7 +16,7 @@ The included suite focuses on the first two layers so it stays stable and CI-saf - `../../examples/test-app/`: minimal Expo SDK 55 fixture app for broad UI coverage - `skillgym.config.ts`: starter config that runs Codex and Claude Haiku against this repo -- `suites/agent-device-smoke-suite.ts`: 48-case suite for skill routing, fixture-aware planning, and skill-guidance regressions +- `suites/agent-device-smoke-suite.ts`: 64-case suite for skill routing, fixture-aware planning, and skill-guidance regressions ## Current coverage @@ -35,12 +35,12 @@ Fixture smoke cases cover concrete app surfaces: Skill-guidance regression cases cover distinct command-planning habits: - read-only inspection versus mutation -- fresh `@ref` targeting, durable selectors, and off-screen scroll recovery +- fresh `@ref` targeting, durable selectors, raw-rect fallbacks, and off-screen scroll recovery - text replacement, append semantics, keyboard status, and keyboard dismiss -- install/open setup, app discovery, session scoping, and in-app back navigation +- install/open setup, app discovery, session scoping, and app-owned navigation fallbacks - Metro reload, logs, network dump, alert fallback, and screenshot evidence - performance metrics, React DevTools profiling, gestures, settings, and trace capture -- remote config, macOS menu bar surfaces, replay update, and batch during recording +- remote config, macOS menu bar surfaces, replay update, and batch schema/recording `assertAgentDeviceEvidence` is intentionally soft when a runner does not expose skill-detection telemetry. 
When telemetry exists, the suite asserts that `agent-device` was loaded; when it is absent, the cases still judge command-planning output instead of failing on missing runner metadata. diff --git a/test/skillgym/skillgym.config.ts b/test/skillgym/skillgym.config.ts index f9fe03818..b01784b8b 100644 --- a/test/skillgym/skillgym.config.ts +++ b/test/skillgym/skillgym.config.ts @@ -13,7 +13,7 @@ const config: SkillGymConfig = { cwd: '../..', outputDir: './.skillgym-results', reporter: 'standard', - schedule: 'parallel', + schedule: 'isolated-by-runner', }, defaults: { timeoutMs: 600_000, diff --git a/test/skillgym/suites/agent-device-smoke-suite.ts b/test/skillgym/suites/agent-device-smoke-suite.ts index 62c402c47..7fc1a44d9 100644 --- a/test/skillgym/suites/agent-device-smoke-suite.ts +++ b/test/skillgym/suites/agent-device-smoke-suite.ts @@ -114,6 +114,7 @@ function assertExpectedOutput(report: SessionReport, matchers: Array Date: Mon, 27 Apr 2026 17:36:43 -0400 Subject: [PATCH 18/22] docs: clarify agent-device install guidance --- README.md | 16 +++++++++++++++- website/docs/docs/commands.md | 3 +++ website/docs/docs/installation.md | 19 +++++++++++++++++++ website/docs/docs/introduction.md | 2 +- 4 files changed, 38 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6acac2cc7..31cc4513e 100644 --- a/README.md +++ b/README.md @@ -29,12 +29,26 @@ If you know Vercel's [agent-browser](https://github.com/vercel-labs/agent-browse ## Quick Start -Install the CLI. +Install the CLI first: ```bash npm install -g agent-device +agent-device --version +agent-device help workflow ``` +The CLI help is the source of truth for agents and is shipped with the installed version. Skills are optional but recommended when your agent runtime supports them: they auto-route device, React DevTools, and dogfood tasks to the right `agent-device help ` page and verify the CLI is new enough before acting. 
+ +If you install skills separately, keep the CLI on `agent-device >= 0.13.4`. Older CLIs do not include the workflow help topics that the router skills expect. + +```bash +npm install -g agent-device@latest +agent-device --version +agent-device help +``` + +`agent-device` performs a lightweight background upgrade check for interactive CLI runs and, when a newer package is available, suggests a global reinstall command. Updating the package also refreshes the bundled `skills/` shipped with the CLI. + Prerequisites: Node.js 22+, Xcode for iOS/tvOS/macOS targets, Android SDK + ADB for Android, and macOS Accessibility permission for desktop automation. See [Installation](https://incubator.callstack.com/agent-device/docs/installation). Try the loop. diff --git a/website/docs/docs/commands.md b/website/docs/docs/commands.md index 5cf69754c..5425ae0a6 100644 --- a/website/docs/docs/commands.md +++ b/website/docs/docs/commands.md @@ -11,6 +11,7 @@ For persistent defaults and project-scoped CLI settings, see [Configuration](/do For agent workflow guidance that is matched to the installed CLI, run: ```bash +agent-device help agent-device help workflow agent-device help debugging agent-device help react-devtools @@ -19,6 +20,8 @@ agent-device help macos agent-device help dogfood ``` +Skills are recommended for auto-routing when your agent runtime supports them, but they are not required. The CLI help topics are the version-matched operating contract. + ## Navigation ```bash diff --git a/website/docs/docs/installation.md b/website/docs/docs/installation.md index 124661da1..d7729d5d1 100644 --- a/website/docs/docs/installation.md +++ b/website/docs/docs/installation.md @@ -8,6 +8,23 @@ title: Installation ```bash npm install -g agent-device +agent-device --version +agent-device help +``` + +Use global install for normal agent workflows. 
It gives agents a stable `agent-device` command and version-matched help topics: + +```bash +agent-device help workflow +agent-device help debugging +agent-device help react-devtools +``` + +The CLI help is the source of truth. Skills are optional but recommended when your agent runtime supports them because they auto-route relevant tasks to the installed CLI's own help. The published router skills require `agent-device >= 0.13.4`; if you install or update skills separately, update the CLI too: + +```bash +npm install -g agent-device@latest +agent-device --version ``` Interactive CLI runs periodically check for a newer published `agent-device` package in the background. When an upgrade is available, the CLI suggests reinstalling the package globally; that also refreshes the bundled `skills/` directory shipped with the release. @@ -20,6 +37,8 @@ Set `AGENT_DEVICE_NO_UPDATE_NOTIFIER=1` to disable the notice. npx agent-device open Settings --platform ios ``` +One-off `npx` usage is fine for humans and scripts. For agents, prefer global install so repeated commands and any installed skills resolve to the same CLI version. If an agent cannot rely on skills, it should run `agent-device help` or `agent-device help workflow` before planning device commands. + ## Requirements - Node.js 22+ diff --git a/website/docs/docs/introduction.md b/website/docs/docs/introduction.md index 08a1d78ba..a97cd4264 100644 --- a/website/docs/docs/introduction.md +++ b/website/docs/docs/introduction.md @@ -13,7 +13,7 @@ title: Introduction - Performance snapshots with `perf`/`metrics`, including CPU and memory data where supported If you know `agent-browser`, this is the mobile-native counterpart for iOS/Android UI automation and app-level observability. -For agent-oriented operating guidance, start with `agent-device help workflow`. For exploratory QA, use `agent-device help dogfood`. 
For React Native component trees, props/state/hooks, and render profiling, use `agent-device help react-devtools` and the `agent-device react-devtools` passthrough. +For agent-oriented operating guidance, start with `agent-device help` or `agent-device help workflow`. Skills are recommended auto-routing helpers when your agent runtime supports them, but agents can operate from CLI help alone. For exploratory QA, use `agent-device help dogfood`. For React Native component trees, props/state/hooks, and render profiling, use `agent-device help react-devtools` and the `agent-device react-devtools` passthrough. ## What it’s good at From c83bcddf75517426d7067096d78beb69e502de5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Mon, 27 Apr 2026 18:20:53 -0400 Subject: [PATCH 19/22] docs: route known limitations through help --- src/__tests__/cli-help.test.ts | 3 + src/utils/__tests__/args.test.ts | 4 + src/utils/command-schema.ts | 3 + test/skillgym/README.md | 2 +- .../suites/agent-device-smoke-suite.ts | 31 +++++++ website/docs/docs/_meta.json | 5 -- website/docs/docs/skillgym.md | 83 ------------------- 7 files changed, 42 insertions(+), 89 deletions(-) delete mode 100644 website/docs/docs/skillgym.md diff --git a/src/__tests__/cli-help.test.ts b/src/__tests__/cli-help.test.ts index 5e451cf37..3c2c5ce3f 100644 --- a/src/__tests__/cli-help.test.ts +++ b/src/__tests__/cli-help.test.ts @@ -76,6 +76,9 @@ test('help workflow preserves known device workaround guidance', async () => { assert.match(result.stdout, /Never use args, step/); assert.match(result.stdout, /scrollintoview/); assert.match(result.stdout, /--delay-ms/); + assert.match(result.stdout, /iOS Allow Paste prompt cannot be exercised under XCUITest/); + assert.match(result.stdout, /agent-device clipboard write "some text"/); + assert.match(result.stdout, /trusted ADB keyboard IME/); }); test('help unknown command prints error plus global usage and skips daemon dispatch', async () => 
{ diff --git a/src/utils/__tests__/args.test.ts b/src/utils/__tests__/args.test.ts index c0a0cc0f5..a6f55c916 100644 --- a/src/utils/__tests__/args.test.ts +++ b/src/utils/__tests__/args.test.ts @@ -846,6 +846,9 @@ test('usageForCommand resolves workflow help topic', () => { assert.match(help, /Ambiguous find: add --first or --last/); assert.match(help, /report that gap instead of typing\/searching\/navigating/); assert.match(help, /If snapshot -i shows one, dismiss\/close its visible control/); + assert.match(help, /iOS Allow Paste prompt cannot be exercised under XCUITest/); + assert.match(help, /agent-device clipboard write "some text"/); + assert.match(help, /trusted ADB keyboard IME/); assert.match(help, /agent-device open exp:\/\/127\.0\.0\.1:8081 --platform ios/); assert.match(help, /agent-device open "Expo Go" exp:\/\/127\.0\.0\.1:8081 --platform ios/); assert.match(help, /agent-device open exp:\/\/127\.0\.0\.1:8081 --platform android/); @@ -863,6 +866,7 @@ test('workflow help keeps common copyable command forms', () => { assert.match(help, /metro reload/); assert.match(help, /screenshot --overlay-refs/); assert.match(help, /snapshot -s @e7/); + assert.match(help, /clipboard write "some text"/); }); test('usageForCommand resolves remote help topic', () => { diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index c40ab3e1f..3c257026d 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -182,6 +182,7 @@ const AGENT_QUICKSTART_LINES = [ 'RN warning/error overlays can block taps: snapshot -i, dismiss/close, then diff snapshot -i.', 'Expo Go/dev clients need their provided exp:// or dev-client URL; do not invent app ids.', 'Text: fill \'id="field-email"\' "qa@example.com" replaces; type appends after press.', + 'Clipboard limits: iOS Allow Paste cannot be automated through XCUITest; prefill with clipboard write. Android non-ASCII should use fill/type, not raw adb input.', 'After mutation: diff snapshot -i. 
Off-screen hints: scroll, then snapshot -i.', 'Raw coordinates are fallback-only: use snapshot -i -c --json rects when iOS refs no-op or child refs are missing.', 'Batch JSON steps use "command", "positionals", "flags"; never "args" or "step".', @@ -282,6 +283,8 @@ Text entry: agent-device type "Handle with care" --delay-ms 80 Debounced field with no result selector: agent-device wait 1000. Keyboard read-only: keyboard status/get. Blocked control: keyboard dismiss. Search-as-you-type fields on iOS can drop characters when driven too fast; use --delay-ms on fill/type before trying clipboard paste. + iOS Allow Paste prompt cannot be exercised under XCUITest. To test paste-driven app behavior, prefill first with agent-device clipboard write "some text"; test the system prompt manually. + Android non-ASCII can fail on some system images. Try fill/type normally; agent-device uses safer fallbacks. If the shell reports unsupported non-ASCII input, configure a trusted ADB keyboard IME outside the command plan and restore the previous IME afterward. Read-only and waits: Read-only visible/state question: use snapshot/get/is/find. 
diff --git a/test/skillgym/README.md b/test/skillgym/README.md index 0ac741998..3e7bfe125 100644 --- a/test/skillgym/README.md +++ b/test/skillgym/README.md @@ -16,7 +16,7 @@ The included suite focuses on the first two layers so it stays stable and CI-saf - `../../examples/test-app/`: minimal Expo SDK 55 fixture app for broad UI coverage - `skillgym.config.ts`: starter config that runs Codex and Claude Haiku against this repo -- `suites/agent-device-smoke-suite.ts`: 64-case suite for skill routing, fixture-aware planning, and skill-guidance regressions +- `suites/agent-device-smoke-suite.ts`: 66-case suite for skill routing, fixture-aware planning, and skill-guidance regressions ## Current coverage diff --git a/test/skillgym/suites/agent-device-smoke-suite.ts b/test/skillgym/suites/agent-device-smoke-suite.ts index 7fc1a44d9..8beffa635 100644 --- a/test/skillgym/suites/agent-device-smoke-suite.ts +++ b/test/skillgym/suites/agent-device-smoke-suite.ts @@ -508,6 +508,37 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ ], forbiddenOutputs: [commandPattern('type'), /(?:^|\n)(?:agent-device\s+)?fill\s+\d+\s+\d+/i], }), + makeCase({ + id: 'ios-allow-paste-prefill-only', + contract: [ + 'App name: Agent Device Tester', + 'Platform: iOS simulator', + 'App reads UIPasteboard.general when opened', + 'iOS Allow Paste system prompt is suppressed under XCUITest automation', + 'Need to test app behavior when pasteboard contains: some text', + ], + task: 'Plan commands to prefill the simulator pasteboard and open the app for paste-driven behavior. 
Do not try to automate the Allow Paste system dialog.', + outputs: [commandPattern('clipboard'), /write/i, /some text/i, commandPattern('open')], + forbiddenOutputs: [ + /Allow Paste/i, + /alert (?:wait|accept|dismiss)/i, + /\bxcrun\b/i, + /\bsimctl\b/i, + ], + }), + makeCase({ + id: 'android-non-ascii-text-stays-in-fill', + contract: [ + 'Platform: Android', + 'Current screen: Checkout form tab', + 'Field selector: id="field-name"', + 'Desired value: Café ☕ 🎉', + 'Some Android system images fail with direct platform-shell text injection', + ], + task: 'Plan only the robust agent-device command to fill the field with the provided non-ASCII value.', + outputs: [commandPattern('fill'), /id=(?:["']field-name["']|field-name)/i, /Café ☕ 🎉/i], + forbiddenOutputs: [/\badb\b/i, /shell input text/i, /\bime\b/i, /ADBKeyBoard/i], + }), makeCase({ id: 'offscreen-target-scroll-resnapshot', contract: [ diff --git a/website/docs/docs/_meta.json b/website/docs/docs/_meta.json index fb6c8e07b..2bd8a8c56 100644 --- a/website/docs/docs/_meta.json +++ b/website/docs/docs/_meta.json @@ -59,11 +59,6 @@ "type": "file", "label": "Snapshots" }, - { - "name": "skillgym", - "type": "file", - "label": "Skillgym" - }, { "name": "known-limitations", "type": "file", diff --git a/website/docs/docs/skillgym.md b/website/docs/docs/skillgym.md deleted file mode 100644 index 7c8f1b413..000000000 --- a/website/docs/docs/skillgym.md +++ /dev/null @@ -1,83 +0,0 @@ -# Skillgym - -`agent-device` works well with [`skillgym`](https://github.com/callstackincubator/skillgym) when you want to benchmark help-guided command planning and workflow quality before paying the cost of full live-device runs. 
- -## What `skillgym` gives us - -- repeatable agent sessions against the real repo -- assertions on detected skills, file reads, tool calls, commands, and final output -- artifact capture and token regression snapshots - -For `agent-device`, that makes it a strong fit for: - -- verifying that agents use version-matched `agent-device help workflow` guidance instead of stale priors -- checking that planning guidance produces valid `agent-device` command shapes for a known fixture app -- keeping optional skill telemetry visible without making it a hard dependency for every runner - -## Included starter - -This repo now includes a starter setup under `test/skillgym` plus a fixture app under `examples/test-app`: - -- `examples/test-app`: a minimal Expo fixture app -- `test/skillgym/skillgym.config.ts`: starter config -- `test/skillgym/suites/agent-device-smoke-suite.ts`: CI-safe smoke suite - -## Recommended rollout - -1. Start with fixture-aware planning suites against `Agent Device Tester` to keep prompts concrete. -2. Add targeted cases when new help guidance or command surfaces are introduced. -3. Add local-only cases that expect real `agent-device` command usage when a simulator or device is available. - -## Fixture app coverage - -`Agent Device Tester` keeps the screen count low while still covering a wide range of cases: - -- visible-text verification -- interactive refs and selector targeting -- form fill and multiline notes -- search debounce and filter chips -- long-list scroll and detail drill-in -- modals, toggles, checkboxes, validation errors, and retryable async states - -The default suite now covers 48 cases in two MECE groups. 
- -Fixture smoke cases cover concrete app behavior: - -- Expo Go open/snapshot/close -- Home banner dismissal, confirmation alerts, and refresh waits -- Catalog search debounce, category filters, favorites, add-to-cart, and scroll -- Product detail navigation, quantity edits, note append, and save-to-cart -- Form validation errors, successful submit, keyboard dismiss, and reset -- Settings diagnostics error/retry, preference toggles, and reset alert handling -- Accessibility audit (screenshot vs snapshot) - -Skill-guidance regression cases cover command-planning habits: - -- read-only inspection versus mutation -- fresh `@ref` targeting, durable selectors, and off-screen scroll recovery -- text replacement, append semantics, keyboard status, and keyboard dismiss -- install/open setup, app discovery, session scoping, and in-app back navigation -- Metro reload, logs, network dump, alert fallback, and screenshot evidence -- performance metrics, React DevTools profiling, gestures, settings, and trace capture -- remote config, macOS menu bar surfaces, replay update, and batch during recording - -Runner skill telemetry is treated as optional. When a runner reports detected skills, the suite asserts that `agent-device` was selected; otherwise the suite still evaluates the final command plan. - -## Run it - -`skillgym` is installed as a repo dev dependency. 
From the repo root: - -```bash -cd /absolute/path/to/agent-device -pnpm install -pnpm test:skillgym -``` - -Equivalent direct command after building the CLI: - -```bash -pnpm build -pnpm exec skillgym run \ - ./test/skillgym/suites/agent-device-smoke-suite.ts \ - --config ./test/skillgym/skillgym.config.ts -``` From b194baf481472b60949a1b1bb82d49aa2ad99320 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Mon, 27 Apr 2026 18:26:38 -0400 Subject: [PATCH 20/22] docs: clarify bootstrap help for small models --- src/__tests__/cli-help.test.ts | 3 +++ src/utils/__tests__/args.test.ts | 6 +++++- src/utils/command-schema.ts | 7 ++++--- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/__tests__/cli-help.test.ts b/src/__tests__/cli-help.test.ts index 3c2c5ce3f..344666407 100644 --- a/src/__tests__/cli-help.test.ts +++ b/src/__tests__/cli-help.test.ts @@ -76,6 +76,9 @@ test('help workflow preserves known device workaround guidance', async () => { assert.match(result.stdout, /Never use args, step/); assert.match(result.stdout, /scrollintoview/); assert.match(result.stdout, /--delay-ms/); + assert.match(result.stdout, /Discovery is not enough when the task asks to open\/start/); + assert.match(result.stdout, /If the task says install, use install/); + assert.match(result.stdout, /do not inspect project files to find one/); assert.match(result.stdout, /iOS Allow Paste prompt cannot be exercised under XCUITest/); assert.match(result.stdout, /agent-device clipboard write "some text"/); assert.match(result.stdout, /trusted ADB keyboard IME/); diff --git a/src/utils/__tests__/args.test.ts b/src/utils/__tests__/args.test.ts index a6f55c916..dfa844ba4 100644 --- a/src/utils/__tests__/args.test.ts +++ b/src/utils/__tests__/args.test.ts @@ -791,7 +791,8 @@ test('usage includes agent workflows, config, environment, and examples footers' assert.match(usageText, /Plain snapshot reads state; snapshot -i is required/); assert.match(usageText, 
/Truncated text\/input preview: expand first with snapshot -s @e12/); assert.match(usageText, /RN warning\/error overlays can block taps: snapshot -i/); - assert.match(usageText, /Expo Go\/dev clients need their provided exp:\/\//); + assert.match(usageText, /Expo Go\/dev clients: use the provided URL when given/); + assert.match(usageText, /if only a target name is given, open that target/); assert.match(usageText, /fill 'id="field-email"' "qa@example\.com" replaces/); assert.match(usageText, /After mutation: diff snapshot -i/); assert.match(usageText, /app-owned back uses back/); @@ -841,6 +842,8 @@ test('usageForCommand resolves workflow help topic', () => { assert.match(help, /Read-only visible\/state question: use snapshot\/get\/is\/find/); assert.match(help, /Use snapshot -i only when refs are needed/); assert.match(help, /install-from-source --github-actions-artifact org\/repo:app-debug/); + assert.match(help, /Discovery is not enough when the task asks to open\/start/); + assert.match(help, /If the task says install, use install/); assert.match(help, /Do not open artifact paths or invent package ids/); assert.match(help, /agent-device get attrs @e4/); assert.match(help, /Ambiguous find: add --first or --last/); @@ -849,6 +852,7 @@ test('usageForCommand resolves workflow help topic', () => { assert.match(help, /iOS Allow Paste prompt cannot be exercised under XCUITest/); assert.match(help, /agent-device clipboard write "some text"/); assert.match(help, /trusted ADB keyboard IME/); + assert.match(help, /if no URL is provided but a target\/app name is provided, open that target/); assert.match(help, /agent-device open exp:\/\/127\.0\.0\.1:8081 --platform ios/); assert.match(help, /agent-device open "Expo Go" exp:\/\/127\.0\.0\.1:8081 --platform ios/); assert.match(help, /agent-device open exp:\/\/127\.0\.0\.1:8081 --platform android/); diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index 3c257026d..195a53187 100644 --- 
a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -180,7 +180,7 @@ const AGENT_QUICKSTART_LINES = [ 'Read-only visible/state question: use snapshot/get/is/find; use snapshot -i only when refs are needed.', 'Truncated text/input preview: expand first with snapshot -s @e12, not get text.', 'RN warning/error overlays can block taps: snapshot -i, dismiss/close, then diff snapshot -i.', - 'Expo Go/dev clients need their provided exp:// or dev-client URL; do not invent app ids.', + 'Expo Go/dev clients: use the provided URL when given; if only a target name is given, open that target and do not search project files for a URL.', 'Text: fill \'id="field-email"\' "qa@example.com" replaces; type appends after press.', 'Clipboard limits: iOS Allow Paste cannot be automated through XCUITest; prefill with clipboard write. Android non-ASCII should use fill/type, not raw adb input.', 'After mutation: diff snapshot -i. Off-screen hints: scroll, then snapshot -i.', @@ -252,7 +252,8 @@ Bootstrap: agent-device reinstall com.example.app ./build/MyApp.app --platform ios agent-device install-from-source --github-actions-artifact org/repo:app-debug --platform android agent-device open com.example.app --platform android --relaunch - If app id is unknown, plan devices, apps, then open . Install arguments are app/package id then artifact path. After install, install-from-source, or reinstall, open the installed id with --relaunch for fresh runtime state. + If app id is unknown, plan devices, apps, then open . Discovery is not enough when the task asks to open/start the app. + Install arguments are app/package id then artifact path. If the task says install, use install; use reinstall only when explicitly requested. Fresh runtime state is open --relaunch after install. Do not open artifact paths or invent package ids. If apps lookup misses the target and no URL/artifact is provided, ask or stop. 
Snapshots and refs: @@ -331,7 +332,7 @@ React Native dev loop: agent-device find "Home" Do not use agent-device reload. Use open --relaunch for native startup reset. Warning/error overlays can obscure UI and intercept taps. If snapshot -i shows one, dismiss/close its visible control (for example Dismiss or Close) if it is not the task target, then diff snapshot -i or snapshot -i before tapping the real UI. - Expo Go is a host shell; use the provided project URL instead of inventing a bundle id. iOS simulators can open the URL directly; use host + URL when targeting a specific host shell: + Expo Go is a host shell. Use a provided project URL instead of inventing a bundle id; if no URL is provided but a target/app name is provided, open that target and do not inspect project files to find one. iOS simulators can open a URL directly; use host + URL when targeting a specific host shell: agent-device open exp://127.0.0.1:8081 --platform ios agent-device open "Expo Go" exp://127.0.0.1:8081 --platform ios Android uses the URL target directly; do not write open there: From 1ee3125ec76abcd1d94cb160e27c7579fc27a273 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Mon, 27 Apr 2026 20:02:44 -0400 Subject: [PATCH 21/22] docs: tighten debug workflow guidance --- src/__tests__/cli-help.test.ts | 2 + src/utils/__tests__/args.test.ts | 44 +++++++++++-------- src/utils/command-schema.ts | 9 ++-- .../suites/agent-device-smoke-suite.ts | 2 +- 4 files changed, 35 insertions(+), 22 deletions(-) diff --git a/src/__tests__/cli-help.test.ts b/src/__tests__/cli-help.test.ts index 344666407..ecd0b11f3 100644 --- a/src/__tests__/cli-help.test.ts +++ b/src/__tests__/cli-help.test.ts @@ -79,6 +79,8 @@ test('help workflow preserves known device workaround guidance', async () => { assert.match(result.stdout, /Discovery is not enough when the task asks to open\/start/); assert.match(result.stdout, /If the task says install, use install/); assert.match(result.stdout, 
/do not inspect project files to find one/); + assert.match(result.stdout, /do not split clear\/restart/); + assert.match(result.stdout, /do not write network log headers/); assert.match(result.stdout, /iOS Allow Paste prompt cannot be exercised under XCUITest/); assert.match(result.stdout, /agent-device clipboard write "some text"/); assert.match(result.stdout, /trusted ADB keyboard IME/); diff --git a/src/utils/__tests__/args.test.ts b/src/utils/__tests__/args.test.ts index dfa844ba4..968e8d55b 100644 --- a/src/utils/__tests__/args.test.ts +++ b/src/utils/__tests__/args.test.ts @@ -763,24 +763,28 @@ test('usage includes concise top-level commands', () => { test('usage includes only global flags in the top-level flags section', () => { const usageText = usage(); - assert.match(usageText, /--target mobile\|tv/); - assert.match(usageText, /--ios-simulator-device-set /); - assert.match(usageText, /--android-device-allowlist /); - assert.match(usageText, /--state-dir /); - assert.match(usageText, /--daemon-transport auto\|socket\|http/); - assert.match(usageText, /--daemon-server-mode socket\|http\|dual/); - assert.match(usageText, /--tenant /); - assert.match(usageText, /--session-isolation none\|tenant/); - assert.match(usageText, /--run-id /); - assert.match(usageText, /--lease-id /); - assert.match(usageText, /--lease-backend ios-simulator\|ios-instance\|android-instance/); - assert.doesNotMatch(usageText, /--relaunch/); - assert.doesNotMatch(usageText, /--header /); - assert.doesNotMatch(usageText, /--restart/); - assert.doesNotMatch(usageText, /--fps /); - assert.doesNotMatch(usageText, /--quality <5-10>/); - assert.doesNotMatch(usageText, /--save-script \[path\]/); - assert.doesNotMatch(usageText, /--metadata/); + const flagsSection = usageText.slice( + usageText.indexOf('Flags:'), + usageText.indexOf('Agent Quickstart:'), + ); + assert.match(flagsSection, /--target mobile\|tv/); + assert.match(flagsSection, /--ios-simulator-device-set /); + 
assert.match(flagsSection, /--android-device-allowlist /); + assert.match(flagsSection, /--state-dir /); + assert.match(flagsSection, /--daemon-transport auto\|socket\|http/); + assert.match(flagsSection, /--daemon-server-mode socket\|http\|dual/); + assert.match(flagsSection, /--tenant /); + assert.match(flagsSection, /--session-isolation none\|tenant/); + assert.match(flagsSection, /--run-id /); + assert.match(flagsSection, /--lease-id /); + assert.match(flagsSection, /--lease-backend ios-simulator\|ios-instance\|android-instance/); + assert.doesNotMatch(flagsSection, /--relaunch/); + assert.doesNotMatch(flagsSection, /--header /); + assert.doesNotMatch(flagsSection, /--restart/); + assert.doesNotMatch(flagsSection, /--fps /); + assert.doesNotMatch(flagsSection, /--quality <5-10>/); + assert.doesNotMatch(flagsSection, /--save-script \[path\]/); + assert.doesNotMatch(flagsSection, /--metadata/); }); test('usage includes agent workflows, config, environment, and examples footers', () => { @@ -793,9 +797,11 @@ test('usage includes agent workflows, config, environment, and examples footers' assert.match(usageText, /RN warning\/error overlays can block taps: snapshot -i/); assert.match(usageText, /Expo Go\/dev clients: use the provided URL when given/); assert.match(usageText, /if only a target name is given, open that target/); + assert.match(usageText, /Install flows: install\/install-from-source first/); assert.match(usageText, /fill 'id="field-email"' "qa@example\.com" replaces/); assert.match(usageText, /After mutation: diff snapshot -i/); assert.match(usageText, /app-owned back uses back/); + assert.match(usageText, /logs clear --restart\/mark\/path/); assert.match(usageText, /trace start \.\/path; trace stop \.\/path/); assert.match(usageText, /network dump --include headers/); assert.match(usageText, /Full operating guide: agent-device help workflow/); @@ -853,6 +859,8 @@ test('usageForCommand resolves workflow help topic', () => { assert.match(help, 
/agent-device clipboard write "some text"/); assert.match(help, /trusted ADB keyboard IME/); assert.match(help, /if no URL is provided but a target\/app name is provided, open that target/); + assert.match(help, /do not split clear\/restart/); + assert.match(help, /do not write network log headers/); assert.match(help, /agent-device open exp:\/\/127\.0\.0\.1:8081 --platform ios/); assert.match(help, /agent-device open "Expo Go" exp:\/\/127\.0\.0\.1:8081 --platform ios/); assert.match(help, /agent-device open exp:\/\/127\.0\.0\.1:8081 --platform android/); diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index 195a53187..8acf47c65 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -181,6 +181,7 @@ const AGENT_QUICKSTART_LINES = [ 'Truncated text/input preview: expand first with snapshot -s @e12, not get text.', 'RN warning/error overlays can block taps: snapshot -i, dismiss/close, then diff snapshot -i.', 'Expo Go/dev clients: use the provided URL when given; if only a target name is given, open that target and do not search project files for a URL.', + 'Install flows: install/install-from-source first, then open the installed id with --relaunch.', 'Text: fill \'id="field-email"\' "qa@example.com" replaces; type appends after press.', 'Clipboard limits: iOS Allow Paste cannot be automated through XCUITest; prefill with clipboard write. Android non-ASCII should use fill/type, not raw adb input.', 'After mutation: diff snapshot -i. 
Off-screen hints: scroll, then snapshot -i.', @@ -188,7 +189,7 @@ const AGENT_QUICKSTART_LINES = [ 'Batch JSON steps use "command", "positionals", "flags"; never "args" or "step".', 'Navigation: app-owned back uses back; system back uses back --system.', 'Verification commands must name the expected text/selector; bare screenshots/snapshots are not enough.', - 'Debug evidence: logs clear/mark/path; trace start ./path; trace stop ./path; network dump --include headers.', + 'Debug evidence: logs clear --restart/mark/path; trace start ./path; trace stop ./path; network dump --include headers.', 'Use agent-device commands in final plans; raw platform tools, pseudo commands, and helper prose are wrong.', 'Full operating guide: agent-device help workflow. Exploratory QA: agent-device help dogfood.', ] as const; @@ -322,7 +323,8 @@ Validation and evidence: agent-device batch --steps '[{"command":"open","positionals":["settings"],"flags":{}},{"command":"wait","positionals":["100"],"flags":{}}]' Batch step keys are command, positionals, flags, and optional runtime. Never use args, step, text, or target as batch step fields. Android animations: settings animations off/on, not animations disable/restore. - Network headers: network dump --include headers. + Debug logs: logs clear --restart, logs mark, reproduce, then logs path; do not split clear/restart into separate stop/start commands. + Network headers: network dump --include headers; do not write network log headers. Remote config: connect --remote-config ./remote-config.json, open, snapshot, disconnect. macOS menu bar: open ... --platform macos --surface menubar; snapshot -i --platform macos --surface menubar. @@ -370,13 +372,14 @@ Logs: agent-device press 'id="load-diagnostics"' agent-device logs path Do not cat a full stale log into agent context. Open or grep only the relevant window when needed. 
+ logs clear --restart is the compact command to clear old logs and start a fresh capture; do not split it into logs stop, logs clear, logs start. Network: Use network dump for recent session HTTP traffic parsed from app logs. agent-device network dump --include headers agent-device network dump 20 --include all Use this instead of logs path when the question is request/response metadata. - network log is a supported alias, but network dump --include headers is the clearest plan form. + network log is a supported alias, but network dump --include headers is the clearest plan form. Do not write network log headers. Alerts: Native alerts: diff --git a/test/skillgym/suites/agent-device-smoke-suite.ts b/test/skillgym/suites/agent-device-smoke-suite.ts index 8beffa635..a67b044ba 100644 --- a/test/skillgym/suites/agent-device-smoke-suite.ts +++ b/test/skillgym/suites/agent-device-smoke-suite.ts @@ -620,7 +620,7 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ 'No app session is open yet', 'Session name: discovery', ], - task: 'Plan the bootstrap commands to discover the correct Android device and app identifier before opening the app in the named session.', + task: 'Plan the bootstrap commands to discover the correct Android device and app identifier, then open the discovered app in the named session.', outputs: [ commandPattern('devices'), commandPattern('apps'), From 21c037e1b37f8d14234ee8bc658e4700277eef0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Mon, 27 Apr 2026 21:24:59 -0400 Subject: [PATCH 22/22] docs: remove local test note from workflow help --- src/utils/command-schema.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/command-schema.ts b/src/utils/command-schema.ts index 8acf47c65..e5d849391 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -235,7 +235,7 @@ Core loop: devices/apps -> open -> snapshot or snapshot -i -> get/is/find/wait or press/fill/scroll/back -> verify 
-> close Command shape: - Reading help through node bin/agent-device.mjs help in local tests is fine; final command plans should use agent-device, not node bin/agent-device.mjs, pnpm ad, raw platform tools, or helper prose. + Plans should use agent-device commands, not raw platform tools, pseudo commands, package-manager aliases, or helper prose. Put subcommand first, then positionals, then flags: agent-device open com.example.app --session checkout --platform android --relaunch agent-device record start ./checkout.mp4 --session checkout