diff --git a/README.md b/README.md index 6acac2cc7..31cc4513e 100644 --- a/README.md +++ b/README.md @@ -29,12 +29,26 @@ If you know Vercel's [agent-browser](https://github.com/vercel-labs/agent-browse ## Quick Start -Install the CLI. +Install the CLI first: ```bash npm install -g agent-device +agent-device --version +agent-device help workflow ``` +The CLI help is the source of truth for agents and is shipped with the installed version. Skills are optional but recommended when your agent runtime supports them: they auto-route device, React DevTools, and dogfood tasks to the right `agent-device help <topic>` page and verify the CLI is new enough before acting. + +If you install skills separately, keep the CLI on `agent-device >= 0.13.4`. Older CLIs do not include the workflow help topics that the router skills expect. + +```bash +npm install -g agent-device@latest +agent-device --version +agent-device help +``` + +`agent-device` performs a lightweight background upgrade check for interactive CLI runs and, when a newer package is available, suggests a global reinstall command. Updating the package also refreshes the bundled `skills/` shipped with the CLI. + Prerequisites: Node.js 22+, Xcode for iOS/tvOS/macOS targets, Android SDK + ADB for Android, and macOS Accessibility permission for desktop automation. See [Installation](https://incubator.callstack.com/agent-device/docs/installation). Try the loop. 
diff --git a/package.json b/package.json index cbb557cb2..5cf221241 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "agent-device", - "version": "0.13.3", + "version": "0.13.4", "description": "Agent-driven CLI for mobile UI automation, network inspection, and performance diagnostics across iOS, Android, tvOS, and macOS.", "license": "MIT", "author": "Callstack", @@ -100,7 +100,7 @@ "test-app:typecheck": "pnpm --dir examples/test-app typecheck", "test": "vitest run", "test:unit": "vitest run", - "test:skillgym": "skillgym run ./test/skillgym/suites/agent-device-smoke-suite.ts --config ./test/skillgym/skillgym.config.ts", + "test:skillgym": "pnpm build && skillgym run ./test/skillgym/suites/agent-device-smoke-suite.ts --config ./test/skillgym/skillgym.config.ts", "test:smoke": "node --test test/integration/smoke-*.test.ts", "test:integration": "node --test test/integration/*.test.ts", "test:replay:ios": "node --experimental-strip-types src/bin.ts test test/integration/replays/ios/simulator", diff --git a/skills/agent-device/SKILL.md b/skills/agent-device/SKILL.md index bb228c277..8172a2bac 100644 --- a/skills/agent-device/SKILL.md +++ b/skills/agent-device/SKILL.md @@ -1,76 +1,34 @@ --- name: agent-device -description: Automates interactions for Apple-platform apps (iOS, tvOS, macOS) and Android devices. Use when navigating apps, taking snapshots/screenshots, tapping, typing, scrolling, extracting UI info, or collecting logs, network inspection, and perf snapshots across mobile, TV, and desktop targets. +description: Automates Apple-platform apps (iOS, tvOS, macOS) and Android devices. Use when navigating apps, taking snapshots/screenshots, tapping, typing, scrolling, extracting UI info, collecting logs/network/perf evidence, or planning agent-device CLI commands. --- # agent-device -Use this skill as a router with mandatory defaults. Read this file first. 
For normal device tasks, always load `references/bootstrap-install.md` and `references/exploration.md` before acting. Use bootstrap to confirm or establish deterministic setup. Use exploration for UI inspection, interaction, and verification once the app session is open. +Router only. Private setup before using this skill: -## Default operating rules +```bash +agent-device --version +``` -- Start conservative. Prefer read-only inspection before mutating the UI. -- Start deterministic. If the app name, package, device, or session is uncertain, load bootstrap and discover them before interacting. -- Use plain `snapshot` when the task is to verify what text or structure is currently visible on screen. -- Use `snapshot -i` only when you need interactive refs such as `@e3` for a requested action or targeted query. On iOS and Android, default snapshot output uses the same visible-first model: off-screen interactive content is exposed as discovery hints, not tappable refs. -- Prefer `diff snapshot` after a nearby mutation when you only need to know what changed. -- Avoid speculative mutations. You may take the smallest reversible UI action needed to unblock inspection or complete the requested task, such as dismissing a popup, closing an alert, or clearing an unintended surface. -- In React Native dev or debug builds, check early for visible warning or error overlays, tooltips, and toasts that can steal focus or intercept taps. If they are not part of the requested behavior, dismiss them and continue. If you saw them, report them in the final summary. -- In Metro-backed React Native dev loops, use `agent-device metro reload` for a JS app reload before falling back to `open --relaunch`. It mirrors pressing `r` in the Metro terminal and preserves the native app process. -- Do not browse the web or use external sources unless the user explicitly asks. -- Re-snapshot after meaningful UI changes instead of reusing stale refs. 
-- Treat refs in default snapshot output as actionable-now, not durable identities. If a target appears only in an off-screen summary, use `scroll ` and re-snapshot until the target is visible. -- Prefer `@ref` or selector targeting over raw coordinates. -- Ensure the correct target is pinned and an app session is open before interacting. -- Keep the loop short: `open` -> inspect/act -> verify if needed -> `close`. +Require `agent-device >= 0.13.4`; older CLIs lack these help topics. If older, run `npm install -g agent-device@latest`, recheck, then continue. If you cannot upgrade, stop and tell the user. Do not include version/upgrade commands in final plans. -## Default flow +Before your first agent-device command or plan, read the version-matched CLI guide: -1. Load [references/bootstrap-install.md](references/bootstrap-install.md) and [references/exploration.md](references/exploration.md) before acting on a normal device task. -2. Use bootstrap first to confirm or establish the correct target, app install, and open app session. -3. Once the app session is open and stable, use exploration for inspection, interaction, and verification. -4. Start with plain `snapshot` if the goal is to read or verify what is visible. -5. Escalate to `snapshot -i` only if you need refs for interactive exploration or a requested action. -6. Use `get`, `is`, or `find` before mutating the UI when a read-only command can answer the question. -7. End by capturing proof if needed, then `close`. +```bash +agent-device help workflow +``` -## QA modes +Escalate only when relevant: -- Open-ended bug hunt with reporting: use [../dogfood/SKILL.md](../dogfood/SKILL.md). -- Pass/fail QA from acceptance criteria: stay in this skill, start with [references/bootstrap-install.md](references/bootstrap-install.md), then use the QA loop in [references/exploration.md](references/exploration.md). 
+```bash +agent-device help debugging +agent-device help react-devtools +agent-device help remote +agent-device help macos +agent-device help dogfood +``` -## Required references +Default loop: `open -> snapshot/-i -> get/is/find or press/fill/scroll/wait -> verify -> close`. -- For every normal device task, after reading this file, load [references/bootstrap-install.md](references/bootstrap-install.md) first, then [references/exploration.md](references/exploration.md), before acting. -- Use bootstrap to confirm or establish deterministic setup, especially in sandbox or cloud environments. -- Use exploration once the app session is open and stable. -- Load additional references only when their scope is needed. - -## Decision rules - -- Use plain `snapshot` when you need to verify whether text is visible. -- Use `snapshot -i` mainly for interactive exploration and choosing refs. -- Use `diff snapshot` for compact post-action verification; use `snapshot --diff` when that alias is easier to discover from snapshot help. -- Use `get`, `is`, or `find` when they can answer the question without changing UI state. -- Use `fill` to replace text. -- Use `type` to append text. -- Do not write `type @eN "text"`. Use `fill @eN "text"` to target a field directly, or `press @eN` then `type "text"` when the field already has focus and you want append semantics. -- If the on-screen keyboard blocks the next step, prefer `keyboard dismiss` over navigation. On iOS, keep an app session open first; `keyboard status|get` remains Android-only. -- When a task asks to "go back", use plain `back` for predictable app-owned navigation and reserve `back --system` for platform back gestures or button semantics. -- Use `type --delay-ms` or `fill --delay-ms` for debounced search fields that drop characters when typed too quickly. -- If there is no simulator, no app install, or no open app session yet, switch to `bootstrap-install.md` instead of improvising setup steps. 
-- Use the smallest unblock action first when transient UI blocks inspection, but do not navigate, search, or enter new text just to make the UI reveal data unless the user asked for that interaction. -- In React Native dev or debug apps, treat visible warning or error overlays as transient blockers unless the user is explicitly asking you to diagnose them. Dismiss them when safe, then continue the requested flow. -- For React Native code changes where the app is already connected to Metro, prefer `agent-device metro reload`, then wait and re-snapshot. Use `open --relaunch` only when Metro reload does not reconnect or native startup state must reset. -- Do not use external lookups to compensate for missing on-screen data unless the user asked for them. -- If the needed information is not exposed on screen, say that plainly instead of compensating with extra navigation, text entry, or web search. -- Prefer `@ref` or selector targeting over raw coordinates. - -## Additional references - -- Need logs, network, alerts, permissions, or failure triage: [references/debugging.md](references/debugging.md) -- Need screenshots, diff, recording, replay maintenance, or perf data: [references/verification.md](references/verification.md) -- Need desktop surfaces, menu bar behavior, or macOS-specific interaction rules: [references/macos-desktop.md](references/macos-desktop.md) -- Need remote HTTP transport, `connect --remote-config`, or tenant leases on a remote macOS host: [references/remote-tenancy.md](references/remote-tenancy.md) - This includes remote React Native runs where `agent-device` now prepares Metro locally and manages the local Metro companion tunnel automatically. -- Need the React Native component tree, props, state, hooks, or render profiling: use `agent-device react-devtools ...` and the [react-devtools skill](../react-devtools/SKILL.md). 
+Keep refs current, prefer selectors/refs over coordinates, use `fill` to replace text, and use `back` for app-owned navigation. Let `help workflow` provide the exact command shapes. diff --git a/skills/agent-device/references/bootstrap-install.md b/skills/agent-device/references/bootstrap-install.md deleted file mode 100644 index a6b5a0f09..000000000 --- a/skills/agent-device/references/bootstrap-install.md +++ /dev/null @@ -1,244 +0,0 @@ -# Bootstrap and Install - -## When to open this file - -Open this file when you still need to choose the right target, start the right session, install or relaunch the app, or pin automation to one device before interacting. This is the deterministic setup layer for sandbox, cloud, or other environments where install paths, device state, or app readiness may be uncertain. - -## Open-first path - -- `devices` -- `apps` -- `ensure-simulator` -- `open` -- `session list` - -Use this exact order when you are not sure about the installed app identifier. On Android dev builds in particular, `apps` is cheaper than guessing package suffixes and retrying failed `open` calls. - -## Install path - -- `install` or `reinstall` -- `install-from-source` when the artifact already exists at a URL the daemon can reach -- `install-from-source --github-actions-artifact` when a compatible remote daemon should resolve a GitHub Actions artifact - -## Most common mistake to avoid - -Do not start acting before you have pinned the correct target and opened an `app` session. In mixed-device environments, always pass `--device`, `--udid`, or `--serial`. - -## Deterministic setup rule - -If there is no simulator, no app install, no open app session, or any uncertainty about where the app should come from, stay in this file and use deterministic setup commands or bootstrap scripts first. Do not improvise install paths or app-launch flows while exploring. 
- -After setup is confirmed or completed, move to `exploration.md` before doing UI inspection or interaction. - -## Open-first rule - -- If the user asks to test an app and does not provide an install artifact or explicit install instruction, try `open ` first. -- If `open ` fails, run `agent-device apps` and retry with a discovered app name before considering install steps. -- Do not install or reinstall on the first attempt unless the user explicitly asks for installation or provides a concrete artifact path or URL. -- When installation is required from a known location, prefer a checked-in shell script or other deterministic bootstrap command over ad hoc path guessing. -- Use `apps --platform ` together with `--device`, `--udid`, or `--serial` when target selection matters. -- Once you have the correct app name, retry `open` with that exact discovered value. - -## Common starting points - -These are examples, not required exact sequences. Use the smallest setup flow that matches the task. 
- -### Boot a simulator and open an app - -```bash -agent-device ensure-simulator --platform ios --device "iPhone 17 Pro" --boot -agent-device open MyApp --platform ios --device "iPhone 17 Pro" --relaunch -``` - -### Install an app artifact - -```bash -agent-device install com.example.app ./build/app.apk --platform android --serial emulator-5554 -``` - -```bash -agent-device install com.example.app ./build/MyApp.app --platform ios --device "iPhone 17 Pro" -``` - -```bash -ARTIFACT_URL="" -agent-device install-from-source "$ARTIFACT_URL" --platform android -``` - -Daemon-resolved GitHub Actions artifacts: - -```bash -agent-device install-from-source \ - --github-actions-artifact ORG/REPO:1234567890 \ - --platform android -``` - -Project config can provide an artifact name instead: - -```json -{ - "platform": "android", - "installSource": { - "type": "github-actions-artifact", - "repo": "ORG/REPO", - "artifact": "app-debug" - } -} -``` - -## Install guidance - -- Use `install ` when the app may already be installed and you do not need a fresh-state reset. -- Use `reinstall ` when you explicitly need uninstall plus install as one deterministic step. -- Use `install-from-source ` only when an existing artifact URL is trusted, operator-approved, and reachable by the daemon. -- Use `--github-actions-artifact /:` when a compatible remote daemon should resolve a GitHub Actions artifact. Numeric artifacts are IDs; non-numeric artifacts are names. -- Local `.apk`, `.aab`, `.app`, and `.ipa` paths go through `install` or `reinstall`; existing reachable URLs go through `install-from-source`. -- Do not download, re-zip, publish temporary GitHub releases, or move CI artifacts elsewhere just to make an install command work. -- Keep install and open as separate phases. Do not turn them into one default command flow. -- Supported binary formats: - - Android: `.apk` and `.aab` - - iOS: `.app` and `.ipa` -- Android URL sources can be direct `.apk` or `.aab` files. 
-- Trusted artifact service URLs may point at archive-backed downloads that contain one installable artifact. Prefer `--github-actions-artifact` for GitHub Actions artifacts that a compatible remote daemon can resolve with its own credentials. -- If a trusted artifact archive contains multiple installables, stop and ask for the intended artifact instead of guessing. -- `.aab` still requires `bundletool` in `PATH`, or `AGENT_DEVICE_BUNDLETOOL_JAR=` with `java` in `PATH`, when the daemon installs the materialized artifact. -- For `.ipa` archives with multiple app bundles, `` is the bundle id or bundle name selection hint. -- After install or reinstall, later use `open ` with the exact discovered or known package/bundle identifier, not the artifact path. - -## Choose the right starting point - -- iOS local QA: prefer simulators unless the task explicitly requires physical hardware. -- iOS in mixed simulator and device environments: run `ensure-simulator` first, then keep using `--device` or `--udid`. -- TV targets: use `--target tv` together with `--platform` when the task is for tvOS or Android TV rather than phone or tablet surfaces. -- Android binary flow: use `install` or `reinstall` for `.apk` or `.aab`, then open by installed package name. -- macOS desktop app flow: use `open --platform macos`. Only load [macos-desktop.md](macos-desktop.md) if a desktop surface or macOS-specific behavior matters. - -TV example: - -```bash -agent-device open MyTvApp --platform ios --target tv -agent-device open com.example.androidtv --platform android --target tv -``` - -## Session rules - -- Use `--session ` when you need a named session: - -```bash -agent-device --session auth open Settings --platform ios -agent-device --session auth snapshot -i -``` - -- Use `open ` before interactions. -- Use `close` when done. Add `--shutdown` when you want simulators or emulators torn down with the session. -- Use semantic session names when you need multiple concurrent runs. 
-- Use `--save-script=` on `close` when you want to keep a replay script. -- For dev loops where state can linger, prefer `open --relaunch`. -- For Metro-backed React Native JS changes with the app already running, prefer `metro reload` instead of `open --relaunch`; it asks Metro to reload connected apps without restarting the native process. -- In iOS sessions, use `open ` for the app itself. Use `open ` for deep links, and `open ` when you need to launch the app and deep link in one step. -- On iOS, `appstate` is session-scoped and requires the matching active session on the target device. - -## After a session is established - -Once you have opened the correct session on the correct target, default to the conservative rule: keep the session binding on follow-up commands, and stop repeating device-routing flags unless you are intentionally retargeting. - -- Prefer `--session ` on follow-up commands, or use sandboxed `AGENT_DEVICE_SESSION`. -- Do not keep repeating `--platform`, `--target`, `--device`, `--udid`, `--serial`, or similar target-selection flags on normal follow-up commands. -- Only omit follow-up session flags when the environment explicitly guarantees isolation. - -Good shared-host pattern: - -```bash -agent-device --session auth open Settings --platform ios --device "iPhone 17 Pro" -agent-device --session auth snapshot -i -agent-device --session auth press @e3 -agent-device --session auth close -``` - -Bad shared-host pattern: - -```bash -agent-device --session auth open Settings --platform ios --device "iPhone 17 Pro" -agent-device --session auth snapshot -i --platform ios --device "iPhone 17 Pro" -``` - -Use target-selection flags again only when you are choosing the target before opening a session, or when you explicitly mean to retarget. - -## Session-bound automation - -Use this when an orchestrator must keep plain CLI calls on one session and device. 
- -```bash -export AGENT_DEVICE_SESSION=qa-ios -export AGENT_DEVICE_PLATFORM=ios -export AGENT_DEVICE_SESSION_LOCK=strip - -agent-device open MyApp --relaunch -``` - -- `AGENT_DEVICE_SESSION` plus `AGENT_DEVICE_PLATFORM` provides the default binding. -- `--session-lock reject|strip` controls whether conflicting per-call routing flags fail or are ignored. -- Conflicts include explicit retargeting flags such as `--platform`, `--target`, `--device`, `--udid`, `--serial`, `--ios-simulator-device-set`, and `--android-device-allowlist`. -- Lock policy applies to nested `batch` steps too. -- Compatibility aliases remain supported: `--session-locked`, `--session-lock-conflicts`, `AGENT_DEVICE_SESSION_LOCKED`, and `AGENT_DEVICE_SESSION_LOCK_CONFLICTS`. - -Android emulator variant: - -```bash -export AGENT_DEVICE_SESSION=qa-android -export AGENT_DEVICE_PLATFORM=android - -agent-device --session-lock reject open com.example.myapp --relaunch -``` - -## Scoped discovery - -Use scoped discovery when one run must not see host-global device lists. - -```bash -agent-device devices --platform ios --ios-simulator-device-set /tmp/tenant-a/simulators -agent-device devices --platform android --android-device-allowlist emulator-5554,device-1234 -``` - -- Scope is applied before `--device`, `--udid`, and `--serial`. -- Out-of-scope selectors fail with `DEVICE_NOT_FOUND`. -- With iOS simulator-set scope enabled, iOS physical devices are not enumerated. -- If the scoped iOS simulator set is empty, the error should point at the set path and suggest creating a simulator in that set. -- Environment equivalents: - - `AGENT_DEVICE_IOS_SIMULATOR_DEVICE_SET` - - `AGENT_DEVICE_ANDROID_DEVICE_ALLOWLIST` - -## Session inspection and replay - -```bash -agent-device session list -agent-device replay ./session.ad --session auth -agent-device replay -u ./session.ad --session auth -``` - -- iOS session entries include `device_udid` and `ios_simulator_device_set`. 
Use them to confirm routing in concurrent runs. -- Prefer selector-based actions and assertions in saved replay scripts. -- Tenant isolation namespaces sessions as `:` during tenant-scoped runs. - -## When to leave this file - -- Once the correct target and session are pinned, move to [exploration.md](exploration.md). -- If opening, startup, permissions, or logs become the blocker, switch to [debugging.md](debugging.md). - -## Install examples - -```bash -agent-device reinstall MyApp /path/to/app-debug.apk --platform android --serial emulator-5554 -``` - -```bash -agent-device install com.example.app ./build/MyApp.ipa --platform ios --device "iPhone 17 Pro" -``` - -Do not use `open --relaunch` on Android. - -## Security and trust notes - -- Treat signing, provisioning, and daemon auth values as host secrets. Do not paste them into shared logs or commit them to source control. -- Prefer Xcode Automatic Signing over manual overrides when a physical iOS device is involved. -- Keep persistent host-specific defaults in environment variables rather than checked-in project config. diff --git a/skills/agent-device/references/coordinate-system.md b/skills/agent-device/references/coordinate-system.md deleted file mode 100644 index 03b8f2ef0..000000000 --- a/skills/agent-device/references/coordinate-system.md +++ /dev/null @@ -1,28 +0,0 @@ -# Coordinate System - -## When to open this file - -Open this file only when you must use raw coordinates instead of selectors or `@ref` targeting. - -## Main commands to reach for first - -- `screenshot` -- coordinate-based `click` or `swipe` - -## Most common mistake to avoid - -Do not assume coordinates mean the same thing across platforms or runs. Prefer selectors and refs first. - -## Canonical loop - -```bash -agent-device screenshot /tmp/current-screen.png -agent-device click 120 240 -``` - -## Rules - -- Origin is the top-left of the device screen. -- iOS uses device points. -- Android uses pixels. 
-- Use screenshots to reason about coordinates before acting. diff --git a/skills/agent-device/references/debugging.md b/skills/agent-device/references/debugging.md deleted file mode 100644 index adf1ba898..000000000 --- a/skills/agent-device/references/debugging.md +++ /dev/null @@ -1,138 +0,0 @@ -# Debugging - -## When to open this file - -Open this file when the task turns into failure triage, logs, network inspection, permission prompts, setup trouble, or unstable session behavior. - -If the debugging task needs the React Native component tree, props, state, hooks, or render profiling, use `agent-device react-devtools ...` and the `skills/react-devtools` workflow instead of trying to infer those internals from the accessibility tree or app logs alone. - -## Main commands to reach for first - -- `logs clear --restart` -- `network dump` -- `logs path` -- `logs doctor` -- `alert wait` -- `alert accept` or `alert dismiss` - -## Most common mistake to avoid - -Do not leave logging on for normal flows or dump full log files into context. Keep debug windows short and inspect logs with `grep` or `tail`. - -In React Native dev or debug builds, do not dismiss visible warning or error overlays without remembering to report them later. If you close one to keep the flow moving, keep at least a screenshot or a short marked log window so the summary can name it. - -## Canonical loop - -```bash -agent-device open MyApp --platform ios -agent-device logs clear --restart -agent-device network dump 25 -agent-device logs path -agent-device close -``` - -## Log and network flow - -Logging is off by default. Enable it only when you need a debugging window. - -- Default app logs live under `~/.agent-device/sessions//app.log`. -- `logs clear --restart` is the fastest clean repro loop. -- `network dump [limit] [summary|headers|body|all]` parses recent HTTP(s) entries from the same session app log. 
-- On macOS, `network dump` is app-scoped and only sees Unified Logging associated with the active session app. -- On iOS simulators, `network dump` can recover recent app log history with `simctl log show` when the live session stream is sparse, so check the returned notes before assuming the repro window was empty. -- On iOS, `network dump` is still limited to what Unified Logging exposes for the app process. If the app does not emit request metadata there, `network dump` can legitimately return no HTTP entries even during a real repro. -- Summary output already shows timestamp, status, and duration when the log backend exposes them. -- Prefer the explicit flag form `network dump 25 --include headers|body|all` when you need more than the default summary view. -- If iOS simulator notes say app logs were recovered but none looked like HTTP traffic, treat that as an app instrumentation gap rather than a missing repro and inspect `logs path` for the non-network diagnostics that were captured. -- `logs doctor` checks backend and runtime readiness for the current session and device. -- `logs mark "before tap"` inserts a timestamped marker into the app log. -- Android `network dump` surfaces timestamps from logcat-style prefixes and can backfill status and request/response duration from adjacent GIBSDK packet lines, so check it before dumping raw log windows. -- Android app-log streaming rebinds to the current app PID after relaunches, so rerun the repro window before assuming the last log slice is stale. -- Marker lines are emitted with the `[agent-device][mark][...]` prefix. When you grep later, prefer a narrow pattern such as `grep -n -E "agent-device.*mark|before tap" `. -- Session app logs can contain runtime data, headers, or payload fragments. Review them before sharing. -- `logs start` requires an active app session and appends to `app.log`. -- `logs stop` stops streaming. `close` also stops logging. 
-- `logs clear` truncates `app.log` and removes rotated `app.log.N` files, and requires logging to be stopped first. -- `logs path` returns the log path plus metadata about the active backend and file state. -- `network log` is an alias for `network dump`. - -Operational limits: - -- `app.log` rotates to `app.log.1` after 5 MB by default. -- `network dump` scans the last 4000 app-log lines, returns up to 200 entries, and truncates header or payload fields at 2048 characters. -- Retention knobs: - - `AGENT_DEVICE_APP_LOG_MAX_BYTES` - - `AGENT_DEVICE_APP_LOG_MAX_FILES` -- Redaction hook: - - `AGENT_DEVICE_APP_LOG_REDACT_PATTERNS` - -Useful shell follow-up after `logs path`: - -```bash -grep -n -E "Error|Exception|Fatal|crash" -grep -n -E "agent-device.*mark|before tap" -tail -50 -``` - -If the app showed a visible warning or error overlay during the flow: - -- Prefer a narrow grep window around your `logs mark` lines instead of loading the whole file. -- Mention the surfaced warning or error in the final summary even if it did not block completion. -- If the overlay kept returning, call that out as a stability issue instead of treating it as operator noise. - -## Alerts and permissions - -Use `alert` for iOS simulator permission dialogs and macOS desktop alerts instead of tapping coordinates. - -```bash -agent-device alert wait 5000 -agent-device alert accept -``` - -- `alert` is supported on iOS simulators and macOS desktop targets. -- `alert accept` and `alert dismiss` retry internally for a short window, so you usually do not need manual sleeps. -- If a permission sheet or modal is visible in `snapshot` or `screenshot` but `alert accept` says no alert was found, treat it as normal tappable UI for that run: take a scoped `snapshot -i -s ""` and `press @ref` instead of looping on `alert`. -- iOS 16+ "Allow Paste" prompts are suppressed under XCUITest. Use `xcrun simctl pbcopy booted` when you need to seed simulator clipboard content directly. 
- -## Setup problems worth recognizing early - -- iOS snapshots do not require macOS Accessibility permissions. -- iOS physical-device XCTest setup does require valid signing and provisioning. -- If physical-device runner setup fails, prefer Xcode Automatic Signing first. -- Optional overrides are: - - `AGENT_DEVICE_IOS_TEAM_ID` - - `AGENT_DEVICE_IOS_SIGNING_IDENTITY` - - `AGENT_DEVICE_IOS_PROVISIONING_PROFILE` - - `AGENT_DEVICE_IOS_BUNDLE_ID` -- If daemon startup is timing out during setup, increase `AGENT_DEVICE_DAEMON_TIMEOUT_MS`. -- If daemon startup fails with stale metadata hints, clean `~/.agent-device/daemon.json` and `~/.agent-device/daemon.lock`, then retry. -- Free Apple Developer personal-team accounts may reject generic bundle IDs. Use a unique reverse-DNS value for `AGENT_DEVICE_IOS_BUNDLE_ID` when that happens. - -## Common failure patterns - -- `snapshot` returns 0 nodes: the app may no longer be foregrounded or the UI is not stable yet. Re-open the app or retry when state settles. -- Logs are empty: confirm you opened an app session before `logs clear --restart`. -- Android logs look stale after relaunch: retry the repro window after the process rebinds. -- Android accessibility snapshots can lag behind visible screen transitions. The next snapshot retries suspicious trees for a short post-action deadline after navigation-sensitive actions, and `@ref` actions refresh while that window is active. If the tree still looks stale, use `screenshot` as visual truth, wait briefly, then re-run `snapshot -i`. For animation-heavy runs, try `settings animations off` and restore with `settings animations on`. -- React Native dev warnings or errors keep reappearing: treat them as part of the app state, not as disposable chrome. Capture one clean repro and include them in the summary. -- Permission prompts block the flow: wait for the alert and handle it explicitly. -- If snapshots keep returning 0 nodes on an iOS simulator, restart Simulator and re-open the app. 
-- If a macOS snapshot looks incomplete, compare with `snapshot --raw --platform macos` to separate collector filtering from missing AX content. - -## Crash triage fast path - -Always start from the session app log, then branch by platform. - -```bash -agent-device logs path -grep -n -E "SIGABRT|SIGSEGV|EXC_|fatal|exception|terminated|killed|jetsam|memorystatus|FATAL EXCEPTION|Abort message" -``` - -- iOS: if the log suggests `ReportCrash`, `SIGABRT`, or `EXC_*`, inspect `~/Library/Logs/DiagnosticReports`. -- Android: if the app log is not enough, use `adb logcat` for `FATAL EXCEPTION`, `Abort message`, or `signal` lines around process death. -- If no crash signature appears in app logs, stop collecting broad logs and switch to the platform-native crash source. - -## When to leave this file - -- Return to [exploration.md](exploration.md) once the app is stable again. -- Load [verification.md](verification.md) if you need evidence artifacts after reproducing the issue. diff --git a/skills/agent-device/references/exploration.md b/skills/agent-device/references/exploration.md deleted file mode 100644 index bce87335a..000000000 --- a/skills/agent-device/references/exploration.md +++ /dev/null @@ -1,362 +0,0 @@ -# Exploration - -## When to open this file - -Open this file when the app or screen is already running and you need to discover the UI, choose targets, read state, wait for conditions, or perform normal interactions. - -## Read-only first - -- If the question is what text, labels, or structure is visible on screen, start with plain `snapshot`. -- Escalate to `snapshot -i` only when you need refs such as `@e3` for interactive exploration or a requested action. -- If you intend to `press`, `fill`, or otherwise interact, start with `snapshot -i` and fall back to plain `snapshot` only if interactive refs are unavailable. -- Prefer `get`, `is`, or `find` before mutating the UI when a read-only command can answer the question. 
-- You may take the smallest reversible UI action needed to unblock inspection, such as dismissing a popup, closing an alert, or backing out of an unintended surface. -- Do not type or fill text just to make hidden information easier to access unless the user asked for that interaction. -- Do not use external sources to infer missing UI state unless the user explicitly asked. -- If the answer is not visible or exposed in the UI, report that gap instead of compensating with search, navigation, or text entry. - -## Decision shortcut - -- User asks what is visible on screen: `snapshot` -- User asks for exact text from a known target: `get text` -- User asks you to tap, type, or choose an element: `snapshot -i`, then act -- User asks for the React Native component tree, props/state/hooks, or render profiling: use `agent-device react-devtools ...` and the `skills/react-devtools` workflow -- User asks to reload a Metro-backed React Native app after JS changes: `agent-device metro reload`, then wait briefly and re-run `snapshot` or `snapshot -i` -- React Native dev or debug build shows warning/error UI: capture enough evidence to identify it, dismiss it if it is not the requested behavior, then continue the flow and report it in the summary -- The on-screen keyboard is blocking the next step: `keyboard dismiss`; on iOS do this only while an app session is active, and use `keyboard status|get` only on Android -- UI does not expose the answer: say so plainly; do not browse or force the app into a new state unless asked - -## Read-only commands - -- `snapshot` -- `get` -- `is` -- `find` -- `keyboard status|get` on Android when keyboard visibility or input type matters - -## Interaction commands - -- `snapshot -i` -- `press` -- `fill` -- `type` -- `scroll` -- `wait` -- `keyboard dismiss` when the keyboard obscures the next target - -## Common mistakes to avoid - -**Stale refs.** Do not treat `@ref` values as durable after navigation or dynamic updates. 
Re-snapshot after the UI changes, and switch to selectors when the flow must stay stable. - -**Android AX tree lag.** After submits, route changes, or composer transitions, the accessibility tree can lag behind the visible UI. If `snapshot -i` and `screenshot` disagree: - -1. Trust the screenshot as visual truth. -2. Take one fresh `snapshot -i`. Android retries suspicious trees for a short post-action deadline after navigation-sensitive actions. -3. If the tree still disagrees with the screenshot, wait briefly, then take one more fresh snapshot. Do not loop snapshots immediately. -4. For animation-heavy Android runs, use `settings animations off` as an opt-in stabilizer and restore with `settings animations on` after the run. - -**React Native dev overlays.** In dev or debug builds, warning or error overlays can block taps, change focus, or hide the real UI. Check for them near app open and after major transitions. - -- Not blocking the task: dismiss and continue. -- Blocking or recurring: switch to [debugging.md](debugging.md) and collect evidence. -- Seen at any point: mention in the final summary even if dismissed. - -**React Native Metro reload.** When a dev app is already running and connected to Metro, prefer a Metro reload over restarting the native app process: - -```bash -agent-device metro reload -agent-device wait 1000 -agent-device snapshot -i -``` - -Use `--metro-host`, `--metro-port`, or `--bundle-url` only when the active connection does not already carry the right runtime hints. Fall back to `open --relaunch` when the app is not connected to Metro, Metro reload fails, or native startup state needs a clean process. - -## Common example loops - -These are examples, not required exact sequences. Adapt them to the app, state, and task at hand. 
- -### Interactive exploration loop - -```bash -agent-device open Settings --platform ios -agent-device snapshot -i -agent-device press @e3 -agent-device wait visible 'label="Privacy & Security"' 3000 -agent-device get text 'label="Privacy & Security"' -agent-device close -``` - -### Screen verification loop - -```bash -agent-device open MyApp --platform ios -# perform the necessary actions to reach the state you need to verify -agent-device snapshot -# verify whether the expected element or text is present -agent-device close -``` - -## Snapshot choices - -- Use plain `snapshot` when you only need to verify whether visible text or structure is on screen. -- Use `snapshot -i` when you need refs such as `@e3` for interactive exploration or for an intended interaction. -- On iOS and Android, default snapshot output is visible-first. Off-screen interactive content is surfaced as discovery hints (including inline scroll/list hidden-content hints when known), not shown as directly tappable refs. -- Treat large text-surface lines in `snapshot -i` as discovery output. If a node shows preview or truncation metadata, use `get text @ref` only after you have already decided that `snapshot -i` is needed for that surface. -- Use `snapshot -i -s "Camera"` or `snapshot -i -s @e3` when you want a smaller, scoped result. -- If `snapshot -i -s "<scope>"` returns 0 nodes, the scope did not match the current screen. Widen the query or re-check the screen state instead of assuming the command silently fell back to the full tree. -- If `snapshot -i` returns 0 nodes but the screen is visibly populated, treat `screenshot` as visual truth, wait briefly, then re-run `snapshot -i` once before escalating. -- If `snapshot -i -d <depth>` says the interactive output is empty at that depth, retry without `-d` instead of taking more shallow snapshots.
- -Example: - -```bash -agent-device snapshot -i -``` - -Sample output: - -```text -Page: com.apple.Preferences -App: com.apple.Preferences - -@e1 [ioscontentgroup] - @e2 [button] "Camera" - @e3 [button] "Privacy & Security" -[off-screen below] 2 interactive items: "Location Services", "Battery" -``` - -## Refs vs selectors - -- Use refs for discovery, debugging, and short local loops. -- When a target appears only in a visible-first off-screen summary, such as `[off-screen below] ... "Battery"`, use `scroll down` and then `snapshot -i`. For `[off-screen above]`, use `scroll up` and then `snapshot -i`. -- For more than two repeated scroll checks, create a short shell loop instead of issuing each command by hand. Stop when the label appears or the snapshot stops changing. -- Visible-first off-screen summaries are intentionally compact. If you need the full off-screen tree instead of a short summary, retry with `snapshot --raw`. -- Cap long searches in the loop when the list may be unbounded or the target may not exist. -- Use selectors for deterministic scripts, assertions, and replay-friendly actions. -- Prefer selector or `@ref` targeting over raw coordinates. -- For tap interactions, `press` is canonical and `click` is an equivalent alias. - -Examples: - -```bash -agent-device press @e2 -agent-device fill @e5 "test" -agent-device press 'id="camera_row" || label="Camera" role=button' -agent-device is visible 'id="camera_settings_anchor"' -``` - -Example loop: - -```bash -previous='' -for _ in 1 2 3 4 5 6; do - current="$(agent-device snapshot -i)" - printf '%s\n' "$current" - printf '%s\n' "$current" | grep -q 'Battery' && break - [ "$current" = "$previous" ] && break - previous="$current" - agent-device scroll down 0.5 >/dev/null -done -``` - -## Interaction fallbacks - -When `press @ref` fails: - -1. If the error says the ref is off-screen, use the off-screen summary direction to run `scroll <direction>`, then take a fresh `snapshot -i`. -2.
Re-snapshot if the UI may have changed. -3. Retry `press @ref` or a selector-based `press`. -4. If `screenshot --overlay-refs --json` returned a reliable `overlayRefs[].center`, use `agent-device press <x> <y>`. -5. Use an external vision-based tap tool only after semantic and coordinate targeting fail. - -- Prefer `@ref` over coordinates. -- Do not guess coordinates from the image when structured `center` is available. -- `agent-device` does not provide a built-in vision-tap flag. - -## Text entry rules - -- Use `fill` to replace text in an editable field. -- Use `type` to append text to the current insertion point. -- Use `fill @ref "text"` when you need to target a field directly by ref. -- Use `press @ref`, then `type "text"` when the field is already focused and you need append semantics. -- Do not write `type @ref "text"`; `type` only accepts text and will not target that ref for you. -- If the keyboard blocks the next control after text entry, prefer `keyboard dismiss` instead of backing out of the screen. -- On iOS, `keyboard dismiss` depends on the active app session to keep the target app foregrounded, so do not rely on selector-only dismiss calls after closing or without `open`. -- Do not use `fill` or `type` just to make the app reveal information that is not currently visible unless the user asked for that interaction. - -## React Native dev or debug overlays - -Use this loop for React Native dev clients, Metro-backed builds, and local debug sessions where warnings or errors may appear as tooltips, banners, toasts, or modal overlays. - -1. After `open`, inspect the visible UI for warning or error surfaces before relying on the next tap. -2. If a warning or error is visible, capture enough evidence to identify it: - - preferred: `screenshot` - - optional: `logs mark "warning visible"` or `logs mark "error visible"` if you are already in a debug window -3.
If the overlay is not the thing the user asked you to investigate, dismiss or close it with the smallest reversible action. -4. Re-check the intended screen before continuing the task. -5. Report any visible warnings or errors in the final summary, even if the flow succeeded after dismissal. - -Use this rule of thumb: - -- Warning overlay that does not block the task: dismiss and keep going. -- Error overlay that does not block the task: dismiss, keep going, and report it. -- Error overlay that blocks the task or keeps returning: stop treating it as noise and switch to [debugging.md](debugging.md). - -## Query and sync rules - -- Use `get` to read text, attrs, or state from a known target. -- Use `is` for assertions. -- Use `wait` when the UI needs time to settle after a mutation. -- Use `find "<query>" click --json` when you need search-driven targeting plus matched-target metadata. -- Use `find "<query>" click --first` or `--last` when ambiguous matches are expected and you want the first or last occurrence without falling back to raw coordinates. -- If you are forced onto raw coordinates, open [coordinate-system.md](coordinate-system.md) first. - -Example: - -```bash -agent-device find "Increment" click --json -``` - -Returned metadata comes from the matched snapshot node and can be used for observability or replay maintenance. - -## QA from acceptance criteria - -Use this loop when the task starts from acceptance criteria and you need to turn them into concrete checks. - -Preferred mapping: - -- visibility claim for what is on-screen now: `is visible` or plain `snapshot` -- presence claim regardless of viewport visibility: `is exists` -- exact text, label, or value claim: `get text` -- post-action state change: act, then `wait`, then `is` or `get` -- nearby structural UI change: `diff snapshot` -- proof artifact for the final result: `screenshot` or `record` - -Notes: - -- `wait text` is useful for synchronizing on text presence, but it is not the same as `is visible`.
-- After a nearby navigation or submit on Android, prefer `screenshot`, then one fresh `snapshot -i`; `@ref` interactions refresh while the Android freshness window is active. - -Anti-hallucination rules: - -- Do not invent app names, device ids, session names, refs, selectors, or package names. -- Discover them first with `devices`, `open`, `snapshot -i`, `find`, or `session list`. -- If refs drift after navigation, re-snapshot or switch to selectors instead of guessing. - -Avoid this escalation path for visible-text questions: - -- Do not jump from `snapshot -i` to `get text @ref`, then to web search, then to typing into a search box just to force the app to reveal the answer. -- Start with `snapshot`. If the text is not visible or exposed, report that directly. -- After Android submit or navigation-heavy actions when the UI looks wrong: `screenshot` first, then `snapshot -i`. - -Canonical QA loop: - -```bash -agent-device open MyApp --platform ios -agent-device snapshot -i -agent-device press @e3 -agent-device wait visible 'label="Success"' 3000 -agent-device is visible 'label="Success"' -agent-device screenshot /tmp/qa-proof.png -agent-device close -``` - -## Accessibility audit - -Use this pattern when you need to find UI that is visible to a user but missing from the accessibility tree. - -Audit loop: - -1. Capture a `screenshot` to see what is visually rendered. -2. Capture a `snapshot` or `snapshot -i` to see what the accessibility tree exposes. -3. Compare the two: - - visible in screenshot and present in snapshot: exposed to accessibility - - visible in screenshot and missing from snapshot: likely accessibility gap -4. If you suspect the node exists in AX but is filtered from interactive output, retry with `snapshot --raw`. - -Example: - -```bash -agent-device screenshot /tmp/accessibility-screen.png -agent-device snapshot -i -``` - -Use `screenshot` as the visual source of truth and `snapshot` as the accessibility source of truth for this audit. 
- -## Batch only when the sequence is already known - -Use `batch` when a short command sequence is already planned and belongs to one logical screen flow. - -```bash -agent-device batch --session sim --platform ios --steps-file /tmp/batch-steps.json --json -``` - -- Keep batch size moderate, roughly 5 to 20 steps. -- Add `wait` or `is exists` guards after mutating steps. -- Do not use `batch` for highly dynamic flows that need replanning after each step. - -Example: known chat-send flow - -```json -[ - { "command": "open", "positionals": ["ChatApp"], "flags": { "platform": "android" } }, - { "command": "click", "positionals": ["label=\"Travel chat\""], "flags": {} }, - { "command": "wait", "positionals": ["label=\"Message\"", "3000"], "flags": {} }, - { "command": "fill", "positionals": ["label=\"Message\"", "Filed the expense"], "flags": {} }, - { "command": "press", "positionals": ["label=\"Send\""], "flags": {} } -] -``` - -Step payload contract: - -```json -[ - { "command": "open", "positionals": ["Settings"], "flags": { "platform": "ios" } }, - { "command": "wait", "positionals": ["label=\"Privacy & Security\"", "3000"], "flags": {} }, - { "command": "click", "positionals": ["label=\"Privacy & Security\""], "flags": {} }, - { "command": "get", "positionals": ["text", "label=\"Tracking\""], "flags": {} } -] -``` - -- `positionals` is optional and defaults to `[]`. -- `flags` is optional and defaults to `{}`. -- Only `command`, `positionals`, `flags`, and `runtime` are accepted as top-level step keys. -- Nested `batch` and `replay` are rejected. -- Supported error mode is stop-on-first-error. - -Response handling: - -- Success returns fields such as `total`, `executed`, `totalDurationMs`, and `results[]`. -- Human-mode `batch` runs also print a short per-step success summary. -- Failed runs include `details.step`, `details.command`, `details.executed`, and `details.partialResults`. 
-- Replan from the first failing step instead of rerunning the whole flow blindly. - -Canonical batch recipe: open app -> open action menu -> choose option -> verify - -```json -[ - { "command": "open", "positionals": ["com.example.app"], "flags": { "platform": "android" } }, - { "command": "wait", "positionals": ["text", "Home", "3000"], "flags": {} }, - { "command": "press", "positionals": ["label=\"More actions\" role=button"], "flags": {} }, - { "command": "wait", "positionals": ["text", "Camera scan", "2000"], "flags": {} }, - { "command": "press", "positionals": ["label=\"Camera scan\""], "flags": {} }, - { "command": "wait", "positionals": ["text", "Expense created", "15000"], "flags": {} }, - { "command": "is", "positionals": ["visible", "label=\"Expense created\""], "flags": {} } -] -``` - -Common batch error categories: - -- `INVALID_ARGS`: fix the payload shape and retry. -- `SESSION_NOT_FOUND`: open or select the correct session, then retry. -- `UNSUPPORTED_OPERATION`: switch to a supported command or surface. -- `AMBIGUOUS_MATCH`: refine the selector or locator, then retry the failed step. -- `DEVICE_IN_USE`: the device is held by another session — close or reuse the existing session before retrying. -- `COMMAND_FAILED`: add sync guards and retry from the failing step. - -## Stop conditions - -- If refs drift after transitions, switch to selectors. -- If a desktop surface or context menu is involved on macOS, load [macos-desktop.md](macos-desktop.md). -- If logs, network, alerts, or setup failures become the blocker, switch to [debugging.md](debugging.md). -- If the flow is stable and you need proof or replay maintenance, switch to [verification.md](verification.md). 
diff --git a/skills/agent-device/references/macos-desktop.md b/skills/agent-device/references/macos-desktop.md deleted file mode 100644 index a411d370b..000000000 --- a/skills/agent-device/references/macos-desktop.md +++ /dev/null @@ -1,88 +0,0 @@ -# macOS Desktop - -## When to open this file - -Open this file only when `--platform macos` is involved or the task needs `frontmost-app`, `desktop`, or `menubar` surfaces. - -## Main commands to reach for first - -- `open --platform macos` -- `open --platform macos --surface frontmost-app|desktop|menubar` -- `snapshot -i` -- `get` -- `is` -- `click --button secondary` - -## Most common mistake to avoid - -Do not treat every macOS surface the same. Use the normal `app` surface when you want to act inside one app. Use `frontmost-app`, `desktop`, or `menubar` mainly to inspect what is visible before switching back to `app` for most interactions. - -## Canonical loop - -```bash -agent-device open TextEdit --platform macos -agent-device snapshot -agent-device close -``` - -## Surface rules - -- `app`: default surface and the normal choice for `click`, `fill`, `press`, `scroll`, `screenshot`, and `record`. -- `frontmost-app`: inspect the currently focused app without naming it first. -- `desktop`: inspect visible desktop windows across apps. -- `menubar`: inspect the active app menu bar and system menu extras. Use `open --platform macos --surface menubar` when you need one menu bar app's extras, such as a status-item app. -- Menu bar apps can expose a sparse or empty default `app` tree. Prefer the `menubar` surface first when the app lives entirely in the top bar. - -Use inspect-first surfaces to understand desktop-global UI, then switch back to `app` when you need to act in one app. - -## Snapshot expectations - -- `snapshot -i` should describe UI visible to a human. -- `desktop` snapshots can include multiple windows from multiple apps. -- `menubar` snapshots can include both app-menu items and system menu extras. 
-- Finder-style rows, sidebar items, toolbar controls, search fields, and opened context menus should appear when visible. -- Finder and other native apps may expose duplicate-looking row, cell, and child text nodes. Treat them as distinct AX nodes unless you have a stronger selector anchor. - -## Context menus - -Context menus are not ambient UI. Open them explicitly, then re-snapshot. - -```bash -agent-device click @e66 --button secondary --platform macos -agent-device snapshot -i -``` - -Expected loop: - -1. Snapshot visible content. -2. Secondary-click the target item. -3. Snapshot again. -4. Interact with the new `menu-item` nodes. - -## Targeting rules - -- Prefer selectors or `@ref` values over raw coordinates. -- On macOS, window position can vary across runs, so coordinate-only flows are fragile. -- If the task only needs shared exploration rules, return to [exploration.md](exploration.md). - -Selector guidance: - -- Good selectors usually anchor on stable labels or app-owned identifiers such as `label="Downloads"` or `role=menu-item label="Rename"`. -- Avoid relying on framework-generated `_NS:*` identifiers as stable selectors. - -Use `snapshot --raw --platform macos` only when debugging AX structure or collector filtering. Do not make raw snapshots the default agent loop. - -Things not to rely on: - -- Mobile-only helpers such as `install`, `reinstall`, or `push`. -- Desktop-global click, fill, or gesture parity from `desktop` or `menubar` sessions. -- Raw coordinate assumptions across runs. - -Troubleshooting: - -- If visible content is missing from `snapshot -i`, re-snapshot after the UI settles. -- If `desktop` is too broad, retry with `frontmost-app`. -- If `menubar` is missing the expected menu, retry with `open --platform macos --surface menubar` for menu bar apps, or make the app frontmost first and retry the generic menubar surface. -- If the wrong menu opened, retry secondary-clicking the row or cell wrapper rather than the nested text node. 
-- If the app has multiple windows, make the correct window frontmost before relying on refs. -- If overriding the local helper, set `AGENT_DEVICE_MACOS_HELPER_BIN` to an absolute executable path; relative helper paths are rejected. diff --git a/skills/agent-device/references/remote-tenancy.md b/skills/agent-device/references/remote-tenancy.md deleted file mode 100644 index 0fecd8195..000000000 --- a/skills/agent-device/references/remote-tenancy.md +++ /dev/null @@ -1,189 +0,0 @@ -# Remote Tenancy - -## When to open this file - -Open this file for remote daemon HTTP flows that let an agent running in a Linux sandbox talk to another `agent-device` instance on a remote macOS host in order to control devices that are not available locally. This file covers daemon URL setup, authentication, `connect`, tenant lease scope, and remote Metro companion lifecycle. - -## Main commands to reach for first - -- `agent-device connect --remote-config ` -- `agent-device install-from-source --remote-config --platform android` -- `agent-device install-from-source --github-actions-artifact /: --remote-config --platform android` -- `agent-device open --remote-config --relaunch` -- `agent-device metro reload --remote-config ` -- `agent-device snapshot --remote-config -i` -- `agent-device disconnect --remote-config ` -- `agent-device connection status` -- `agent-device auth status` -- `AGENT_DEVICE_DAEMON_AUTH_TOKEN=adc_live_...` for CI/service-token automation - -## Most common mistake to avoid - -Do not mix an arbitrary `--session` plus ad-hoc daemon, tenant, run, or lease flags. That can bypass saved Metro runtime hints. Use one of these patterns instead: - -- Interactive flow: run `connect --remote-config ` once, then normal commands, then `disconnect`. -- Script flow: pass the same `--remote-config ` to every command, including `disconnect`. - -## Choose one flow - -### Interactive flow - -Use this when the agent will run several commands in one session. 
- -```bash -agent-device connect --remote-config ./remote-config.json - -ARTIFACT_URL="" -agent-device install-from-source "$ARTIFACT_URL" --platform android -agent-device open com.example.app --relaunch -agent-device metro reload -agent-device snapshot -i -agent-device fill @e3 "test@example.com" -agent-device disconnect -``` - -After `connect`, normal commands use the active remote connection. If cloud credentials are missing, `connect` starts login automatically in an interactive local shell and stores a revocable CLI session that silently mints short-lived `adc_agent_...` command tokens. Linux sandboxes, CI, and other non-interactive shells should set `AGENT_DEVICE_DAEMON_AUTH_TOKEN=adc_live_...` instead. The cloud side remains responsible for token expiry, tenant/run claim checks, revocation, one-time device approval, and polling rate limits. End with `disconnect` to release the lease and stop the owned Metro companion. - -### Self-contained script flow - -Use this when each command must be explicit and repeatable. Pass the same `--remote-config` to each step. - -```bash -ARTIFACT_URL="" - -agent-device install-from-source "$ARTIFACT_URL" \ - --remote-config ./remote-config.json \ - --platform android - -agent-device open com.example.app \ - --remote-config ./remote-config.json \ - --relaunch - -agent-device snapshot \ - --remote-config ./remote-config.json \ - -i - -agent-device disconnect \ - --remote-config ./remote-config.json -``` - -The first command that needs a lease or Metro runtime prepares and persists it. Later commands with the same `--remote-config` reuse that state. End with `disconnect --remote-config ` to release the lease and stop the owned Metro companion. - -## Behavior summary - -- `connect` stores local non-secret connection state and defers tenant lease allocation plus Metro preparation until a later command needs them. 
-- Commands such as `install-from-source`, `open`, `snapshot`, `devices`, and `apps` allocate or refresh the lease when needed. -- `open` prepares Metro runtime hints when the remote profile has Metro fields and no compatible runtime is already saved. -- `metro reload` reuses saved Metro runtime hints and asks Metro to reload connected React Native apps without restarting the native process. -- `batch` also prepares Metro when any step opens an app and that step does not provide its own runtime. -- `disconnect` closes the session when possible, stops the Metro companion owned by the connection, releases the lease when one was allocated, and removes local connection state. - -Remote install examples: - -```bash -agent-device install com.example.app ./app.apk -ARTIFACT_URL="" -agent-device install-from-source "$ARTIFACT_URL" --platform android -``` - -- Use `install` or `reinstall` for local paths; remote daemons upload local artifacts automatically. -- Use `install-from-source` only for trusted, operator-approved artifact URLs the remote daemon can reach. Do not fetch arbitrary user-supplied URLs. -- Use `install-from-source --github-actions-artifact /:` when the remote daemon has repository credentials and supports daemon-resolved GitHub Actions artifacts. -- For local-path versus URL artifact rules, follow [bootstrap-install.md](bootstrap-install.md). - -Use `agent-device connection status --session adc-android` to inspect the active connection without reading JSON state manually. Status output must not include auth tokens. 
- -## Remote config shape - -Example `remote-config.json` shape: - -```json -{ - "daemonBaseUrl": "https://bridge.example.com/agent-device", - "daemonTransport": "http", - "tenant": "acme", - "runId": "run-123", - "sessionIsolation": "tenant", - "platform": "ios", - "metroProxyBaseUrl": "https://bridge.example.com" -} -``` - -Optional overrides stay available for advanced cases: - -```json -{ - "session": "adc-ios", - "leaseBackend": "ios-instance", - "metroProjectRoot": ".", - "metroKind": "expo", - "metroPublicBaseUrl": "http://127.0.0.1:8081" -} -``` - -- Keep service tokens in env/config managed by the operator boundary. Do not persist auth tokens in connection state. Human login uses `agent-device auth login` or implicit `connect` login and stores only the CLI session credential. -- Omit Metro fields for non-React Native flows. -- Put `tenant`, `runId`, and `sessionIsolation` in the remote profile so agents can run `agent-device connect --remote-config ./remote-config.json` without extra scope flags. Add `platform`, `leaseBackend`, `session`, or Metro overrides only when the default inference is not enough for that flow. -- Explicit command-line flags override connected defaults. Use them intentionally when switching session, platform, target, tenant, run, or lease scope. -- For React Native Metro runs with `metroProxyBaseUrl`, `agent-device >= 0.11.12` can manage the local companion tunnel, but Metro itself still needs to be running locally. `metroProxyBaseUrl` is the bridge origin, not a prebuilt `/api/metro/...` route. -- Set `AGENT_DEVICE_CLOUD_BASE_URL` to the bridge/control-plane API origin. It does not need to be the dashboard origin; `/api-keys` on the bridge can redirect to the dashboard for service-token setup. -- For cloud stock React Native iOS, use the bridge descriptor's wildcard HTTPS Metro hints directly; do not install or launch the XCTest runner just to make Metro reachable. 
-- Android keeps using bridge-provided `/api/metro/runtimes//...` Metro routes. -- `metroPublicBaseUrl` is only needed for direct/non-bridge bundle hints. Bridged profiles can omit it. -- Use a lease backend that matches the bridge target platform, for example `android-instance`, `ios-instance`, or an explicit `--lease-backend` override. - -## Transport prerequisites - -- Start the daemon in HTTP mode with `AGENT_DEVICE_DAEMON_SERVER_MODE=http|dual` on the host. -- Point the profile or env at the remote host with `daemonBaseUrl` or `AGENT_DEVICE_DAEMON_BASE_URL=http(s)://host:port[/base-path]`. -- For humans, run `connect --remote-config ` and let it refresh or create the CLI session. Use `agent-device auth status` to inspect it and `agent-device auth logout` to remove it. -- For CI/non-interactive shells, set `AGENT_DEVICE_DAEMON_AUTH_TOKEN=adc_live_...` or pass `--daemon-auth-token`. The client does not start device-code polling in CI by default. -- Prefer an auth hook such as `AGENT_DEVICE_HTTP_AUTH_HOOK` when the host needs caller validation or tenant injection. - -## Lease debug fallback - -The main agent flow should use `connect` and `connection status`. For daemon-side auth, scope, or lease debugging, inspect host-side daemon logs and operator tooling instead of issuing raw daemon RPC from the agent shell. - -## GitHub Actions artifact install - -Use this when a compatible remote daemon resolves GitHub Actions artifacts server-side. Do not download CI artifacts locally or add a local `GITHUB_TOKEN` just to install CI output. 
- -Artifact ID shape: - -```bash -agent-device install-from-source \ - --github-actions-artifact OWNER/REPO:1234567890 \ - --remote-config ./remote-config.json \ - --platform android -``` - -Artifact-name shape: - -```bash -agent-device install-from-source \ - --github-actions-artifact OWNER/REPO:app-debug \ - --remote-config ./remote-config.json \ - --platform ios -``` - -Config shape: - -```json -{ - "installSource": { - "type": "github-actions-artifact", - "repo": "OWNER/REPO", - "artifact": "app-debug" - } -} -``` - -Numeric artifacts are passed as artifact IDs. Non-numeric artifacts are passed as artifact names. - -## Failure semantics and trust notes - -- Missing tenant, run, or lease fields in tenant-isolation mode should fail as `INVALID_ARGS`. -- Inactive or scope-mismatched leases should fail as `UNAUTHORIZED`. -- Inspect logs on the remote host during remote debugging. Client-side `--debug` does not tail a local daemon log once `AGENT_DEVICE_DAEMON_BASE_URL` is set. -- Do not point `AGENT_DEVICE_DAEMON_BASE_URL` at untrusted hosts. Remote daemon requests can launch apps and execute interaction commands. -- Treat daemon auth tokens and lease identifiers as sensitive operational data. diff --git a/skills/agent-device/references/verification.md b/skills/agent-device/references/verification.md deleted file mode 100644 index b74da377d..000000000 --- a/skills/agent-device/references/verification.md +++ /dev/null @@ -1,134 +0,0 @@ -# Verification - -## When to open this file - -Open this file when the task needs evidence, regression checks, replay maintenance, or session performance measurements after the main interaction flow is already working. - -## Main commands to reach for first - -- `screenshot` -- `diff snapshot` -- `diff screenshot` -- `record` -- `replay -u` -- `perf` - -## Most common mistake to avoid - -Do not use verification tools as the first exploration step. 
First get the app into the correct state with the normal interaction flow, then capture proof or maintain replay assets. - -## Canonical loop - -```bash -agent-device open Settings --platform ios -# after using exploration to reach the state you want to verify -agent-device snapshot -agent-device screenshot /tmp/settings-proof.png --overlay-refs -agent-device close -``` - -## Structural verification with diff snapshot - -Use `diff snapshot` when you need a compact view of how the UI changed between nearby states. - -```bash -agent-device snapshot -i -agent-device press @e5 -agent-device diff snapshot -i -``` - -- Initialize the baseline at a stable point. -- Perform the mutation. -- Run `diff snapshot` to confirm the expected structural change. -- Re-run full `snapshot` only when you need fresh refs. - -## Screenshot artifacts - -Use `screenshot` when the proof needs a rendered image instead of a structural tree. - -- Add `--max-size 1024` when a full-resolution screenshot is too large for an agent, model, or chat attachment. -- Add `--overlay-refs` when you want the saved PNG to show fresh `@eN` refs burned into the screenshot. -- Combine them as `screenshot /tmp/proof.png --max-size 1024 --overlay-refs` when you need a smaller visual proof that still includes tappable refs. -- Avoid very small `--max-size` values when text, icons, or labels need to remain readable. - -## Visual regression with diff screenshot - -Use `diff screenshot` when comparing the current rendered screen against a saved visual baseline. - -```bash -agent-device diff screenshot --baseline ./baseline.png --out /tmp/diff.png -agent-device diff screenshot --baseline ./baseline.png ./current.png --out /tmp/diff.png -agent-device diff screenshot --baseline ./baseline.png --out /tmp/diff.png --overlay-refs -``` - -- Text output includes ranked changed regions with screen-space rectangles, shape, size, density, average color, and luminance. JSON also includes normalized bounds. 
-- The diff PNG uses a light grayscale current-screen context with changed pixels tinted red and changed regions outlined. -- When a current image path is provided, `diff screenshot` compares the two saved files instead of capturing from the live device or requiring an active session. -- Install `tesseract` when you want `diff screenshot` to add best-effort OCR text deltas, movement clusters, and bbox size-change hints. OCR improves the text/JSON descriptions only; it does not change the pixel comparison or the diff PNG. -- When OCR is available, `diff screenshot` also reports best-effort non-text visual deltas by masking OCR text boxes out of the pixel diff and clustering the remaining residuals. Treat these as hints for icons, controls, and separators, not semantic icon recognition. -- Add `--overlay-refs` to `diff screenshot` when you also want a separate current-screen overlay guide for a live capture. The raw screenshot is still used for pixel comparison; the overlay guide is only context for non-text controls, icons, and tappable regions. When overlay refs intersect changed regions, the output lists the best current-screen ref matches under the affected region. Saved-image comparisons do not have live accessibility refs, so omit `--overlay-refs` when passing a current image path. - -## Session recording - -Use `record` for debugging, documentation, or shareable verification artifacts. - -```bash -agent-device record start ./recordings/ios.mov -agent-device open App -agent-device snapshot -i -agent-device press @e3 -agent-device close -agent-device record stop -``` - -- `record` supports iOS simulators, iOS devices, and Android. -- On iOS, recording is a wrapper around `simctl` for simulators and the corresponding device capture path for physical devices. -- On Android, recording is a wrapper around `adb`. -- Recording writes a video artifact and a gesture-telemetry sidecar JSON. 
-- Use `record start --quality 5` when a smaller video is easier to inspect or share. The scale is 5-10, where 10 is native resolution; omit it to preserve native/current resolution. -- On macOS hosts, touch overlay burn-in is available for supported recordings. -- On non-macOS hosts, recording still succeeds but the video stays raw and `record stop` can return an `overlayWarning`. -- If the agent already knows the interaction sequence and wants a more lifelike, uninterrupted recording, drive the flow with `batch` while recording instead of replanning between each step. - -Example: - -```bash -agent-device record start ./recordings/smoke.mov -agent-device batch --session sim --platform ios --steps-file /tmp/smoke-steps.json --json -agent-device record stop -``` - -- Use this only after exploration has stabilized the flow. -- Keep the batch short and add `wait` or `is exists` guards after mutating steps so the recorded flow still tracks realistic UI timing. - -## Replay maintenance - -Use replay updates when selectors drift but the recorded scenario is still correct. - -```bash -agent-device replay -u ./session.ad -agent-device test ./smoke --platform android -``` - -- Prefer selector-based actions in recorded `.ad` replays. -- Use `test` when you already have multiple `.ad` flows and need a quick regression pass after updating or recording them. -- Keep the skill-level rule simple: use `replay -u` to maintain one script, use `test` to verify a folder or matcher of scripts. -- Treat `test` as a human and CI-facing suite runner that an agent can invoke for verification, not as the main source of product documentation. -- Failed runs keep suite artifacts under `.agent-device/test-artifacts` by default, which is usually enough for debugging without extra agent-side processing. -- Use update mode for maintenance, not as a substitute for fixing a broken interaction strategy. 
- -## Performance checks - -Use `perf --json` or `metrics --json` when you need session performance data for the active session. - -```bash -agent-device open Settings --platform ios -agent-device perf --json -``` - -- `startup` is command round-trip timing around `open`. -- It is not true first-frame or first-interactive telemetry. -- Android app sessions also expose `memory` (`dumpsys meminfo`) and `cpu` (`dumpsys cpuinfo`) snapshots when the session has an app package context. -- Apple app sessions on macOS, iOS simulators, and physical iOS devices also expose `memory` and `cpu` process snapshots when the session has an app bundle ID. -- On physical iOS devices, sampling uses a short `xcrun xctrace` Activity Monitor capture, so keep the device unlocked, connected, and the app active in the foreground while sampling. -- `fps` is still unavailable in this release. diff --git a/skills/dogfood/SKILL.md b/skills/dogfood/SKILL.md index 03fef0cf0..af549c5ae 100644 --- a/skills/dogfood/SKILL.md +++ b/skills/dogfood/SKILL.md @@ -1,184 +1,25 @@ --- name: dogfood -description: 'Systematically explore and test a mobile app on iOS/Android with agent-device to find bugs, UX issues, and other problems. Use when asked to "dogfood", "QA", "exploratory test", "find issues", "bug hunt", or "test this app" on mobile. Produces a structured report with reproducible evidence: screenshots, optional repro videos, and detailed steps for every issue.' +description: Systematically explore and test a mobile app on iOS/Android with agent-device to find bugs, UX issues, and other problems. Use when asked to dogfood, QA, exploratory test, find issues, bug hunt, or test this app on mobile. allowed-tools: Bash(agent-device:*), Bash(npx agent-device:*) --- -# Dogfood (agent-device) +# Dogfood -Systematically explore a mobile app, find issues, and produce a report with full reproduction evidence for every finding. - -## Setup - -Only the **Target app** is required. 
Everything else has sensible defaults. - -| Parameter | Default | Example override | -| -------------------- | ----------------------------------------------------------- | -------------------------------------------- | -| **Target app** | _(required)_ | `Settings`, `com.example.app`, deep link URL | -| **Platform** | Infer from user context; otherwise ask (`ios` or `android`) | `--platform ios` | -| **Session name** | Slugified app/platform (for example `settings-ios`) | `--session my-session` | -| **Output directory** | `./dogfood-output/` | `Output directory: /tmp/mobile-qa` | -| **Scope** | Full app | `Focus on onboarding and profile` | -| **Authentication** | None | `Sign in to user@example.com` | - -If the user gives enough context to start, begin immediately with defaults. Ask follow-up only when a required detail is missing (for example platform or credentials). - -Prefer direct `agent-device` binary when available. - -## Workflow - -``` -1. Initialize Set up session, output dirs, report file -2. Launch/Auth Open app and sign in if needed -3. Orient Capture initial snapshot and map navigation -4. Explore Systematically test flows and states -5. Document Record reproducible evidence per issue -6. Wrap up Reconcile summary, close session -``` - -### 1. Initialize - -```bash -mkdir -p {OUTPUT_DIR}/screenshots {OUTPUT_DIR}/videos -cp {SKILL_DIR}/templates/dogfood-report-template.md {OUTPUT_DIR}/report.md -``` - -### 2. Launch/Auth - -Start a named session and launch target app: - -```bash -agent-device --session {SESSION} open {TARGET_APP} --platform {PLATFORM} -agent-device --session {SESSION} snapshot -i -``` - -If login is required: +Router for exploratory QA. 
Private setup before using this skill: ```bash -agent-device --session {SESSION} snapshot -i -agent-device --session {SESSION} fill @e1 "{EMAIL}" -agent-device --session {SESSION} fill @e2 "{PASSWORD}" -agent-device --session {SESSION} press @e3 -agent-device --session {SESSION} wait 1000 -agent-device --session {SESSION} snapshot -i +agent-device --version ``` -For OTP/email codes: ask the user, wait for input, then continue. - -### 3. Orient +Require `agent-device >= 0.13.4`; older CLIs lack these help topics. If older, run `npm install -g agent-device@latest`, recheck, then continue. If you cannot upgrade, stop and tell the user. Do not include version/upgrade commands in final plans. -Capture initial evidence and navigation anchors: +Read current CLI guidance: ```bash -agent-device --session {SESSION} screenshot {OUTPUT_DIR}/screenshots/initial.png -agent-device --session {SESSION} snapshot -i +agent-device help dogfood ``` -Map top-level navigation, tabs, and key workflows before deep testing. - -### 4. Explore - -Read [references/issue-taxonomy.md](references/issue-taxonomy.md) for severity/category calibration. - -Strategy: - -- Move through each major app area (tabs, drawers, settings pages). -- Test core journeys end-to-end (create, edit, delete, submit, recover). -- Validate edge states (empty/error/loading/offline/permissions denied). -- Use `diff snapshot -i` after UI transitions to avoid stale refs. -- Periodically capture `logs path` and inspect the app log when behavior looks suspicious. - -Useful commands per screen: - -```bash -agent-device --session {SESSION} snapshot -i -agent-device --session {SESSION} screenshot {OUTPUT_DIR}/screenshots/{screen-name}.png -agent-device --session {SESSION} appstate -agent-device --session {SESSION} logs path -``` - -### 5. Document Issues (Repro-First) - -Explore and document in one pass. When you find an issue, stop and fully capture evidence before continuing. 
- -#### Interactive/behavioral issues - -Use video + step screenshots: - -1. Start recording: - -```bash -agent-device --session {SESSION} record start {OUTPUT_DIR}/videos/issue-{NNN}-repro.mp4 -``` - -2. Reproduce with visible pacing. Capture each step: - -```bash -agent-device --session {SESSION} screenshot {OUTPUT_DIR}/screenshots/issue-{NNN}-step-1.png -sleep 1 -# perform action -sleep 1 -agent-device --session {SESSION} screenshot {OUTPUT_DIR}/screenshots/issue-{NNN}-step-2.png -``` - -3. Capture final broken state: - -```bash -sleep 2 -agent-device --session {SESSION} screenshot {OUTPUT_DIR}/screenshots/issue-{NNN}-result.png -``` - -4. Stop recording: - -```bash -agent-device --session {SESSION} record stop -``` - -5. Append issue immediately to report with numbered steps and screenshot references. - -#### Static/on-load issues - -Single screenshot is sufficient; no video required: - -```bash -agent-device --session {SESSION} screenshot {OUTPUT_DIR}/screenshots/issue-{NNN}.png -``` - -Set **Repro Video** to `N/A` in the report. - -### 6. Wrap Up - -Target 5-10 well-evidenced issues, then finish: - -1. Reconcile summary severity counts in `report.md`. -2. Close session: - -```bash -agent-device --session {SESSION} close -``` - -3. Report total issues, severity breakdown, and highest-risk findings. - -## Guidance - -- Repro quality matters more than issue count. -- Use refs (`@eN`) for fast exploration, selectors for deterministic replay assertions when needed. -- Re-snapshot after any mutation (navigation, modal, list update, form submit). -- Use `fill` for clear-then-type semantics; use `type` for incremental typing behavior checks. -- Keep logs optional and targeted: enable/read app logs only when useful for diagnosis. -- If the issue appears rooted in React Native internals rather than device/app runtime behavior, use `agent-device react-devtools ...` and the `skills/react-devtools` workflow for component-tree or render-profiling inspection. 
-- Never read source code of the app under test; findings must come from observed runtime behavior. -- Write each issue immediately to avoid losing evidence. -- Never delete screenshots/videos/report artifacts during a session. - -## References - -| Reference | When to Read | -| ------------------------------------------------------------ | ----------------------------------------------- | -| [references/issue-taxonomy.md](references/issue-taxonomy.md) | Start of session; severity/categories/checklist | - -## Templates +Loop: open named session -> snapshot -i + screenshot -> explore flows -> capture evidence per issue -> close. -| Template | Purpose | -| ---------------------------------------------------------------------------- | --------------------------------------------- | -| [templates/dogfood-report-template.md](templates/dogfood-report-template.md) | Copy into output directory as the report file | +Target app is required; infer platform or ask. Default output is `./dogfood-output/`. Findings must come from runtime behavior, not source reads. Re-snapshot after mutations. Use logs, network, trace, perf, overlay screenshots, or react-devtools only when they add evidence. diff --git a/skills/dogfood/references/issue-taxonomy.md b/skills/dogfood/references/issue-taxonomy.md deleted file mode 100644 index 27a678d61..000000000 --- a/skills/dogfood/references/issue-taxonomy.md +++ /dev/null @@ -1,83 +0,0 @@ -# Issue Taxonomy (Mobile) - -Reference for categorizing issues found during mobile dogfooding. 
- -## Severity Levels - -| Severity | Definition | -| ------------ | ------------------------------------------------------------------------- | -| **critical** | Blocks a core workflow, causes data loss, or crashes/freeze loops the app | -| **high** | Major feature broken or unusable, no practical workaround | -| **medium** | Feature works with notable friction or partial failure; workaround exists | -| **low** | Minor cosmetic or polish issue | - -## Categories - -### Visual / UI - -- Layout broken, clipped, overlapped, or unreadable text -- Safe-area/notch overlap issues -- Incorrect dark/light appearance rendering -- Missing assets/icons -- Animation glitches or flicker - -### Functional - -- Buttons/controls do nothing or trigger wrong action -- Flows fail (create/edit/delete/submit) -- Navigation dead-ends or wrong destination -- State loss after background/foreground transitions -- Deep link opens wrong screen or fails - -### UX - -- Confusing hierarchy or navigation labels -- Missing loading/progress feedback -- Unclear error handling or no recovery affordance -- Excessive steps for common tasks -- Inconsistent behavior between similar screens - -### Content - -- Typos, incorrect copy, placeholder text -- Wrong labels/help text -- Truncated text with no affordance -- Inconsistent terminology across screens - -### Performance - -- Slow startup or route transitions -- Input lag or gesture jank -- Scroll hitches/frame drops -- Notable battery/thermal symptoms during basic usage - -### Diagnostics / Logs - -- Native crashes or repeated fatal exceptions -- Repeated warnings correlated with broken behavior -- Unhandled runtime errors visible during repro - -### Permissions / Platform - -- Permission prompt flow broken or loops forever -- Denied permissions not handled gracefully -- Platform-specific regressions (iOS-only or Android-only) -- Background/foreground lifecycle regressions - -### Accessibility - -- Missing labels or incorrect accessibility names -- 
Focus order/navigation issues for assistive tech -- Low contrast or unreadable text scaling -- Touch targets too small for reliable interaction - -## Exploration Checklist - -1. Visual scan: capture screenshot; verify layout/safe areas/text/icon rendering. -2. Interactions: press controls, open menus/modals, validate expected response. -3. Forms/input: test valid/invalid/empty/boundary input. -4. Navigation: traverse all top-level sections and return paths. -5. App states: loading/empty/error/offline/permission-denied/background-resume. -6. Logs/diagnostics: inspect app logs when behavior is suspicious. -7. Platform parity: verify critical flows on each requested platform. -8. Accessibility basics: labels, touch target sizes, readability/contrast. diff --git a/skills/dogfood/templates/dogfood-report-template.md b/skills/dogfood/templates/dogfood-report-template.md deleted file mode 100644 index fd11566a6..000000000 --- a/skills/dogfood/templates/dogfood-report-template.md +++ /dev/null @@ -1,52 +0,0 @@ -# Dogfood Report: {APP_NAME} - -| Field | Value | -| -------------- | -------------- | -| **Date** | {DATE} | -| **Platform** | {PLATFORM} | -| **Target App** | {TARGET_APP} | -| **Session** | {SESSION_NAME} | -| **Scope** | {SCOPE} | - -## Summary - -| Severity | Count | -| --------- | ----- | -| Critical | 0 | -| High | 0 | -| Medium | 0 | -| Low | 0 | -| **Total** | **0** | - -## Issues - - - -### ISSUE-001: {Short title} - -| Field | Value | -| ------------------ | -------------------------------------------------------------------------------------------- | -| **Severity** | critical / high / medium / low | -| **Category** | visual / functional / ux / content / performance / diagnostics / permissions / accessibility | -| **Screen / Route** | {screen where issue was found} | -| **Repro Video** | {path to video, or N/A for static issues} | - -**Description** - -{What is wrong, what was expected, and what actually happened.} - -**Repro Steps** - -1. 
Open {screen/entry point} - ![Step 1](screenshots/issue-001-step-1.png) - -2. {Action} - ![Step 2](screenshots/issue-001-step-2.png) - -3. {Action} - ![Step 3](screenshots/issue-001-step-3.png) - -4. **Observe:** {broken behavior} - ![Result](screenshots/issue-001-result.png) - ---- diff --git a/skills/react-devtools/SKILL.md b/skills/react-devtools/SKILL.md index 521992353..5bebf10f0 100644 --- a/skills/react-devtools/SKILL.md +++ b/skills/react-devtools/SKILL.md @@ -1,55 +1,39 @@ --- name: react-devtools -description: Inspect and profile React Native component trees from agent-device. Use when debugging React Native props, state, hooks, render causes, slow components, excessive re-renders, or questions like why a component re-rendered. +description: Inspect and profile React Native component trees from agent-device. Use for React Native performance, profiling, props, state, hooks, render causes, slow components, excessive rerenders, or questions like why a component rerendered. --- # react-devtools -Use this skill when the task needs React Native internals that are not visible in the accessibility tree: component hierarchy, props, state, hooks, render causes, or profiling data. +Router for React Native internals. Private setup before using this skill: -Run commands through `agent-device react-devtools`. The command dynamically runs pinned `agent-react-devtools@0.4.0` and passes arguments through 1:1. +```bash +agent-device --version +``` -The first run may download the pinned package from npm. `agent-device` global flags work before or after `react-devtools`; use `--` before downstream flags only when they intentionally share an `agent-device` global flag name. +Require `agent-device >= 0.13.4`; older CLIs lack these help topics. If older, run `npm install -g agent-device@latest`, recheck, then continue. If you cannot upgrade, stop and tell the user. Do not include version/upgrade commands in final plans. -## Default flow +Read current CLI guidance: -1. 
Use `agent-device` to open the React Native app and verify the visible state when needed. -2. Check `agent-device react-devtools status`. -3. If no app is connected, start or wait for the devtools daemon, then reload or relaunch the app. -4. Inspect with `get tree`, `find`, and `get component`. -5. Profile only around the interaction being investigated. -6. Verify the fix with the same command sequence and interaction. +```bash +agent-device help react-devtools +``` -For cross-platform validation with explicit `--device`, `--udid`, or `--serial` selectors, prefer an isolated `--state-dir` over separate named sessions. Named sessions enable bound-session locks during setup. Restart `agent-device react-devtools` between iOS and Android runs so `status`, `get tree`, and profiling clearly refer to the currently launched app. +Use `agent-device react-devtools ...` for component tree, props, state, hooks, render ownership, performance profiling, slow components, or rerenders. It dynamically runs pinned `agent-react-devtools@0.4.0`. Use normal `agent-device` commands for visible UI, refs, screenshots, logs, network, or device-level perf. -## Main commands +Core loop: ```bash agent-device react-devtools status agent-device react-devtools wait --connected agent-device react-devtools get tree --depth 3 -agent-device react-devtools find -agent-device react-devtools get component @c5 agent-device react-devtools profile start +# perform the interaction with normal agent-device commands agent-device react-devtools profile stop agent-device react-devtools profile slow --limit 5 agent-device react-devtools profile rerenders --limit 5 ``` -## Decision rules - -- Need current UI text, refs, screenshots, logs, network, or device metrics: use the `agent-device` skill. -- Need props, state, hooks, component ownership, render causes, or React profiler data: use this skill. -- Start component-tree reads with `get tree --depth 3` or `find ` to keep output bounded. 
-- Labels like `@c5` reset when the app reloads or components remount. After reload, run `wait --connected` and inspect again. -- Profiling only captures renders between `profile start` and `profile stop`. -- On Android, set `adb reverse tcp:8097 tcp:8097` for React DevTools. If Metro is local, also set `adb reverse tcp:8081 tcp:8081`. -- For Android sessions connected through `agent-device connect --remote-config`, run `agent-device react-devtools ...` normally. The CLI registers a bridge companion tunnel to the local DevTools daemon on `127.0.0.1:8097` and unregisters it when the command exits. -- Remote Android React DevTools assumes the React Native-bundled DevTools behavior in React Native 0.83+. Do not assume older browser/Chromium DevTools workflows exist in remote sandboxes. For Expo apps, verify the SDK's bundled React Native version and runtime behavior first; no Expo SDK version is separately verified by this skill. - -## References +Rules: -| File | When to read | -| --------------------------------------- | --------------------------------------------- | -| [commands.md](references/commands.md) | Command reference and common inspection flows | -| [profiling.md](references/profiling.md) | Render profiling workflow and interpretation | +Keep reads bounded with `--depth`/`find`, treat `@c` refs as reload-local, profile only the investigated interaction, and run the same command in remote Android sessions; the CLI manages the needed local service tunnel. diff --git a/skills/react-devtools/references/commands.md b/skills/react-devtools/references/commands.md deleted file mode 100644 index 8b2cc11f5..000000000 --- a/skills/react-devtools/references/commands.md +++ /dev/null @@ -1,91 +0,0 @@ -# React DevTools Commands - -All commands are run through `agent-device react-devtools`. 
- -## Connection - -```bash -agent-device react-devtools start -agent-device react-devtools stop -agent-device react-devtools status -agent-device react-devtools wait --connected --timeout 30 -agent-device react-devtools wait --component --timeout 30 -``` - -- `status` shows the daemon port, connected apps, component count, profiling state, uptime, and last connection event. -- Most commands auto-start the daemon, but `start` is useful before launching or reloading the app. -- React Native development builds connect to the daemon on port 8097. For Android emulators or physical devices, use `adb reverse tcp:8097 tcp:8097` if the app cannot reach the host. If the app also uses local Metro, set `adb reverse tcp:8081 tcp:8081`. - -## Validation Notes - -- When validating the same app across iOS and Android with explicit `--device`, `--udid`, or `--serial` selectors, prefer an isolated `--state-dir` over separate named sessions. A named `--session` enables bound-session lock behavior, so setup commands with explicit target selectors can be rejected. -- Restart the React DevTools daemon between platforms so `status`, `get tree`, and profiling output belong to the currently launched app. -- Verify the app is visibly loaded with `snapshot` before collecting React internals. Use `react-devtools` for component state and profiling, not for proving the device/app surface is open. - -## Component Inspection - -```bash -agent-device react-devtools get tree --depth 3 -agent-device react-devtools get component @c5 -agent-device react-devtools find Button -agent-device react-devtools find Button --exact -agent-device react-devtools count -agent-device react-devtools errors -``` - -- `get tree` prints a component hierarchy with labels like `@c1`, `@c2`. -- Use `--depth` on large apps. Start at `--depth 3` or `--depth 4`. -- `get component` accepts a label or numeric React fiber id and shows props, state, and hooks. -- `find` searches by display name. 
Use `--exact` when fuzzy results are noisy. -- `errors` lists components with React-tracked warnings or errors. - -## Profiling - -```bash -agent-device react-devtools profile start "interaction name" -agent-device react-devtools profile stop -agent-device react-devtools profile slow --limit 5 -agent-device react-devtools profile rerenders --limit 5 -agent-device react-devtools profile report @c5 -agent-device react-devtools profile timeline --limit 20 -agent-device react-devtools profile commit 3 -agent-device react-devtools profile export profile.json -agent-device react-devtools profile diff before.json after.json --limit 10 -``` - -- `profile slow` ranks components by average render duration. -- `profile rerenders` ranks components by render count. -- `profile report @cN` shows render causes and changed props/state/hooks for one component. -- `profile timeline` lists commits. Use `--limit` and `--offset` for long sessions. -- `profile export` writes React DevTools Profiler JSON that can be diffed later. - -## Common Flows - -Inspect a component: - -```bash -agent-device react-devtools status -agent-device react-devtools get tree --depth 3 -agent-device react-devtools find SearchScreen -agent-device react-devtools get component @c12 -``` - -Profile a slow interaction: - -```bash -agent-device react-devtools profile start "slow search" -# Trigger the interaction with agent-device or ask the user to perform it. -agent-device react-devtools profile stop -agent-device react-devtools profile slow --limit 5 -agent-device react-devtools profile rerenders --limit 5 -``` - -Verify a render fix: - -```bash -agent-device react-devtools profile start "after fix" -# Repeat the same interaction. 
-agent-device react-devtools profile stop -agent-device react-devtools profile slow --limit 5 -agent-device react-devtools profile rerenders --limit 5 -``` diff --git a/skills/react-devtools/references/profiling.md b/skills/react-devtools/references/profiling.md deleted file mode 100644 index ee77d706c..000000000 --- a/skills/react-devtools/references/profiling.md +++ /dev/null @@ -1,74 +0,0 @@ -# React Native Profiling - -Use this workflow when the user reports slow interactions, excessive re-renders, unstable props, or unclear render causes. - -## Baseline - -```bash -agent-device react-devtools status -agent-device react-devtools count -agent-device react-devtools get tree --depth 3 -``` - -If the app is not connected, run: - -```bash -agent-device react-devtools start -agent-device react-devtools wait --connected -``` - -Then reload or relaunch the React Native app if needed. - -## Capture One Interaction - -```bash -agent-device react-devtools profile start "short label" -# Trigger exactly the interaction being investigated. -agent-device react-devtools profile stop -``` - -Keep the profiling window narrow. Extra navigation, warm-up work, or unrelated gestures make the report harder to interpret. - -## Identify Suspects - -```bash -agent-device react-devtools profile slow --limit 5 -agent-device react-devtools profile rerenders --limit 5 -``` - -- A component with high average render time is a slow-render suspect. -- A component with high render count is a re-render suspect. -- A component can be both. - -## Drill In - -```bash -agent-device react-devtools profile report @c12 -agent-device react-devtools get component @c12 -``` - -Use `profile report` to identify render causes and changed keys. Use `get component` to inspect current props, state, and hooks. 
- -Common interpretations: - -| Signal | Meaning | Typical follow-up | -| ------------------------------------------ | ----------------------------------- | ---------------------------------------------- | -| `props-changed` with function props | Parent may pass unstable callbacks | Check whether the parent can use `useCallback` | -| `props-changed` with object or array props | Parent may pass unstable references | Check whether the parent can use `useMemo` | -| `parent-rendered` with many child renders | Child has no bailout | Check whether `React.memo` is appropriate | -| `state-changed` | Component state caused the render | Check whether the state update is necessary | -| `hooks-changed` | Hook value or dependency changed | Inspect hook values and dependencies | - -## Verify - -After making a change, repeat the same interaction: - -```bash -agent-device react-devtools profile start "after fix" -# Repeat the same interaction. -agent-device react-devtools profile stop -agent-device react-devtools profile slow --limit 5 -agent-device react-devtools profile rerenders --limit 5 -``` - -Compare render counts, average durations, changed keys, and commit counts against the baseline. 
diff --git a/src/__tests__/cli-help.test.ts b/src/__tests__/cli-help.test.ts index ce6effe16..ecd0b11f3 100644 --- a/src/__tests__/cli-help.test.ts +++ b/src/__tests__/cli-help.test.ts @@ -43,12 +43,47 @@ test('connect help documents cloud auth environment origins', async () => { assert.match(result.stdout, /AGENT_DEVICE_DAEMON_AUTH_TOKEN/); }); -test('help react-devtools prints passthrough command help and skips daemon dispatch', async () => { +test('help react-devtools prints agent workflow topic and skips daemon dispatch', async () => { const result = await runCliCapture(['help', 'react-devtools']); assert.equal(result.code, 0); assert.equal(result.calls.length, 0); - assert.match(result.stdout, /Usage:\n agent-device react-devtools \[\.\.\.args\]/); - assert.match(result.stdout, /React Native component trees/); + assert.match(result.stdout, /agent-device help react-devtools/); + assert.match(result.stdout, /React Native performance\/profiling/); + assert.match(result.stdout, /agent-device react-devtools status/); +}); + +test('help workflow prints agent workflow topic and skips daemon dispatch', async () => { + const result = await runCliCapture(['help', 'workflow']); + assert.equal(result.code, 0); + assert.equal(result.calls.length, 0); + assert.match(result.stdout, /agent-device help workflow/); + assert.match(result.stdout, /Core loop:/); + assert.match(result.stdout, /Do not use CSS selectors/); +}); + +test('help workflow preserves known device workaround guidance', async () => { + const result = await runCliCapture(['help', 'workflow']); + assert.equal(result.code, 0); + assert.equal(result.calls.length, 0); + assert.match(result.stdout, /disabled\/hittable:false/); + assert.match(result.stdout, /snapshot -i -c --json/); + assert.match(result.stdout, /@Label_Name/); + assert.match(result.stdout, /press @e12/); + assert.match(result.stdout, /Snapshot legend:/); + assert.match(result.stdout, /preview="Leave at side\.\.\." 
truncated/); + assert.match(result.stdout, /wait text/); + assert.match(result.stdout, /Never use args/); + assert.match(result.stdout, /Never use args, step/); + assert.match(result.stdout, /scrollintoview/); + assert.match(result.stdout, /--delay-ms/); + assert.match(result.stdout, /Discovery is not enough when the task asks to open\/start/); + assert.match(result.stdout, /If the task says install, use install/); + assert.match(result.stdout, /do not inspect project files to find one/); + assert.match(result.stdout, /do not split clear\/restart/); + assert.match(result.stdout, /do not write network log headers/); + assert.match(result.stdout, /iOS Allow Paste prompt cannot be exercised under XCUITest/); + assert.match(result.stdout, /agent-device clipboard write "some text"/); + assert.match(result.stdout, /trusted ADB keyboard IME/); }); test('help unknown command prints error plus global usage and skips daemon dispatch', async () => { diff --git a/src/utils/__tests__/args.test.ts b/src/utils/__tests__/args.test.ts index a260f74f0..968e8d55b 100644 --- a/src/utils/__tests__/args.test.ts +++ b/src/utils/__tests__/args.test.ts @@ -758,37 +758,61 @@ test('usage includes concise top-level commands', () => { assert.match(usageText, /pinch \[x\] \[y\]/); assert.match(usageText, /rotate /); assert.match(usageText, /record start \[path\] \| record stop/); - assert.match(usageText, /trace start \[path\] \| trace stop/); + assert.match(usageText, /trace start \| trace stop /); }); test('usage includes only global flags in the top-level flags section', () => { const usageText = usage(); - assert.match(usageText, /--target mobile\|tv/); - assert.match(usageText, /--ios-simulator-device-set /); - assert.match(usageText, /--android-device-allowlist /); - assert.match(usageText, /--state-dir /); - assert.match(usageText, /--daemon-transport auto\|socket\|http/); - assert.match(usageText, /--daemon-server-mode socket\|http\|dual/); - assert.match(usageText, /--tenant /); - 
assert.match(usageText, /--session-isolation none\|tenant/); - assert.match(usageText, /--run-id /); - assert.match(usageText, /--lease-id /); - assert.match(usageText, /--lease-backend ios-simulator\|ios-instance\|android-instance/); - assert.doesNotMatch(usageText, /--relaunch/); - assert.doesNotMatch(usageText, /--header /); - assert.doesNotMatch(usageText, /--restart/); - assert.doesNotMatch(usageText, /--fps /); - assert.doesNotMatch(usageText, /--quality <5-10>/); - assert.doesNotMatch(usageText, /--save-script \[path\]/); - assert.doesNotMatch(usageText, /--metadata/); -}); - -test('usage includes skills, config, environment, and examples footers', () => { + const flagsSection = usageText.slice( + usageText.indexOf('Flags:'), + usageText.indexOf('Agent Quickstart:'), + ); + assert.match(flagsSection, /--target mobile\|tv/); + assert.match(flagsSection, /--ios-simulator-device-set /); + assert.match(flagsSection, /--android-device-allowlist /); + assert.match(flagsSection, /--state-dir /); + assert.match(flagsSection, /--daemon-transport auto\|socket\|http/); + assert.match(flagsSection, /--daemon-server-mode socket\|http\|dual/); + assert.match(flagsSection, /--tenant /); + assert.match(flagsSection, /--session-isolation none\|tenant/); + assert.match(flagsSection, /--run-id /); + assert.match(flagsSection, /--lease-id /); + assert.match(flagsSection, /--lease-backend ios-simulator\|ios-instance\|android-instance/); + assert.doesNotMatch(flagsSection, /--relaunch/); + assert.doesNotMatch(flagsSection, /--header /); + assert.doesNotMatch(flagsSection, /--restart/); + assert.doesNotMatch(flagsSection, /--fps /); + assert.doesNotMatch(flagsSection, /--quality <5-10>/); + assert.doesNotMatch(flagsSection, /--save-script \[path\]/); + assert.doesNotMatch(flagsSection, /--metadata/); +}); + +test('usage includes agent workflows, config, environment, and examples footers', () => { const usageText = usage(); - assert.match(usageText, /Agent Skills:/); - 
assert.match(usageText, /agent-device\s+Canonical mobile automation flows/); - assert.match(usageText, /dogfood\s+Exploratory QA and bug hunts/); - assert.match(usageText, /See `skills\/\/SKILL\.md` in the installed package\./); + assert.match(usageText, /Agent Quickstart:/); + assert.match(usageText, /Default loop: devices\/apps -> open -> snapshot -i/); + assert.match(usageText, /Use selectors or refs as positional targets/); + assert.match(usageText, /Plain snapshot reads state; snapshot -i is required/); + assert.match(usageText, /Truncated text\/input preview: expand first with snapshot -s @e12/); + assert.match(usageText, /RN warning\/error overlays can block taps: snapshot -i/); + assert.match(usageText, /Expo Go\/dev clients: use the provided URL when given/); + assert.match(usageText, /if only a target name is given, open that target/); + assert.match(usageText, /Install flows: install\/install-from-source first/); + assert.match(usageText, /fill 'id="field-email"' "qa@example\.com" replaces/); + assert.match(usageText, /After mutation: diff snapshot -i/); + assert.match(usageText, /app-owned back uses back/); + assert.match(usageText, /logs clear --restart\/mark\/path/); + assert.match(usageText, /trace start \.\/path; trace stop \.\/path/); + assert.match(usageText, /network dump --include headers/); + assert.match(usageText, /Full operating guide: agent-device help workflow/); + assert.match(usageText, /Exploratory QA: agent-device help dogfood/); + assert.match(usageText, /Agent Workflows:/); + assert.match(usageText, /help workflow\s+Normal bootstrap, exploration, and validation loop/); + assert.match(usageText, /help debugging\s+Logs, network, alerts, diagnostics, and traces/); + assert.match( + usageText, + /help react-devtools\s+React Native performance, profiling, component tree, and renders/, + ); assert.match(usageText, /Configuration:/); assert.match( usageText, @@ -811,6 +835,106 @@ test('usage includes skills, config, environment, and 
examples footers', () => { assert.match(usageText, /agent-device test \.\/suite --platform android/); }); +test('usageForCommand resolves workflow help topic', () => { + const help = usageForCommand('workflow'); + if (help === null) throw new Error('Expected workflow help text'); + assert.match(help, /agent-device help workflow/); + assert.match(help, /Use selectors as positional targets/); + assert.match(help, /Do not use CSS selectors/); + assert.match(help, /Snapshot legend:/); + assert.match(help, /@e12 \[button\] label="Add to cart"/); + assert.match(help, /Truncated text\/input previews: do not use get text first/); + assert.match(help, /snapshot -s @e7/); + assert.match(help, /Read-only visible\/state question: use snapshot\/get\/is\/find/); + assert.match(help, /Use snapshot -i only when refs are needed/); + assert.match(help, /install-from-source --github-actions-artifact org\/repo:app-debug/); + assert.match(help, /Discovery is not enough when the task asks to open\/start/); + assert.match(help, /If the task says install, use install/); + assert.match(help, /Do not open artifact paths or invent package ids/); + assert.match(help, /agent-device get attrs @e4/); + assert.match(help, /Ambiguous find: add --first or --last/); + assert.match(help, /report that gap instead of typing\/searching\/navigating/); + assert.match(help, /If snapshot -i shows one, dismiss\/close its visible control/); + assert.match(help, /iOS Allow Paste prompt cannot be exercised under XCUITest/); + assert.match(help, /agent-device clipboard write "some text"/); + assert.match(help, /trusted ADB keyboard IME/); + assert.match(help, /if no URL is provided but a target\/app name is provided, open that target/); + assert.match(help, /do not split clear\/restart/); + assert.match(help, /do not write network log headers/); + assert.match(help, /agent-device open exp:\/\/127\.0\.0\.1:8081 --platform ios/); + assert.match(help, /agent-device open "Expo Go" exp:\/\/127\.0\.0\.1:8081 
--platform ios/); + assert.match(help, /agent-device open exp:\/\/127\.0\.0\.1:8081 --platform android/); + assert.match(help, /apps lookup misses the project but shows Expo Go\/dev-client/); + assert.match(help, /metro prepare --kind expo/); + assert.match(help, /help react-devtools/); +}); + +test('workflow help keeps common copyable command forms', () => { + const help = usageForCommand('workflow'); + if (help === null) throw new Error('Expected workflow help text'); + assert.match(help, /network dump --include headers/); + assert.match(help, /settings animations off/); + assert.match(help, /connect --remote-config/); + assert.match(help, /metro reload/); + assert.match(help, /screenshot --overlay-refs/); + assert.match(help, /snapshot -s @e7/); + assert.match(help, /clipboard write "some text"/); +}); + +test('usageForCommand resolves remote help topic', () => { + const help = usageForCommand('remote'); + if (help === null) throw new Error('Expected remote help text'); + assert.match(help, /agent-device open com\.example\.app --remote-config \.\/remote-config\.json/); + assert.match(help, /disconnect --remote-config \.\/remote-config\.json/); + assert.match(help, /Script flow, per-command config/); + assert.match(help, /same --remote-config to every operational command/); + assert.match(help, /install-from-source --github-actions-artifact org\/repo:artifact/); +}); + +test('usageForCommand resolves macos help topic', () => { + const help = usageForCommand('macos'); + if (help === null) throw new Error('Expected macos help text'); + assert.match(help, /agent-device click @e66 --button secondary --platform macos/); + assert.match(help, /Context menus are not ambient UI/); + assert.match(help, /menu-item refs/); +}); + +test('usageForCommand resolves dogfood help topic', () => { + const help = usageForCommand('dogfood'); + if (help === null) throw new Error('Expected dogfood help text'); + assert.match(help, /agent-device help dogfood/); + assert.match(help, /Find 
user-visible issues from runtime behavior/); + assert.match(help, /Severity: critical blocks a core flow\/data\/crashes/); + assert.match(help, /Interactive\/behavioral issues need step screenshots/); + assert.match(help, /Static\/on-load issues can use one screenshot/); + assert.match(help, /React Native warning\/error overlays can be real findings/); + assert.match(help, /Expo Go\/dev-client shells/); + assert.match(help, /dogfood-output\/report\.md/); + assert.match(help, /ID, severity, category, title, affected flow\/screen/); + assert.match(help, /Never delete screenshots, videos, traces, or report artifacts/); + assert.match(help, /screenshot \.\/dogfood-output\/screenshots\/issue-001\.png --overlay-refs/); +}); + +test('usageForCommand resolves react-devtools help topic', () => { + const help = usageForCommand('react-devtools'); + if (help === null) throw new Error('Expected react-devtools help text'); + assert.match(help, /agent-device react-devtools start/); + assert.match(help, /agent-device react-devtools wait --component /); + assert.match(help, /agent-device react-devtools find --exact/); + assert.match(help, /agent-device react-devtools errors/); + assert.match(help, /agent-device react-devtools profile report @c5/); + assert.match(help, /agent-device react-devtools profile timeline --limit 20/); + assert.match(help, /agent-device react-devtools profile export profile\.json/); + assert.match( + help, + /agent-device react-devtools profile diff before\.json after\.json --limit 10/, + ); + assert.match(help, /render causes and changed props\/state\/hooks/); + assert.match(help, /@c refs reset after reload\/remount/); + assert.match(help, /isolated --state-dir/); + assert.match(help, /local service tunnel/); +}); + test('apps defaults to --all filter and allows overrides', () => { const defaultFilter = parseArgs(['apps'], { strictFlags: true }); assert.equal(defaultFilter.command, 'apps'); diff --git a/src/utils/command-schema.ts 
b/src/utils/command-schema.ts index e93232f3d..e5d849391 100644 --- a/src/utils/command-schema.ts +++ b/src/utils/command-schema.ts @@ -158,10 +158,40 @@ const SELECTOR_SNAPSHOT_FLAGS = [ const FIND_SNAPSHOT_FLAGS = ['snapshotDepth', 'snapshotRaw'] as const satisfies readonly FlagKey[]; -const AGENT_SKILLS = [ - { label: 'agent-device', description: 'Canonical mobile automation flows' }, - { label: 'react-devtools', description: 'React Native component tree and render profiling' }, - { label: 'dogfood', description: 'Exploratory QA and bug hunts' }, +const AGENT_WORKFLOWS = [ + { label: 'help workflow', description: 'Normal bootstrap, exploration, and validation loop' }, + { label: 'help debugging', description: 'Logs, network, alerts, diagnostics, and traces' }, + { + label: 'help react-devtools', + description: 'React Native performance, profiling, component tree, and renders', + }, + { + label: 'help remote', + description: 'Remote/cloud config, tenants, leases, and local service tunnels', + }, + { label: 'help macos', description: 'Desktop, frontmost-app, and menu bar surfaces' }, + { label: 'help dogfood', description: 'Exploratory QA report workflow' }, +] as const; + +const AGENT_QUICKSTART_LINES = [ + 'Default loop: devices/apps -> open -> snapshot -i -> press/fill/get/is/wait/find -> verify -> close.', + 'Use selectors or refs as positional targets: id="submit", label="Allow", or @e12 from snapshot -i.', + 'Plain snapshot reads state; snapshot -i is required to refresh interactive refs.', + 'Read-only visible/state question: use snapshot/get/is/find; use snapshot -i only when refs are needed.', + 'Truncated text/input preview: expand first with snapshot -s @e12, not get text.', + 'RN warning/error overlays can block taps: snapshot -i, dismiss/close, then diff snapshot -i.', + 'Expo Go/dev clients: use the provided URL when given; if only a target name is given, open that target and do not search project files for a URL.', + 'Install flows: 
install/install-from-source first, then open the installed id with --relaunch.', + 'Text: fill \'id="field-email"\' "qa@example.com" replaces; type appends after press.', + 'Clipboard limits: iOS Allow Paste cannot be automated through XCUITest; prefill with clipboard write. Android non-ASCII should use fill/type, not raw adb input.', + 'After mutation: diff snapshot -i. Off-screen hints: scroll, then snapshot -i.', + 'Raw coordinates are fallback-only: use snapshot -i -c --json rects when iOS refs no-op or child refs are missing.', + 'Batch JSON steps use "command", "positionals", "flags"; never "args" or "step".', + 'Navigation: app-owned back uses back; system back uses back --system.', + 'Verification commands must name the expected text/selector; bare screenshots/snapshots are not enough.', + 'Debug evidence: logs clear --restart/mark/path; trace start ./path; trace stop ./path; network dump --include headers.', + 'Use agent-device commands in final plans; raw platform tools, pseudo commands, and helper prose are wrong.', + 'Full operating guide: agent-device help workflow. Exploratory QA: agent-device help dogfood.', ] as const; const CONFIGURATION_LINES = [ @@ -194,6 +224,361 @@ const EXAMPLE_LINES = [ 'agent-device test ./suite --platform android', ] as const; +const HELP_TOPICS = { + workflow: { + summary: 'Normal agent-device bootstrap, exploration, and validation loop', + body: `agent-device help workflow + +Version-matched operating guide for normal agent-device work. + +Core loop: + devices/apps -> open -> snapshot or snapshot -i -> get/is/find/wait or press/fill/scroll/back -> verify -> close + +Command shape: + Plans should use agent-device commands, not raw platform tools, pseudo commands, package-manager aliases, or helper prose. 
+ Put subcommand first, then positionals, then flags: + agent-device open com.example.app --session checkout --platform android --relaunch + agent-device record start ./checkout.mp4 --session checkout + Snapshot refs look like @e12. After snapshot -i, use the exact @eN ref from that output. + If the exact ref is not known yet, first output snapshot -i, then use a concrete example shape like press @e12 in the next command; do not write @, @ref, @Label_Name, or @eN placeholders. + Close means agent-device close. App-owned back means back; system back means back --system. + Taps are press or click. Gestures are direct commands: swipe, longpress, pinch. + +Bootstrap: + agent-device devices --platform ios + agent-device apps --platform android + agent-device open MyApp --platform ios --device "iPhone 17 Pro" + agent-device open --session checkout --platform android + agent-device install com.example.app ./dist/app.apk --platform android + agent-device reinstall com.example.app ./build/MyApp.app --platform ios + agent-device install-from-source --github-actions-artifact org/repo:app-debug --platform android + agent-device open com.example.app --platform android --relaunch + If app id is unknown, plan devices, apps, then open . Discovery is not enough when the task asks to open/start the app. + Install arguments are app/package id then artifact path. If the task says install, use install; use reinstall only when explicitly requested. Fresh runtime state is open --relaunch after install. + Do not open artifact paths or invent package ids. If apps lookup misses the target and no URL/artifact is provided, ask or stop. + +Snapshots and refs: + snapshot reads visible state. snapshot -i gets current interactive refs. + Snapshot legend: + @e12 [button] label="Add to cart" id="add-cart" enabled hittable -> press @e12 or press 'id="add-cart"'. + @e13 [textinput] label="Notes" preview="Leave at side..." truncated -> snapshot -s @e13 before reading. 
+ [off-screen below] 4 items: "Privacy", "About" -> scroll down, then snapshot -i; those are hints, not refs. + Re-snapshot after navigation, submit, modal/list/reload/dynamic changes. + Off-screen summaries are scroll hints; use scroll, not swipe, then snapshot -i. + Missing target in a long list: use a short manual scroll + snapshot loop with a max attempt count; do not rely on unbounded scrollintoview. + Truncated text/input previews: do not use get text first; expand with snapshot -s @ref (for example snapshot -s @e7), then read the scoped output. + Rare iOS accessibility gaps: if a row ref is shown disabled/hittable:false and press @ref reports success but no UI change, or a horizontal tab/filter bar is collapsed into one composite/seekbar with no child refs, run agent-device snapshot -i -c --json to read rects, compute the target center, press x y, then diff snapshot -i. Coordinates are fallback-only; document why you used them. + +Selectors: + Use selectors as positional targets: id="field-email" or label="Allow". + Do not use CSS selectors, pseudo refs, --selector, --text, or raw x/y when refs/selectors exist. + agent-device fill 'id="catalog-search"' "tart" --delay-ms 80 + agent-device press 'id="submit-order"' + agent-device is visible 'label="Online"' + agent-device get text 'id="quantity-value"' + +Text entry: + fill replaces; type appends to focused field. + agent-device fill @e5 "qa@example.com" + agent-device fill 'id="field-email"' "qa@example.com" + agent-device press 'id="product-note"' + agent-device type "Handle with care" --delay-ms 80 + Debounced field with no result selector: agent-device wait 1000. Keyboard read-only: keyboard status/get. Blocked control: keyboard dismiss. + Search-as-you-type fields on iOS can drop characters when driven too fast; use --delay-ms on fill/type before trying clipboard paste. + iOS Allow Paste prompt cannot be exercised under XCUITest. 
To test paste-driven app behavior, prefill first with agent-device clipboard write "some text"; test the system prompt manually. + Android non-ASCII can fail on some system images. Try fill/type normally; agent-device uses safer fallbacks. If the shell reports unsupported non-ASCII input, configure a trusted ADB keyboard IME outside the command plan and restore the previous IME afterward. + +Read-only and waits: + Read-only visible/state question: use snapshot/get/is/find. + agent-device snapshot + agent-device get text 'id="product-title"' + agent-device get attrs @e4 + agent-device is visible 'label="Online"' + agent-device wait text "Refreshing metrics..." 3000 + agent-device wait 'label="Ready"' 3000 + agent-device find "Increment" press --json + For async/list text presence, prefer wait text over is visible when no interaction is needed. + Use snapshot -i only when refs are needed for an action or targeted query. + Ambiguous find: add --first or --last. If info is not visible/exposed, report that gap instead of typing/searching/navigating to reveal it. + +Navigation and gestures: + Use scroll for lists; swipe for coordinate gestures/carousels. + If app-owned back is ambiguous or has just misrouted, prefer a visible nav/back button ref, tab-bar ref, or deep link over repeated back/system back. + Keep count/pause/pattern on one swipe; flags are --count, --pause-ms, --pattern ping-pong. + longpress duration and pinch scale/center are positional: + agent-device longpress 300 500 800 + agent-device swipe 320 500 40 500 --count 8 --pause-ms 30 --pattern ping-pong + agent-device pinch 0.5 200 400 + +Validation and evidence: + Nearby mutation diff: agent-device diff snapshot -i. + Expected text/selector verification must include the exact text or selector via wait, is, get, or find; bare screenshots/snapshots are insufficient for named expectations. + Prefer provided testIDs/ids/selectors for verification; use visible text when no durable selector is provided. 
+ If task says snapshot, use snapshot. If it asks visual evidence, use screenshot. + Icon/tappable visual proof: screenshot --overlay-refs. Flag is --overlay-refs. + Startup/CPU/memory: perf --json or metrics. Replay maintenance: replay -u ./flow.ad. + Recording: record start/stop. Tracing: trace start ./trace.log, trace stop ./trace.log. Paths are positional. + Stable known flow: batch ./steps.json, not workflow batch. + Inline batch JSON example: + agent-device batch --steps '[{"command":"open","positionals":["settings"],"flags":{}},{"command":"wait","positionals":["100"],"flags":{}}]' + Batch step keys are command, positionals, flags, and optional runtime. Never use args, step, text, or target as batch step fields. + Android animations: settings animations off/on, not animations disable/restore. + Debug logs: logs clear --restart, logs mark, reproduce, then logs path; do not split clear/restart into separate stop/start commands. + Network headers: network dump --include headers; do not write network log headers. + Remote config: connect --remote-config ./remote-config.json, open, snapshot, disconnect. + macOS menu bar: open ... --platform macos --surface menubar; snapshot -i --platform macos --surface menubar. + +React Native dev loop: + JS-only change with Metro connected: + agent-device metro reload + agent-device find "Home" + Do not use agent-device reload. Use open --relaunch for native startup reset. + Warning/error overlays can obscure UI and intercept taps. If snapshot -i shows one, dismiss/close its visible control (for example Dismiss or Close) if it is not the task target, then diff snapshot -i or snapshot -i before tapping the real UI. + Expo Go is a host shell. Use a provided project URL instead of inventing a bundle id; if no URL is provided but a target/app name is provided, open that target and do not inspect project files to find one. 
iOS simulators can open a URL directly; use host + URL when targeting a specific host shell: + agent-device open exp://127.0.0.1:8081 --platform ios + agent-device open "Expo Go" exp://127.0.0.1:8081 --platform ios + Android uses the URL target directly; do not write open there: + agent-device open exp://127.0.0.1:8081 --platform android + If apps lookup misses the project but shows Expo Go/dev-client and a project URL is available, open the URL/host shell; if no URL is available, ask instead of inventing an app id. + Expo Dev Client/development builds: open the installed dev-client app id/name; if a dev-client URL is provided, open that URL next. For Metro setup use metro prepare --kind expo. + +React DevTools minimum loop: + Keep the agent-device react-devtools prefix on every React DevTools command: + agent-device react-devtools status + agent-device react-devtools wait --connected + agent-device react-devtools profile start + interact with normal agent-device commands + agent-device react-devtools profile stop + agent-device react-devtools profile slow --limit 5 + agent-device react-devtools profile rerenders --limit 5 + +Escalate: + help debugging logs, network, alerts, traces, flaky runtime failures + help react-devtools React Native performance, profiling, props/state/hooks, slow renders, rerenders + help remote remote/cloud config, tenant, lease, local service tunnels + help macos desktop, frontmost-app, menu bar surfaces + help dogfood exploratory QA report workflow`, + }, + debugging: { + summary: 'Targeted failure evidence without dumping stale context', + body: `agent-device help debugging + +Use this when behavior fails, hangs, times out, throws alerts, or needs runtime evidence. + +Logs: + Keep log windows small. Prefer clear, mark, reproduce, then path. 
+ agent-device logs clear --restart + agent-device logs mark "before diagnostics retry" + agent-device press 'id="load-diagnostics"' + agent-device logs path + Do not cat a full stale log into agent context. Open or grep only the relevant window when needed. + logs clear --restart is the compact command to clear old logs and start a fresh capture; do not split it into logs stop, logs clear, logs start. + +Network: + Use network dump for recent session HTTP traffic parsed from app logs. + agent-device network dump --include headers + agent-device network dump 20 --include all + Use this instead of logs path when the question is request/response metadata. + network log is a supported alias, but network dump --include headers is the clearest plan form. Do not write network log headers. + +Alerts: + Native alerts: + agent-device alert wait 3000 + agent-device alert accept + agent-device alert dismiss + If alert accept says no alert but a permission sheet is visibly on screen, treat it as normal UI: + agent-device snapshot -i + agent-device press 'label="Allow"' + +Diagnostics and traces: + Use --debug for CLI/daemon diagnostic ids and log paths. + Use trace for low-level session diagnostics around one repro: + agent-device trace start ./traces/diagnostics.trace + agent-device press 'id="load-diagnostics"' + agent-device trace stop ./traces/diagnostics.trace + The trace path is positional. Do not use --path for trace start or trace stop. + +Stabilizers: + Android animation-sensitive flows: + agent-device settings animations off + agent-device snapshot + agent-device settings animations on + Re-enable settings you changed before finishing. 
+ +React Native internals: + If the question is about React Native performance, profiling, props, state, hooks, render causes, slow components, or rerenders, use help react-devtools instead of inferring from screenshots or logs.`, + }, + 'react-devtools': { + summary: 'React Native performance, profiling, and component internals', + body: `agent-device help react-devtools + +Use this for React Native performance/profiling and internals that the accessibility tree cannot expose: components, props, state, hooks, ownership, slow renders, and rerenders. + +Core commands: + agent-device react-devtools start + agent-device react-devtools stop + agent-device react-devtools status + agent-device react-devtools wait --connected + agent-device react-devtools wait --component + agent-device react-devtools count + agent-device react-devtools get tree --depth 3 + agent-device react-devtools find + agent-device react-devtools find --exact + agent-device react-devtools get component @c5 + agent-device react-devtools errors + agent-device react-devtools profile start + agent-device react-devtools profile stop + agent-device react-devtools profile slow --limit 5 + agent-device react-devtools profile rerenders --limit 5 + agent-device react-devtools profile report @c5 + agent-device react-devtools profile timeline --limit 20 + agent-device react-devtools profile export profile.json + agent-device react-devtools profile diff before.json after.json --limit 10 + +Profiling loop: + 1. Verify the app is connected: react-devtools status, then wait --connected if needed. + 2. Start profiling immediately before the interaction. + 3. Drive the interaction with normal agent-device commands. + 4. Stop profiling. + 5. Inspect slow components and rerenders. + 6. Use profile report @cN for render causes and changed props/state/hooks; use get component @cN for current props/state/hooks. + +Rules: + Start with get tree --depth 3 or find ; use find --exact when fuzzy results are noisy. 
+ @c refs reset after reload/remount. After reload, wait --connected and inspect again. + Keep the profile window narrow; unrelated navigation makes render data noisy. + For cross-platform validation with explicit device selectors, prefer isolated --state-dir and restart react-devtools between platforms. + Remote Android runs normally through agent-device react-devtools; the CLI manages the needed local service tunnel. Expo support depends on the SDK's bundled React Native runtime. + +Example: + agent-device react-devtools status + agent-device react-devtools wait --connected + agent-device react-devtools profile start + agent-device fill 'id="catalog-search"' "tart" --delay-ms 80 + agent-device react-devtools profile stop + agent-device react-devtools profile slow --limit 5 + agent-device react-devtools profile rerenders --limit 5 + agent-device react-devtools profile report @c5 + +Use snapshot, screenshot, logs, network, and perf for device/app runtime evidence. Use react-devtools only when component internals or React rendering behavior matters.`, + }, + remote: { + summary: 'Remote config, tenant, lease, and remote host flow', + body: `agent-device help remote + +Use remote config when a profile owns daemon URL, auth, tenant, run, lease, device scope, and Metro hints. Do not restate those as individual flags unless overriding intentionally. + +Normal flow: + agent-device connect --remote-config ./remote-config.json + agent-device open com.example.app + agent-device snapshot + agent-device disconnect + +Script flow, per-command config: + agent-device open com.example.app --remote-config ./remote-config.json + agent-device snapshot --remote-config ./remote-config.json + agent-device disconnect --remote-config ./remote-config.json + +Rules: + connect and disconnect are top-level commands. Do not write agent-device remote connect or agent-device remote disconnect. 
+ Prefer --remote-config over --daemon-base-url, --tenant, --run-id, and --lease-id in ordinary remote flows. + For self-contained scripts, pass the same --remote-config to every operational command, including disconnect; a preceding connect is allowed but not required. + For remote artifact installs, use install-from-source or install-from-source --github-actions-artifact org/repo:artifact; do not download CI artifacts locally first. + After connect, let the active remote connection supply runtime hints. + For remote Android React DevTools, run agent-device react-devtools normally. The CLI opens the needed local service tunnel for the DevTools daemon and cleans it up when the command exits. + Use --debug when remote connection or transport errors need diagnostic ids and remote log hints.`, + }, + macos: { + summary: 'macOS desktop, frontmost-app, and menu bar surfaces', + body: `agent-device help macos + +Use macOS only when the task targets desktop apps, desktop surfaces, or menu bar extras. + +Open and inspect: + agent-device open TextEdit --platform macos + agent-device snapshot -i --platform macos + +Surfaces: + --surface app normal app session + --surface frontmost-app inspect whichever app is frontmost + --surface desktop desktop-wide surface + --surface menubar menu bar extras and menu bar-only apps + +Menu bar app example: + agent-device open "Agent Device Tester Menu" --platform macos --surface menubar + agent-device snapshot -i --platform macos --surface menubar + +Context menu example: + agent-device click @e66 --button secondary --platform macos + agent-device snapshot -i --platform macos + +Rules: + Use open and snapshot -i for menu bar inspection. Do not output inspect as a command. + Context menus are not ambient UI: secondary-click a visible target, then re-snapshot and use the new menu-item refs. + Do not let iOS simulator-set scoping hide macOS desktop targets. + Prefer refs/selectors over raw coordinates.
+ macOS snapshot rects are window-space; use current refs or overlay refs instead of guessing coordinates.`, + }, + dogfood: { + summary: 'Exploratory QA workflow with reproducible evidence', + body: `agent-device help dogfood + +Use this when asked to dogfood, exploratory test, bug hunt, QA, or find issues in an app. + +Goal: + Find user-visible issues from runtime behavior. Do not read app source or invent findings from code. + Produce a concise report with severity, repro commands, expected/actual behavior, and evidence paths. + +Loop: + 1. Identify target app/platform; ask only if missing. + 2. Create output dirs and open a named session. If auth or OTP is required, sign in or ask the user for the code. + 3. Capture baseline snapshot -i and screenshot. + 4. Map top-level navigation, then exercise primary flows and edge states. + 5. For each issue, capture evidence and write the finding immediately, then continue. + 6. Close the session and reconcile the report summary. + +Coverage: + Navigation, forms, empty/error/loading states, offline or retry behavior, permissions, settings, accessibility labels, orientation/keyboard, and obvious performance stalls. + React Native warning/error overlays can be real findings or test blockers. Capture them, dismiss if unrelated, re-snapshot, and report them. + Expo Go/dev-client shells: use the provided exp:// or dev-client URL and record whether the shell, project load, or app UI is being tested. + Categories: visual, functional, UX, content, performance, diagnostics, permissions, accessibility. + Severity: critical blocks a core flow/data/crashes; high breaks a major feature; medium has friction or workaround; low is polish. 
+ +Evidence commands: + mkdir -p ./dogfood-output/screenshots ./dogfood-output/videos ./dogfood-output/traces + agent-device --session qa open --platform ios + agent-device --session qa snapshot -i + agent-device --session qa screenshot ./dogfood-output/screenshots/initial.png + agent-device --session qa screenshot ./dogfood-output/screenshots/issue-001.png --overlay-refs + agent-device --session qa logs clear --restart + agent-device --session qa logs mark "issue-001 repro" + agent-device --session qa logs path + agent-device --session qa record start ./dogfood-output/videos/issue-001.mp4 + agent-device --session qa record stop + agent-device --session qa close + +Evidence rules: + Interactive/behavioral issues need step screenshots and usually a repro video. + Static/on-load issues can use one screenshot; set repro video to N/A. + Use screenshot --overlay-refs when showing the tappable target or broken state helps repro. + +Report shape: + ./dogfood-output/report.md + Include date, platform, target app, session, scope, severity counts, and issues. + For each finding: ID, severity, category, title, affected flow/screen, repro commands, expected, actual, evidence files, notes. + Target 5-10 well-evidenced issues when available. If no issues are found, report coverage completed and residual risk instead of claiming the app is bug-free. + +Rules: + Findings must come from observed runtime behavior, not source reads. + Re-snapshot after each mutation. + Keep commands in the report reproducible; use selectors or refs from fresh snapshots, not guessed coordinates. + Prefer refs for exploration and selectors for deterministic replay. + Use logs, network, screenshot --overlay-refs, trace, perf, or react-devtools only when they add evidence to a specific issue. + Never delete screenshots, videos, traces, or report artifacts during a session. 
+ Escalate to help debugging or help react-devtools when runtime symptoms require those tools.`, + }, +} as const satisfies Record; + const FLAG_DEFINITIONS: readonly FlagDefinition[] = [ { key: 'config', @@ -1224,8 +1609,8 @@ const COMMAND_SCHEMAS: Record = { usageOverride: 'react-devtools [...args]', listUsageOverride: 'react-devtools [...args]', helpDescription: - 'Run pinned agent-react-devtools commands for React Native component trees, props/state/hooks, and render profiling', - summary: 'Inspect and profile React Native component trees', + 'Run pinned agent-react-devtools commands for React Native performance profiling, component trees, props/state/hooks, and render analysis', + summary: 'Profile React Native performance and component renders', positionalArgs: ['args?'], allowsExtraPositionals: true, allowedFlags: [], @@ -1290,8 +1675,9 @@ const COMMAND_SCHEMAS: Record = { }, get: { usageOverride: 'get text|attrs <@ref|selector>', - helpDescription: 'Return element text/attributes by ref or selector', - summary: 'Get text or attrs by ref or selector', + helpDescription: + 'Return exposed element text/attributes by ref or selector; use snapshot -s @ref for truncated previews', + summary: 'Get exposed text or attrs by ref or selector', positionalArgs: ['subcommand', 'target'], allowedFlags: [...SELECTOR_SNAPSHOT_FLAGS], }, @@ -1409,9 +1795,10 @@ const COMMAND_SCHEMAS: Record = { allowedFlags: ['fps', 'quality', 'hideTouches'], }, trace: { - usageOverride: 'trace start [path] | trace stop [path]', - listUsageOverride: 'trace start [path] | trace stop', - helpDescription: 'Start/stop trace log capture', + usageOverride: 'trace start | trace stop ', + listUsageOverride: 'trace start | trace stop ', + helpDescription: + 'Start/stop trace log capture; when an artifact path is requested, pass the same positional path to start and stop', summary: 'Start or stop trace capture', positionalArgs: ['start|stop', 'path?'], allowedFlags: [], @@ -1565,10 +1952,8 @@ CLI to 
control iOS and Android devices for AI agents. const helpFlags = listHelpFlags(GLOBAL_FLAG_KEYS); const flagsSection = renderFlagSection('Flags:', helpFlags); - const skillsSection = [ - renderAlignedSection('Agent Skills:', AGENT_SKILLS), - 'See `skills//SKILL.md` in the installed package.', - ].join('\n\n'); + const quickstartSection = renderTextSection('Agent Quickstart:', AGENT_QUICKSTART_LINES); + const workflowsSection = renderAlignedSection('Agent Workflows:', AGENT_WORKFLOWS); const configSection = renderTextSection('Configuration:', CONFIGURATION_LINES); const environmentSection = renderAlignedSection('Environment:', ENVIRONMENT_LINES); const examplesSection = renderTextSection('Examples:', EXAMPLE_LINES); @@ -1578,7 +1963,9 @@ ${commandLines} ${flagsSection} -${skillsSection} +${quickstartSection} + +${workflowsSection} ${configSection} @@ -1648,6 +2035,8 @@ function renderCommandSection( } export function buildCommandUsageText(commandName: string): string | null { + const topicHelp = buildHelpTopicUsageText(commandName); + if (topicHelp) return topicHelp; const schema = getCommandSchema(commandName); if (!schema) return null; const usage = buildCommandUsage(commandName, schema); @@ -1669,3 +2058,15 @@ Usage: ${sections.join('\n\n')} `; } + +function buildHelpTopicUsageText(topicName: string): string | null { + const topic = HELP_TOPICS[topicName as keyof typeof HELP_TOPICS]; + if (!topic) return null; + return `${topic.body} + +Related: + agent-device help command list and global flags + agent-device help command-specific flags + agent-device help workflow normal app automation loop +`; +} diff --git a/test/skillgym/README.md b/test/skillgym/README.md index 39ffe4b0f..3e7bfe125 100644 --- a/test/skillgym/README.md +++ b/test/skillgym/README.md @@ -16,7 +16,7 @@ The included suite focuses on the first two layers so it stays stable and CI-saf - `../../examples/test-app/`: minimal Expo SDK 55 fixture app for broad UI coverage - `skillgym.config.ts`: starter 
config that runs Codex and Claude Haiku against this repo -- `suites/agent-device-smoke-suite.ts`: 48-case suite for skill routing, fixture-aware planning, and skill-guidance regressions +- `suites/agent-device-smoke-suite.ts`: 66-case suite for skill routing, fixture-aware planning, and skill-guidance regressions ## Current coverage @@ -35,16 +35,16 @@ Fixture smoke cases cover concrete app surfaces: Skill-guidance regression cases cover distinct command-planning habits: - read-only inspection versus mutation -- fresh `@ref` targeting, durable selectors, and off-screen scroll recovery +- fresh `@ref` targeting, durable selectors, raw-rect fallbacks, and off-screen scroll recovery - text replacement, append semantics, keyboard status, and keyboard dismiss -- install/open setup, app discovery, session scoping, and in-app back navigation +- install/open setup, app discovery, session scoping, and app-owned navigation fallbacks - Metro reload, logs, network dump, alert fallback, and screenshot evidence - performance metrics, React DevTools profiling, gestures, settings, and trace capture -- remote config, macOS menu bar surfaces, replay update, and batch during recording +- remote config, macOS menu bar surfaces, replay update, and batch schema/recording -`assertAgentDeviceEvidence` is intentionally soft when a runner does not expose skill-detection telemetry. When telemetry exists, the suite asserts that `agent-device` was loaded; when it is absent, the cases still judge command-planning output instead of failing on missing runner metadata. +`assertAgentDeviceEvidence` is intentionally soft when a runner does not expose skill-detection telemetry. When telemetry exists, the suite asserts that at least one bundled skill (`agent-device`, `react-devtools`, or `dogfood`) was loaded; when it is absent, the cases still judge command-planning output instead of failing on missing runner metadata. -The `codex-main` baseline is a benchmark signal, not a required all-green gate. Its failures should map to command-planning regressions called out by individual case IDs; do not treat the historical pass/fail count as a fixed threshold. +The `codex-mini` baseline is a benchmark signal, not a required all-green gate. 
Its failures should map to command-planning regressions called out by individual case IDs; do not treat the historical pass/fail count as a fixed threshold. ## Suggested workflow @@ -62,10 +62,11 @@ pnpm install pnpm test:skillgym ``` -If you want to run `skillgym` directly instead of using the convenience script: +If you want to run `skillgym` directly instead of using the convenience script, build the local CLI first so agents can call `node bin/agent-device.mjs help workflow`: ```bash cd /absolute/path/to/agent-device +pnpm build pnpm exec skillgym run \ ./test/skillgym/suites/agent-device-smoke-suite.ts \ --config ./test/skillgym/skillgym.config.ts diff --git a/test/skillgym/bin/agent-device b/test/skillgym/bin/agent-device new file mode 100755 index 000000000..07f567f4b --- /dev/null +++ b/test/skillgym/bin/agent-device @@ -0,0 +1,2 @@ +#!/bin/sh +exec node "$(dirname "$0")/../../../bin/agent-device.mjs" "$@" diff --git a/test/skillgym/skillgym.config.ts b/test/skillgym/skillgym.config.ts index c9f46d341..b01784b8b 100644 --- a/test/skillgym/skillgym.config.ts +++ b/test/skillgym/skillgym.config.ts @@ -1,4 +1,11 @@ import type { SkillGymConfig } from 'skillgym'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const localBinDir = fileURLToPath(new URL('./bin', import.meta.url)); +const runnerEnv = { + PATH: [localBinDir, process.env.PATH].filter(Boolean).join(path.delimiter), +}; const config: SkillGymConfig = { run: { @@ -6,22 +13,24 @@ const config: SkillGymConfig = { cwd: '../..', outputDir: './.skillgym-results', reporter: 'standard', - schedule: 'parallel', + schedule: 'isolated-by-runner', }, defaults: { - timeoutMs: 120_000, + timeoutMs: 600_000, }, runners: { - 'codex-main': { + 'codex-mini': { agent: { type: 'codex', model: 'gpt-5.4-mini', + env: runnerEnv, }, }, 'claude-haiku': { agent: { type: 'claude-code', model: 'haiku', + env: runnerEnv, }, }, }, diff --git a/test/skillgym/suites/agent-device-smoke-suite.ts 
b/test/skillgym/suites/agent-device-smoke-suite.ts index 7173d6e19..a67b044ba 100644 --- a/test/skillgym/suites/agent-device-smoke-suite.ts +++ b/test/skillgym/suites/agent-device-smoke-suite.ts @@ -12,9 +12,10 @@ You are benchmarking agent-device command planning for a known fixture app. Do not read project source files or project docs. Do not inspect examples/test-app, src/, README.md, or website/docs. -Use only the app contract provided in this prompt and your existing agent-device knowledge. -If you need command syntax, rely on known agent-device usage patterns instead of reading repository code. -Output only the requested commands, one per line, with no explanation. +Do not browse the web. +Use only this prompt plus local CLI help as private reference. +For local CLI help in this repo, use node bin/agent-device.mjs help or --help; final commands still use agent-device. +Final output: only agent-device commands, one per line. Any prose or Markdown fails. `.trim(); function buildPrompt(options: { contract: string[]; task: string }) { @@ -23,12 +24,21 @@ function buildPrompt(options: { contract: string[]; task: string }) { } function assertAgentDeviceEvidence(report: SessionReport) { - const hasDetectedSkills = (report.detectedSkills?.length ?? 0) > 0; + const detectedSkills = report.detectedSkills ?? []; + const hasDetectedSkills = detectedSkills.length > 0; + const hasBundledDeviceSkill = detectedSkills.some((skill) => + ['agent-device', 'react-devtools', 'dogfood'].includes(skill.skill), + ); // Some SkillGym runners do not expose skill telemetry. Keep this as a conditional routing // assertion instead of failing otherwise valid command-planning runs on missing metadata. if (hasDetectedSkills) { - assert.skills.has(report, 'agent-device'); + assert.ok( + hasBundledDeviceSkill, + `Expected detectedSkills to include an agent-device bundled skill. 
Observed detectedSkills: ${detectedSkills + .map((skill) => `${skill.skill} (${skill.confidence})`) + .join(', ')}`, + ); } } @@ -40,34 +50,58 @@ function assertNoProjectSourceReads(report: SessionReport) { function commandPattern(command: string) { // The suite asks agents for one command per line, so command-name assertions stay line anchored. - return new RegExp(`(?:^|\\n)(?:agent-device\\s+)?${command}(?:\\s|$)`, 'i'); + return new RegExp( + `(?:^|\\n)(?:agent-device(?:\\s+--[^\\s]+(?:\\s+(?!-)[^\\s]+)?)*\\s+)?${command}(?:\\s|$)`, + 'i', + ); } function commandAlternativesPattern(commands: string[]) { const alternatives = commands.join('|'); - return new RegExp(`(?:^|\\n)(?:agent-device\\s+)?(?:${alternatives})(?:\\s|$)`, 'i'); + return new RegExp( + `(?:^|\\n)(?:agent-device(?:\\s+--[^\\s]+(?:\\s+(?!-)[^\\s]+)?)*\\s+)?(?:${alternatives})(?:\\s|$)`, + 'i', + ); } function assertOutputs(report: SessionReport, matchers: Array) { + const output = normalizedFinalOutput(report); for (const matcher of matchers) { - assert.output.includes(report, matcher); + if (typeof matcher === 'string') { + assert.ok( + output.includes(matcher), + `Expected final output to include ${JSON.stringify(matcher)}. Observed final output: ${report.finalOutput}`, + ); + continue; + } + + assert.match(output, matcher); } } function assertNoOutputs(report: SessionReport, matchers: Array) { + const output = normalizedFinalOutput(report); for (const matcher of matchers) { if (typeof matcher === 'string') { assert.ok( - !report.finalOutput.includes(matcher), + !output.includes(matcher), `Expected final output not to include ${JSON.stringify(matcher)}. 
Observed final output: ${report.finalOutput}`, ); continue; } - assert.doesNotMatch(report.finalOutput, matcher); + assert.doesNotMatch(output, matcher); } } +function normalizedFinalOutput(report: SessionReport): string { + return report.finalOutput + .replace(/```[a-z]*\n?/gi, '') + .replace(/```/g, '') + .replace(/`([^`\n]+)`/g, '$1') + .trim(); +} + function assertExpectedOutput(report: SessionReport, matchers: Array = []) { if (matchers.length === 0) { assert.output.notEmpty(report); @@ -80,6 +114,7 @@ function assertExpectedOutput(report: SessionReport, matchers: Array