From 2f42430a7d35393598476b2462223008f3c45425 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sun, 21 Jun 2026 10:36:32 -0600 Subject: [PATCH] fix(trace-analyst): count read/inspect tools as self-verification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VERIFY_RE only matched verb-named tools (verif/eval/inspect/check/...), but real harnesses verify with Read/Grep/Glob (Claude Code), read_file/ls/cat (codex), and git status/diff — none of which match. So no-self-verification fired on ~any real session that reads or greps state, a false positive. Add the read/search/list/diff/status/test/lint families; leave bare shell (Bash/exec_command) unmatched since the name can't distinguish a test run from a mutation. --- src/trace-analyst/behavioral-metrics.test.ts | 10 ++++++++++ src/trace-analyst/behavioral-metrics.ts | 9 +++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/trace-analyst/behavioral-metrics.test.ts b/src/trace-analyst/behavioral-metrics.test.ts index e571819..a828b30 100644 --- a/src/trace-analyst/behavioral-metrics.test.ts +++ b/src/trace-analyst/behavioral-metrics.test.ts @@ -117,6 +117,16 @@ describe('computeTraceMetrics — deterministic behavioral signals (no LLM)', () expect(m.signals.map((s) => s.code)).not.toContain('no-self-verification') }) + it('counts real read/inspect tool names (Read/Grep) as self-verification', () => { + for (const tool of ['Read', 'Grep', 'read_file', 'git.status']) { + const spans = fixture530() + spans.push(toolSpan(8, tool)) + const m = computeTraceMetrics(spans) + expect(m.hasSelfVerification).toBe(true) + expect(m.signals.map((s) => s.code)).not.toContain('no-self-verification') + } + }) + it('FIRES monotonic-input-growth on a 0→huge blowup (first call reported 0 input tokens)', () => { // First LLM call reports 0 input tokens, then context explodes. Ratio is // unbounded — the old `first > 0 ? last/first : 0` forced growth to 0 and diff --git a/src/trace-analyst/behavioral-metrics.ts b/src/trace-analyst/behavioral-metrics.ts index a131da2..6c69556 100644 --- a/src/trace-analyst/behavioral-metrics.ts +++ b/src/trace-analyst/behavioral-metrics.ts @@ -50,8 +50,13 @@ export interface BehavioralMetrics { const INPUT_GROWTH_FACTOR = 3 /** Tool-usage signals need at least this many calls to be meaningful. */ const MIN_TOOL_CALLS = 3 -/** Tool names matching this are self-verification, not state mutation. */ -const VERIFY_RE = /verif|eval|inspect|check|assert|validat|review|confirm/i +/** Tool names that read or check state count as self-verification, not mutation. + * Covers the inspect verbs plus the read/search tools real harnesses use to + * verify (Claude Code Read/Grep/Glob, codex read_file/ls/cat, git status/diff, + * test/lint). A pure shell tool (Bash/exec_command) is intentionally NOT matched + * — its name can't tell a `pytest` from an `rm`. */ +const VERIFY_RE = + /verif|eval|inspect|check|assert|validat|review|confirm|read|grep|glob|search|view|\blist\b|\bls\b|\bcat\b|\bfind\b|diff|status|\btest|lint|typecheck/i function num(v: unknown): number | null { return typeof v === 'number' && Number.isFinite(v) ? v : null