diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml index 3aec343..9009f07 100644 --- a/clients/python/pyproject.toml +++ b/clients/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "agent-eval-rpc" -version = "0.95.1" +version = "0.96.0" description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client." readme = "README.md" requires-python = ">=3.10" diff --git a/clients/python/src/agent_eval_rpc/__init__.py b/clients/python/src/agent_eval_rpc/__init__.py index 9a34ebc..a451167 100644 --- a/clients/python/src/agent_eval_rpc/__init__.py +++ b/clients/python/src/agent_eval_rpc/__init__.py @@ -58,7 +58,7 @@ try: __version__ = version("agent-eval-rpc") except PackageNotFoundError: - __version__ = "0.95.1" + __version__ = "0.96.0" __all__ = [ "Client", diff --git a/package.json b/package.json index af77f26..85fbbc1 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@tangle-network/agent-eval", - "version": "0.95.1", + "version": "0.96.0", "description": "Evaluate and improve AI agents from runs, traces, judges, and feedback. Compare candidates, cluster failures, measure lift, and gate releases.", "homepage": "https://github.com/tangle-network/agent-eval#readme", "repository": { diff --git a/src/contract/eval-reporting-suite.ts b/src/contract/eval-reporting-suite.ts new file mode 100644 index 0000000..c4da2b5 --- /dev/null +++ b/src/contract/eval-reporting-suite.ts @@ -0,0 +1,161 @@ +/** + * # `evalReportingSuite` — one call from runs (or a run dir) to `analysis.json`. + * + * A thin wrapper over the analysis primitive (`analyzeRuns`) and the on-disk + * intake adapter (`fromRunRecordDir`). 
/** Either records already in memory, or a path to a `.json` / `.jsonl` file
 * or to a directory of such files. */
export type EvalReportingSuiteInput = RunRecord[] | string

/** Options for `evalReportingSuite`. Every field is optional. */
export interface EvalReportingSuiteOptions {
  /** Forwarded verbatim to `analyzeRuns` (everything except `runs`, which the
   * suite supplies from the resolved input). Use this for split selection,
   * baseline/candidate ids, canaries, prior-period runs, the analyst registry,
   * etc. */
  analyze?: Omit<AnalyzeRunsOptions, 'runs'>
  /** Loader options used only when the input is a path. */
  load?: FromRunRecordDirOptions
  /**
   * Write the suite result as a single `analysis.json`.
   * - `true` — write to `<dir>/analysis.json` when the input is a directory,
   *   or alongside the input file; throws if the input is in-memory records
   *   (no directory to anchor to — pass an explicit path instead).
   * - a string — write to exactly this path when it ends in `.json` or
   *   `.jsonl`; any other string is treated as a directory and gets
   *   `analysis.json` appended.
   * - omitted / false — do not write.
   */
  write?: boolean | string
}

/** The suite artifact — the `analyzeRuns` report plus provenance. This is the
 * shape serialized to `analysis.json`. */
export interface EvalReportingSuiteResult {
  /** The analysis itself — distributions, paired stats/lift, failure rollup,
   * recommendations. Produced by `analyzeRuns`. */
  report: InsightReport
  /** How the suite was run, so a reader can verify provenance. */
  provenance: {
    /** ISO timestamp the suite ran. */
    generatedAt: string
    /** Number of records analyzed (mirrors `report.n`). */
    runCount: number
    /** The source path when the input was a directory/file; null for
     * in-memory records. */
    sourcePath: string | null
    /** Files read when loading from disk; empty for in-memory input. */
    files: string[]
    /** Records dropped at the validation boundary. Always empty unless
     * `load.onInvalid` was set to `'collect'`. */
    rejected: FromRunRecordDirResult['rejected']
  }
  /** The path `analysis.json` was written to, or null when `write` was unset. */
  writtenTo: string | null
}

/** File name of the artifact the suite writes. */
const ANALYSIS_ARTIFACT = 'analysis.json'
/**
 * Resolve runs (or a run dir/file), run `analyzeRuns`, and optionally persist a
 * single `analysis.json`. The only analysis logic lives in `analyzeRuns`; this
 * function is composition + I/O.
 *
 * @param input   Records in memory, or a path handed to `fromRunRecordDir`.
 * @param options `analyze` is spread into the `analyzeRuns` call, `load` is
 *                passed to the loader, `write` selects the output location.
 * @returns The report, its provenance envelope, and the path written (or null).
 * @throws Error when no records are available, or when `write: true` is
 *         combined with in-memory records. Loader, analysis and filesystem
 *         errors propagate unchanged.
 */
export async function evalReportingSuite(
  input: EvalReportingSuiteInput,
  options: EvalReportingSuiteOptions = {},
): Promise<EvalReportingSuiteResult> {
  const sourcePath = typeof input === 'string' ? input : null

  // In-memory input has no files and nothing rejected by definition.
  const loaded: FromRunRecordDirResult =
    typeof input === 'string'
      ? await fromRunRecordDir(input, options.load)
      : { runs: input, files: [], rejected: [] }

  if (loaded.runs.length === 0) {
    throw new Error(
      sourcePath !== null
        ? `evalReportingSuite: no RunRecords found at '${sourcePath}'`
        : 'evalReportingSuite: no RunRecords to analyze',
    )
  }

  // Resolve the destination BEFORE analyzing: an unusable `write` option is a
  // caller error and should not cost a full analysis pass to discover.
  const target = resolveWriteTarget(options.write, sourcePath)

  const report = await analyzeRuns({ ...options.analyze, runs: loaded.runs })

  const result: EvalReportingSuiteResult = {
    report,
    provenance: {
      generatedAt: new Date().toISOString(),
      runCount: loaded.runs.length,
      sourcePath,
      files: loaded.files,
      rejected: loaded.rejected,
    },
    // Set before serializing so the artifact on disk carries the same
    // `writtenTo` as the value returned to the caller.
    writtenTo: target,
  }

  if (target !== null) {
    await mkdir(dirname(target), { recursive: true })
    await writeFile(target, `${JSON.stringify(result, null, 2)}\n`, 'utf8')
  }

  return result
}

/**
 * Resolve where (if anywhere) to write `analysis.json`.
 *
 * - falsy `write` → null (writing disabled).
 * - string `write` → used verbatim when it ends in `.json` / `.jsonl`;
 *   otherwise treated as a directory and `analysis.json` is appended.
 * - `write === true` → next to the source: the source's parent directory when
 *   the source path ends in `.json` / `.jsonl`, else the source path itself.
 *   NOTE(review): this is an extension heuristic, not a `stat` — a source file
 *   with any other extension is treated as a directory here.
 *
 * @throws Error on `write: true` with in-memory input — there is no directory
 *         to anchor the artifact to, and silently inventing `cwd` would
 *         scatter files.
 */
function resolveWriteTarget(
  write: EvalReportingSuiteOptions['write'],
  sourcePath: string | null,
): string | null {
  if (!write) return null

  const hasRecordExtension = (p: string): boolean => p.endsWith('.json') || p.endsWith('.jsonl')

  if (typeof write === 'string') {
    // A trailing '/' can never coincide with a record extension, so the
    // extension test alone separates "file path" from "directory path".
    return hasRecordExtension(write) ? write : join(write, ANALYSIS_ARTIFACT)
  }

  // write === true
  if (sourcePath === null) {
    throw new Error(
      'evalReportingSuite: write:true needs a source path to anchor analysis.json — pass an explicit output path when analyzing in-memory records',
    )
  }
  const anchorDir = hasRecordExtension(sourcePath) ? dirname(sourcePath) : sourcePath
  return join(anchorDir, ANALYSIS_ARTIFACT)
}
/** A record that failed boundary validation, with enough context to fix it. */
export interface RunRecordRejection {
  /** Absolute or caller-relative path to the file the record came from. */
  file: string
  /** Zero-based position within the file (array index or JSONL line number). */
  index: number
  /** The validator's message. */
  reason: string
}

export interface FromRunRecordDirOptions {
  /**
   * How to treat a record that fails `parseRunRecordSafe`:
   * - `'throw'` (default) — fail loud on the first invalid record.
   * - `'collect'` — drop it, keep the rest, and return it under `rejected`.
   */
  onInvalid?: 'throw' | 'collect'
  /**
   * When the input is a directory, only files matching this predicate are
   * read. Default: any file ending in `.json` or `.jsonl`. The `analysis.json`
   * artifact `evalReportingSuite` writes is always skipped — including when a
   * custom predicate is supplied — so a re-run never ingests its own output.
   */
  include?: (fileName: string) => boolean
  /**
   * Recurse into subdirectories when the input is a directory. Default false —
   * a flat run directory is the common case and recursion can silently pull in
   * unrelated corpora.
   */
  recursive?: boolean
}

export interface FromRunRecordDirResult {
  /** Records that passed boundary validation, in file-then-index order. */
  runs: RunRecord[]
  /** Records that failed validation. Empty unless `onInvalid: 'collect'`. */
  rejected: RunRecordRejection[]
  /** The files that were read, in the order they were processed. */
  files: string[]
}

/** File name of the artifact `evalReportingSuite` writes; never ingested from
 * a directory scan. */
const ANALYSIS_ARTIFACT = 'analysis.json'

/** Default directory filter: `.json` / `.jsonl` files except the artifact. */
function defaultInclude(fileName: string): boolean {
  if (fileName === ANALYSIS_ARTIFACT) return false
  return fileName.endsWith('.json') || fileName.endsWith('.jsonl')
}

/**
 * Resolve a file or directory path into validated `RunRecord[]`.
 *
 * File content decides the format: text starting with `[` must parse to a
 * top-level JSON array; anything else is read as one JSON record per
 * non-empty line. Directories are read shallowly by default (set `recursive`
 * to descend). During a directory scan the `analysis.json` output artifact is
 * always excluded, whether or not a custom `include` is given; a path that
 * points directly at a file is read as-is.
 *
 * @throws Error on the first invalid record unless `onInvalid` is `'collect'`;
 *         on malformed JSON in any file; plus any `stat` / read error.
 */
export async function fromRunRecordDir(
  path: string,
  options: FromRunRecordDirOptions = {},
): Promise<FromRunRecordDirResult> {
  const onInvalid = options.onInvalid ?? 'throw'
  const callerInclude = options.include ?? defaultInclude
  // Guard the artifact here rather than only in `defaultInclude`, otherwise a
  // caller-supplied predicate silently re-admits the suite's own output.
  const include = (fileName: string): boolean =>
    fileName !== ANALYSIS_ARTIFACT && callerInclude(fileName)

  const stats = await stat(path)
  const filePaths = stats.isDirectory()
    ? await collectFiles(path, include, options.recursive ?? false)
    : [path]

  const runs: RunRecord[] = []
  const rejected: RunRecordRejection[] = []

  for (const file of filePaths) {
    for (const { index, value } of await parseRecordFile(file)) {
      const parsed = parseRunRecordSafe(value)
      if (parsed.ok) {
        runs.push(parsed.value)
        continue
      }
      const reason = parsed.error.message
      if (onInvalid === 'throw') {
        throw new Error(
          `fromRunRecordDir: invalid RunRecord in '${file}' at index ${index}: ${reason}`,
        )
      }
      rejected.push({ file, index, reason })
    }
  }

  return { runs, rejected, files: filePaths }
}

/**
 * Read a single `.json` / `.jsonl` file into `{ index, value }` pairs.
 *
 * `index` is the array position for array files and the zero-based line number
 * for line-delimited files (blank lines are skipped but still counted).
 * Malformed JSON throws with the file name — and, for line-delimited input, the
 * one-based line number — rather than being skipped: silent line-dropping is
 * how corpora quietly shrink.
 */
async function parseRecordFile(file: string): Promise<Array<{ index: number; value: unknown }>> {
  const text = (await readFile(file, 'utf8')).trim()
  if (text.length === 0) return []

  const describe = (err: unknown): string => (err instanceof Error ? err.message : String(err))

  if (text.startsWith('[')) {
    let parsed: unknown
    try {
      parsed = JSON.parse(text)
    } catch (err) {
      // Same file-qualified error style as the line-delimited branch below.
      throw new Error(`fromRunRecordDir: file '${file}' is not valid JSON: ${describe(err)}`)
    }
    if (!Array.isArray(parsed)) {
      throw new Error(`fromRunRecordDir: file '${file}' did not parse to an array`)
    }
    return parsed.map((value: unknown, index: number) => ({ index, value }))
  }

  const out: Array<{ index: number; value: unknown }> = []
  for (const [lineIndex, rawLine] of text.split('\n').entries()) {
    const line = rawLine.trim() // also strips the '\r' of CRLF files
    if (line.length === 0) continue
    try {
      out.push({ index: lineIndex, value: JSON.parse(line) as unknown })
    } catch (err) {
      throw new Error(
        `fromRunRecordDir: file '${file}' line ${lineIndex + 1} is not valid JSON: ${describe(err)}`,
      )
    }
  }
  return out
}
/**
 * Sorted file list under a directory, filtered by `include`. Sorted so the
 * resulting `RunRecord` order — and any downstream fingerprint — is stable
 * across filesystems.
 *
 * Files directly inside `dir` come first (sorted), followed by the contents of
 * each subdirectory in sorted subdirectory order. Subdirectories are visited
 * only when `recursive` is true. `include` receives the bare entry name, not
 * the joined path.
 */
async function collectFiles(
  dir: string,
  include: (fileName: string) => boolean,
  recursive: boolean,
): Promise<string[]> {
  const entries = await readdir(dir, { withFileTypes: true })
  const files: string[] = []
  const subdirs: string[] = []

  for (const entry of entries) {
    const fullPath = join(dir, entry.name)
    if (entry.isDirectory()) {
      if (recursive) subdirs.push(fullPath)
    } else if (include(entry.name)) {
      files.push(fullPath)
    }
  }

  files.sort()
  subdirs.sort()

  for (const sub of subdirs) {
    // Append one by one: spreading a whole subtree into `push(...)` passes every
    // path as a call argument and can exceed the engine's argument limit.
    for (const nested of await collectFiles(sub, include, recursive)) {
      files.push(nested)
    }
  }
  return files
}
/** Build a minimal holdout `RunRecord` whose only varying fields are the run
 * id, the candidate id and the holdout score. */
function makeRun(opts: { id: string; candidate: string; composite: number }): RunRecord {
  const { id, candidate, composite } = opts
  return {
    runId: id,
    experimentId: 'exp',
    candidateId: candidate,
    seed: 0,
    model: 'm@v',
    promptHash: 'sha256:p',
    configHash: 'sha256:c',
    commitSha: 'abc',
    wallMs: 100,
    costUsd: 0.01,
    tokenUsage: { input: 100, output: 50 },
    outcome: { holdoutScore: composite, raw: {} },
    splitTag: 'holdout',
  } satisfies RunRecord
}

/** Two baseline runs and two candidate runs; the candidate scores higher. */
const runs: RunRecord[] = (
  [
    ['r1', 'base', 0.4],
    ['r2', 'base', 0.5],
    ['r3', 'cand', 0.7],
    ['r4', 'cand', 0.8],
  ] as const
).map(([id, candidate, composite]) => makeRun({ id, candidate, composite }))

/** Serialize values as newline-terminated JSONL. */
const toJsonl = (records: readonly unknown[]): string =>
  `${records.map((r) => JSON.stringify(r)).join('\n')}\n`

/** Read and parse a JSON file written by the suite. */
const readJson = async (path: string) => JSON.parse(await readFile(path, 'utf8'))

// Fresh temp directory per test, removed afterwards.
let dir: string
beforeEach(async () => {
  dir = await mkdtemp(join(tmpdir(), 'eval-suite-'))
})
afterEach(async () => {
  await rm(dir, { recursive: true, force: true })
})

describe('evalReportingSuite', () => {
  it('analyzes in-memory records and matches analyzeRuns exactly', async () => {
    const suite = await evalReportingSuite(runs)
    const direct = await analyzeRuns({ runs })
    expect(suite.report).toEqual(direct)

    const { provenance } = suite
    expect(provenance.runCount).toBe(4)
    expect(provenance.sourcePath).toBeNull()
    expect(provenance.files).toEqual([])
    expect(suite.writtenTo).toBeNull()
  })

  it('forwards analyze options through to analyzeRuns', async () => {
    const suite = await evalReportingSuite(runs, {
      analyze: { baselineCandidateId: 'base', candidateCandidateId: 'cand' },
    })
    // lift only materializes when a baseline/candidate pair is given
    const { lift } = suite.report
    expect(lift).toBeDefined()
    expect(lift?.candidateMean).toBeGreaterThan(lift?.baselineMean ?? 1)
  })

  it('loads a directory of .json and .jsonl files and writes analysis.json', async () => {
    await writeFile(join(dir, 'a.json'), JSON.stringify(runs.slice(0, 2)), 'utf8')
    await writeFile(join(dir, 'b.jsonl'), toJsonl(runs.slice(2)), 'utf8')

    const artifact = join(dir, 'analysis.json')
    const suite = await evalReportingSuite(dir, { write: true })
    expect(suite.report.n).toBe(4)
    expect(suite.provenance.sourcePath).toBe(dir)
    expect(suite.provenance.files).toHaveLength(2)
    expect(suite.writtenTo).toBe(artifact)

    const onDisk = await readJson(artifact)
    expect(onDisk.report.n).toBe(4)
    expect(onDisk.provenance.runCount).toBe(4)
  })

  it('re-running on a directory ignores its own analysis.json output', async () => {
    await writeFile(join(dir, 'a.json'), JSON.stringify(runs), 'utf8')
    const first = await evalReportingSuite(dir, { write: true })
    expect(first.report.n).toBe(4)
    // Second pass must not ingest the analysis.json the first pass wrote.
    const second = await evalReportingSuite(dir, { write: true })
    expect(second.report.n).toBe(4)
    expect(second.provenance.files).toEqual([join(dir, 'a.json')])
  })

  it('loads a single .jsonl file and writes alongside it', async () => {
    const file = join(dir, 'runs.jsonl')
    await writeFile(file, toJsonl(runs), 'utf8')
    const suite = await evalReportingSuite(file, { write: true })
    expect(suite.report.n).toBe(4)
    expect(suite.writtenTo).toBe(join(dir, 'analysis.json'))
  })

  it('writes to an explicit file path when write is a string', async () => {
    const out = join(dir, 'nested', 'custom-report.json')
    const suite = await evalReportingSuite(runs, { write: out })
    expect(suite.writtenTo).toBe(out)
    const onDisk = await readJson(out)
    expect(onDisk.report.n).toBe(4)
  })

  it('treats a string write target without a .json extension as a directory', async () => {
    const outDir = join(dir, 'reports')
    const expected = join(outDir, 'analysis.json')
    const suite = await evalReportingSuite(runs, { write: outDir })
    expect(suite.writtenTo).toBe(expected)
    const onDisk = await readJson(expected)
    expect(onDisk.report.n).toBe(4)
  })

  it('refuses write:true for in-memory records (no anchor directory)', async () => {
    await expect(evalReportingSuite(runs, { write: true })).rejects.toThrow(/anchor/)
  })

  it('throws on an empty corpus', async () => {
    await expect(evalReportingSuite([])).rejects.toThrow(/no RunRecords/)
    await expect(evalReportingSuite(dir)).rejects.toThrow(/no RunRecords found/)
  })

  it('fails loud on an invalid record by default', async () => {
    await writeFile(join(dir, 'bad.jsonl'), toJsonl([runs[0], { runId: 'x' }]), 'utf8')
    await expect(evalReportingSuite(dir)).rejects.toThrow(/invalid RunRecord/)
  })

  it('collects invalid records when load.onInvalid is "collect"', async () => {
    await writeFile(join(dir, 'mixed.jsonl'), toJsonl([...runs, { runId: 'x' }]), 'utf8')
    const suite = await evalReportingSuite(dir, { load: { onInvalid: 'collect' } })
    expect(suite.report.n).toBe(4)

    const { rejected } = suite.provenance
    expect(rejected).toHaveLength(1)
    expect(rejected[0]?.reason).toMatch(/mandatory|missing/i)
  })
})