diff --git a/AGENTS.md b/AGENTS.md index 56320dd6b..231b9295c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -34,13 +34,15 @@ Make an efficient learning agent that can do anything. ## Docs -- [`docs/architecture.md`](docs/architecture.md) — Package dependency graph, per-package details, architectural patterns -- [`docs/request-flow.md`](docs/request-flow.md) — Full request lifecycle from CLI through server and back -- [`docs/error-schema.md`](docs/error-schema.md) — Server error response formats and client-side handling -- [`docs/development.md`](docs/development.md) — Dev setup, worktrees, logs, package management, DB migrations -- [`docs/testing.md`](docs/testing.md) — DI over mocking, tmux CLI testing -- [`docs/environment-variables.md`](docs/environment-variables.md) — Env var rules, DI helpers, loading order -- [`docs/agents-and-tools.md`](docs/agents-and-tools.md) — Agent system, shell shims, tool definitions -- [`docs/patterns/handle-steps-generators.md`](docs/patterns/handle-steps-generators.md) — handleSteps generator patterns and spawn_agents tool calls -- [docs/evalbuff/interpreting-task-prompts.md](docs/evalbuff/interpreting-task-prompts.md) -- [docs/patterns/discover-before-implement.md](docs/patterns/discover-before-implement.md) +IMPORTANT: Prefer retrieval-led reasoning over pre-training-led reasoning. Always read the relevant docs below before implementing changes. 
// --- Doc read stats ---

/**
 * Extract doc file reads from an agent trace (JSONL of PrintModeEvents).
 *
 * Every non-blank line is parsed as JSON. Only events with
 * `type === 'tool_call'` and `toolName === 'Read'` whose `input.file_path`
 * is a string are considered. Lines that are not valid JSON, or events that
 * do not have this shape, are skipped.
 *
 * A path is counted when it ends in `docs/<anything>`, `AGENTS.md` or
 * `CLAUDE.md`, either at the start of the string or right after a `/`.
 * The matched suffix (without the leading `/`) is used as the key.
 *
 * NOTE: the normalization is a heuristic because the repo root is not known
 * here. The regex takes the leftmost `docs/` segment, so a path such as
 * `/home/me/docs/repo/docs/testing.md` is keyed as
 * `docs/repo/docs/testing.md`, and an `AGENTS.md` in any subdirectory is
 * keyed as plain `AGENTS.md`.
 *
 * @param agentTrace Newline-separated JSON events.
 * @returns Map from normalized doc path to the number of matching reads.
 */
function extractDocReads(agentTrace: string): Record<string, number> {
  // Built once, outside the per-line loop.
  const docPathPattern = /(?:^|\/)(?:docs\/.*|AGENTS\.md|CLAUDE\.md)$/
  const counts: Record<string, number> = {}

  for (const line of agentTrace.split('\n')) {
    if (!line.trim()) continue

    let event: unknown
    try {
      event = JSON.parse(line)
    } catch {
      // Not JSON (e.g. plain log output mixed into the trace).
      continue
    }
    // JSON.parse can legally yield null, numbers, strings, ... — ignore those.
    if (typeof event !== 'object' || event === null) continue

    const { type, toolName, input } = event as {
      type?: unknown
      toolName?: unknown
      input?: unknown
    }
    if (type !== 'tool_call' || toolName !== 'Read') continue
    if (typeof input !== 'object' || input === null) continue

    const filePath = (input as { file_path?: unknown }).file_path
    if (typeof filePath !== 'string') continue

    // Normalize to a repo-relative path (see NOTE in the doc comment).
    const match = docPathPattern.exec(filePath)
    if (!match) continue
    const relPath = match[0].startsWith('/') ? match[0].slice(1) : match[0]
    counts[relPath] = (counts[relPath] ?? 0) + 1
  }

  return counts
}

/**
 * Merge multiple doc-read count maps into one (summing counts).
 *
 * The inputs are not mutated. Counts are accumulated in a `Map` so that keys
 * which collide with `Object.prototype` members (e.g. `constructor`) are
 * summed as ordinary keys instead of picking up the inherited value.
 *
 * @param maps Count maps to combine; may be empty.
 * @returns A new object whose value for each key is the sum over all inputs,
 *          with keys in first-seen order.
 */
function mergeDocReads(maps: readonly Record<string, number>[]): Record<string, number> {
  const totals = new Map<string, number>()
  for (const m of maps) {
    for (const [docPath, count] of Object.entries(m)) {
      totals.set(docPath, (totals.get(docPath) ?? 0) + count)
    }
  }
  return Object.fromEntries(totals)
}
runCarveEval(options: CarveEvalOptions): Promise { console.log(` Baseline: ${r.baselineScore.toFixed(1)}/10`) console.log(` Final: ${r.finalScore.toFixed(1)}/10`) console.log(` Docs kept: ${r.docsKept.length}, rejected: ${r.docsRejected.length}`) + const readEntries = Object.entries(r.docsRead).sort((a, b) => b[1] - a[1]) + if (readEntries.length > 0) { + console.log(` Docs read: ${readEntries.map(([p, n]) => `${p} (${n}x)`).join(', ')}`) + } else { + console.log(` Docs read: none`) + } console.log(` Cost: $${r.totalCost.toFixed(2)}`) totalCostAll += r.totalCost } @@ -538,6 +591,18 @@ async function runCarveEval(options: CarveEvalOptions): Promise { console.log(` Average final: ${avgFinal.toFixed(1)}/10`) console.log(` Total cost: $${totalCostAll.toFixed(2)}`) + // Aggregate doc read stats across all features + const allDocReads = mergeDocReads(results.map((r) => r.docsRead)) + const allReadEntries = Object.entries(allDocReads).sort((a, b) => b[1] - a[1]) + if (allReadEntries.length > 0) { + console.log(`\n Doc read stats (all features):`) + for (const [docPath, count] of allReadEntries) { + console.log(` ${docPath}: ${count} reads`) + } + } else { + console.log(`\n No docs were read by any agent.`) + } + // Save results const outputPath = path.join( repoPath,