From a0ffab7e930ca6138bcd9662da85f45bf68c57a2 Mon Sep 17 00:00:00 2001 From: Lee Campbell Date: Sun, 22 Mar 2026 11:05:42 +0800 Subject: [PATCH 1/2] feat: add benchmark-driven workflow for performance issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Establishes a benchmark-first methodology in testing standards and updates autonomous agent prompts to enforce Phase 1 (baseline) → Phase 2 (implement) → Phase 3 (validate) ordering for performance issues, ensuring baseline measurements exist before any code changes. Co-Authored-By: Claude Opus 4.6 --- autonomous/agent-loop.sh | 14 ++++- autonomous/prompts/create-tasks.md | 34 ++++++++++- autonomous/prompts/execute-tasks.md | 23 +++++++- autonomous/prompts/pick-issue.md | 10 ++++ autonomous/prompts/review-brief.md | 5 ++ spec/README.md | 1 + spec/tech-standards/testing-standards.md | 73 +++++++++++++++++++++++- 7 files changed, 153 insertions(+), 7 deletions(-) diff --git a/autonomous/agent-loop.sh b/autonomous/agent-loop.sh index e9fab30..88036dd 100644 --- a/autonomous/agent-loop.sh +++ b/autonomous/agent-loop.sh @@ -133,6 +133,16 @@ case "$STATE" in $(cat /tmp/plan-backup/done/task.md) +" + fi + if [ -f /tmp/plan-backup/benchmarks/comparison.md ]; then + PR_BODY="${PR_BODY} + +
+Benchmark comparison + +$(cat /tmp/plan-backup/benchmarks/comparison.md) +
" fi PR_BODY="${PR_BODY} @@ -249,7 +259,7 @@ EOF # If plan state doesn't exist on the branch, re-initialise if [ ! -d ./plan ]; then echo "No plan state found on branch, starting fresh" - mkdir -p ./plan/planning ./plan/ready ./plan/done + mkdir -p ./plan/planning ./plan/ready ./plan/done ./plan/benchmarks ISSUE_BODY=$(gh issue view "$ISSUE_NUM" --repo "$UPSTREAM_REPO" \ --json body,title --jq '"# Issue #'"$ISSUE_NUM"': " + .title + "\n\n" + .body') @@ -264,7 +274,7 @@ EOF echo "Starting fresh: agent/${ISSUE_NUM}-${BRANCH_SLUG}" git fetch upstream git checkout -b "agent/${ISSUE_NUM}-${BRANCH_SLUG}" "upstream/$UPSTREAM_BASE_BRANCH" - mkdir -p ./plan/planning ./plan/ready ./plan/done + mkdir -p ./plan/planning ./plan/ready ./plan/done ./plan/benchmarks ISSUE_BODY=$(gh issue view "$ISSUE_NUM" --repo "$UPSTREAM_REPO" \ --json body,title --jq '"# Issue #'"$ISSUE_NUM"': " + .title + "\n\n" + .body') diff --git a/autonomous/prompts/create-tasks.md b/autonomous/prompts/create-tasks.md index 900af7d..bfd61f5 100644 --- a/autonomous/prompts/create-tasks.md +++ b/autonomous/prompts/create-tasks.md @@ -21,4 +21,36 @@ Include tasks for: Use `[ ]` for each task. Validate the task list is complete by cross-referencing every acceptance criterion in the brief — each criterion must be covered by at least one task. -Any task that attempts to alter the ./.github folder will likely fail due to permissions restrictions. These changes should accompany the PR as an attached file with clear direction on the manual intervention required to complete the work. \ No newline at end of file +Any task that attempts to alter the ./.github folder will likely fail due to permissions restrictions. These changes should accompany the PR as an attached file with clear direction on the manual intervention required to complete the work. + +If the brief category is `performance`, follow the benchmark-driven development process +in spec/tech-standards/testing-standards.md. 
The task ordering MUST be: + +Phase 1 — Benchmark scaffolding (before any implementation changes): + - [ ] Create benchmark class(es) in HdrHistogram.Benchmarking/ + - Micro-benchmarks for the specific operations being optimised + - End-to-end benchmarks that exercise the realistic user workflow + - Add [MemoryDiagnoser] to all benchmark classes + - Register new benchmarks in Program.cs BenchmarkSwitcher + - [ ] Build and verify benchmarks compile: + `dotnet build HdrHistogram.Benchmarking/ -c Release` + - [ ] Run baseline benchmarks on the UNMODIFIED code + - Run: `dotnet run -c Release --project HdrHistogram.Benchmarking/ -- --filter '*BenchmarkClass*' --exporters json` + - Save formatted results table to `plan/benchmarks/baseline.md` + - Include: Mean, StdDev, Allocated, Op/s for each benchmark method + +Phase 2 — Implementation (the actual code changes): + - [ ] (implementation tasks as normal) + - [ ] (unit tests as normal) + +Phase 3 — Benchmark validation (after implementation is complete): + - [ ] Run post-change benchmarks with identical configuration + - Save results to `plan/benchmarks/post-change.md` + - [ ] Generate comparison in `plan/benchmarks/comparison.md` containing: + - Side-by-side table: Benchmark | Baseline | Post-Change | Delta | Delta % + - Summary: which metrics improved, which regressed, which unchanged + - Verdict: does the data support the change? + +Create the directory `plan/benchmarks/` for storing results. + +If the brief category is `functional`, use the current ordering (implementation, tests, docs). \ No newline at end of file diff --git a/autonomous/prompts/execute-tasks.md b/autonomous/prompts/execute-tasks.md index 72ef5fb..8accd9c 100644 --- a/autonomous/prompts/execute-tasks.md +++ b/autonomous/prompts/execute-tasks.md @@ -24,4 +24,25 @@ After all tasks are marked `[x]`: 2. If the review identifies issues, append new `[ ]` tasks to task.md describing each fix. 3. 
If the review is clean, move brief.md and task.md to ./plan/done/ -Process as many tasks as you can in this iteration. \ No newline at end of file +Process as many tasks as you can in this iteration. + +Special handling for benchmark tasks (applies when brief category is `performance`): + +Baseline capture (Phase 1 benchmark tasks): +- After creating benchmark classes, commit ONLY the benchmark files: + `git add HdrHistogram.Benchmarking/ Program.cs && git commit -m "bench: add benchmarks for baseline capture"` +- Run benchmarks in Release configuration. Use `--filter` to target only the relevant benchmarks. +- BenchmarkDotNet outputs markdown tables to stdout and detailed results to `BenchmarkDotNet.Artifacts/`. + Copy the results table into `plan/benchmarks/baseline.md`. +- Do NOT proceed to Phase 2 implementation tasks until baseline results are captured and saved. + +Post-change validation (Phase 3 benchmark tasks): +- Run the exact same benchmark command used for the baseline. +- Save results to `plan/benchmarks/post-change.md`. +- Generate `plan/benchmarks/comparison.md` by reading both files and computing deltas. +- If any benchmark shows a regression, flag it in comparison.md and add a new task to investigate. + +Benchmark execution notes: +- Use `--job short` for faster feedback during development iterations. +- For the final Phase 3 comparison, use default BenchmarkDotNet settings (no `--job short`). +- Benchmarks that fail to compile or run must be fixed before proceeding. 
\ No newline at end of file diff --git a/autonomous/prompts/pick-issue.md b/autonomous/prompts/pick-issue.md index d7d33bd..032d637 100644 --- a/autonomous/prompts/pick-issue.md +++ b/autonomous/prompts/pick-issue.md @@ -22,3 +22,13 @@ Using the exploration results, create ./plan/planning/brief.md containing: - Acceptance criteria derived from the issue - Test strategy: which tests to add or modify - Risks or open questions +- Category: classify as either `functional` or `performance` + - `performance` if the issue mentions: allocation, memory, throughput, latency, + GC pressure, benchmark, hot path, serialisation performance, or similar + - `functional` for all other issues (bugs, features, refactors) +- Benchmark strategy (required for `performance` issues, optional for `functional`): + Follow the benchmark-driven development process in spec/tech-standards/testing-standards.md. + Identify: + - Which existing benchmarks are relevant (check HdrHistogram.Benchmarking/) + - What new micro-benchmarks and end-to-end benchmarks are needed + - Which metrics matter: throughput, allocation, GC collections diff --git a/autonomous/prompts/review-brief.md b/autonomous/prompts/review-brief.md index 10565d1..433a1e3 100644 --- a/autonomous/prompts/review-brief.md +++ b/autonomous/prompts/review-brief.md @@ -12,6 +12,11 @@ Review the brief for: - Feasibility: Do the proposed changes align with what the code actually looks like? - Test strategy: Are there specific test cases identified? - Acceptance criteria: Are they measurable and verifiable? +- Category: Is the classification correct? Performance issues MUST be marked `performance`. +- Benchmark strategy (for `performance` issues, per spec/tech-standards/testing-standards.md): + - Are both micro-benchmarks AND end-to-end benchmarks identified? + - Do the benchmarks measure what the issue actually claims to improve? + - Are the benchmarks testing the realistic hot path, not just the changed code in isolation? 
If changes are needed: create ./plan/planning/brief-review.md with specific, actionable suggestions. diff --git a/spec/README.md b/spec/README.md index 0ad3c07..9ed9af2 100644 --- a/spec/README.md +++ b/spec/README.md @@ -41,6 +41,7 @@ This file provides guidance on where to find standards and specifications for th | encoding, LEB128, DEFLATE, compression | [Histogram Encoding](./tech-standards/histogram-encoding.md) | | log format, V2, persistence | [Histogram Encoding](./tech-standards/histogram-encoding.md) | | xUnit, test, FluentAssertions | [Testing Standards](./tech-standards/testing-standards.md) | +| benchmark, performance, allocation, BenchmarkDotNet | [Testing Standards](./tech-standards/testing-standards.md) | | naming convention, XML docs, style | [Coding Standards](./tech-standards/coding-standards.md) | | build, NuGet, AppVeyor, CI/CD | [Build System](./tech-standards/build-system.md) | | milestone, issue, PR, GitHub | [GitHub CLI Reference](./tech-standards/github.md) | diff --git a/spec/tech-standards/testing-standards.md b/spec/tech-standards/testing-standards.md index 21b93f2..32e5834 100644 --- a/spec/tech-standards/testing-standards.md +++ b/spec/tech-standards/testing-standards.md @@ -171,13 +171,80 @@ Resource types: ## Performance Testing -Run tests in Release mode for accurate performance measurements: +### Benchmarking Framework + +The `HdrHistogram.Benchmarking/` project uses BenchmarkDotNet for performance measurement. +All benchmarks must run in Release mode for accurate results. + +| Component | Details | +|-----------|---------| +| Framework | BenchmarkDotNet | +| Targets | net10.0, net9.0, net8.0 | +| Entry point | `Program.cs` with `BenchmarkSwitcher` | +| Diagnostics | `[MemoryDiagnoser]` on all benchmark classes | + +### Benchmark-Driven Development + +Performance issues follow a benchmark-first methodology, analogous to test-driven development. +Benchmarks are the tests for non-functional requirements. 
+
+**Phase 1 — Benchmark scaffolding (before any implementation changes):**
+
+1. Create benchmark classes covering both levels:
+   - **Micro-benchmarks** — isolate the specific operation being optimised (e.g. `PutLong`/`GetLong`)
+   - **End-to-end benchmarks** — exercise the realistic user workflow that the micro-operation feeds into (e.g. histogram encode/decode round-trip)
+2. Register benchmarks in `Program.cs` `BenchmarkSwitcher`
+3. Run benchmarks against the **unmodified code** to establish a baseline
+4. Record baseline results (Mean, StdDev, Allocated, Op/s)
+
+**Phase 2 — Implementation:**
+
+5. Apply the optimisation
+6. Add or update unit tests as normal
+
+**Phase 3 — Validation (after implementation is complete):**
+
+7. Run the same benchmarks with identical configuration
+8. Compare against the baseline: generate a side-by-side table with deltas
+9. Document the results — even when the change shows no meaningful improvement
+
+Both levels of benchmark are required because:
+
+- Micro-benchmarks prove the isolated operation improved
+- End-to-end benchmarks prove the improvement matters in a realistic context
+- A micro-optimisation with no observable end-to-end impact may not justify the change
+
+### Benchmark Design Guidelines
+
+- Use `[MemoryDiagnoser]` on every benchmark class to surface allocation counts
+- Reset buffer positions or state in each benchmark method, not in setup (matches the BenchmarkDotNet iteration model)
+- Pre-allocate buffers in `[GlobalSetup]` so setup allocation is excluded from measurements
+- Use realistic data sizes and distributions that match production usage
+- Name benchmark methods clearly: `Encode`, `Decode`, `EncodeCompressed`, not `Test1`, `Bench_A`
+
+### Running Benchmarks
 
 ```bash
-dotnet test --configuration Release
+# Build in Release mode (required)
+dotnet build HdrHistogram.Benchmarking/ -c Release
+
+# Run specific benchmarks
+dotnet run -c Release --project HdrHistogram.Benchmarking/ -- --filter '*ClassName*'
+
+# Export 
results as JSON for comparison +dotnet run -c Release --project HdrHistogram.Benchmarking/ -- --filter '*ClassName*' --exporters json + +# Quick iteration (fewer iterations, faster feedback) +dotnet run -c Release --project HdrHistogram.Benchmarking/ -- --filter '*ClassName*' --job short ``` -Separate benchmarking project (`HdrHistogram.Benchmarking`) uses BenchmarkDotNet for micro-benchmarks. +### Existing Benchmarks + +| Class | Location | What it measures | +|-------|----------|-----------------| +| `LeadingZeroCount64BitBenchmark` | `LeadingZeroCount/` | Bit counting algorithm comparison | +| `LeadingZeroCount32BitBenchmark` | `LeadingZeroCount/` | 32-bit bit counting comparison | +| `Recording32BitBenchmark` | `Recording/` | Value recording throughput | ## Test Organization Guidelines From 0cbacab3eeeeed7ce50e5ddab480ba1371f72e45 Mon Sep 17 00:00:00 2001 From: Lee Campbell Date: Sun, 22 Mar 2026 13:23:19 +0800 Subject: [PATCH 2/2] fix: address PR #158 review feedback - Remove stale Program.cs from git add in execute-tasks.md - Clarify --job short is only for Phase 2; Phase 1 and 3 use defaults - Add trailing newlines to create-tasks.md and execute-tasks.md - Replace static benchmarks table with directory lookup directive Co-Authored-By: Claude Opus 4.6 --- autonomous/prompts/create-tasks.md | 2 +- autonomous/prompts/execute-tasks.md | 8 ++++---- spec/tech-standards/testing-standards.md | 7 ++----- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/autonomous/prompts/create-tasks.md b/autonomous/prompts/create-tasks.md index bfd61f5..c7b4906 100644 --- a/autonomous/prompts/create-tasks.md +++ b/autonomous/prompts/create-tasks.md @@ -53,4 +53,4 @@ Phase 3 — Benchmark validation (after implementation is complete): Create the directory `plan/benchmarks/` for storing results. -If the brief category is `functional`, use the current ordering (implementation, tests, docs). 
\ No newline at end of file +If the brief category is `functional`, use the current ordering (implementation, tests, docs). diff --git a/autonomous/prompts/execute-tasks.md b/autonomous/prompts/execute-tasks.md index 8accd9c..a8af6ce 100644 --- a/autonomous/prompts/execute-tasks.md +++ b/autonomous/prompts/execute-tasks.md @@ -30,7 +30,7 @@ Special handling for benchmark tasks (applies when brief category is `performanc Baseline capture (Phase 1 benchmark tasks): - After creating benchmark classes, commit ONLY the benchmark files: - `git add HdrHistogram.Benchmarking/ Program.cs && git commit -m "bench: add benchmarks for baseline capture"` + `git add HdrHistogram.Benchmarking/ && git commit -m "bench: add benchmarks for baseline capture"` - Run benchmarks in Release configuration. Use `--filter` to target only the relevant benchmarks. - BenchmarkDotNet outputs markdown tables to stdout and detailed results to `BenchmarkDotNet.Artifacts/`. Copy the results table into `plan/benchmarks/baseline.md`. @@ -43,6 +43,6 @@ Post-change validation (Phase 3 benchmark tasks): - If any benchmark shows a regression, flag it in comparison.md and add a new task to investigate. Benchmark execution notes: -- Use `--job short` for faster feedback during development iterations. -- For the final Phase 3 comparison, use default BenchmarkDotNet settings (no `--job short`). -- Benchmarks that fail to compile or run must be fixed before proceeding. \ No newline at end of file +- Both Phase 1 (baseline) and Phase 3 (final comparison) MUST use default BenchmarkDotNet settings — do NOT use `--job short` for these runs. +- Use `--job short` ONLY for ad-hoc iteration during Phase 2 development. +- Benchmarks that fail to compile or run must be fixed before proceeding. 
diff --git a/spec/tech-standards/testing-standards.md b/spec/tech-standards/testing-standards.md index 32e5834..340aba9 100644 --- a/spec/tech-standards/testing-standards.md +++ b/spec/tech-standards/testing-standards.md @@ -240,11 +240,8 @@ dotnet run -c Release --project HdrHistogram.Benchmarking/ -- --filter '*ClassNa ### Existing Benchmarks -| Class | Location | What it measures | -|-------|----------|-----------------| -| `LeadingZeroCount64BitBenchmark` | `LeadingZeroCount/` | Bit counting algorithm comparison | -| `LeadingZeroCount32BitBenchmark` | `LeadingZeroCount/` | 32-bit bit counting comparison | -| `Recording32BitBenchmark` | `Recording/` | Value recording throughput | +Check `HdrHistogram.Benchmarking/` for the current set of benchmark classes. +Each subdirectory groups benchmarks by area (e.g. `LeadingZeroCount/`, `Recording/`). ## Test Organization Guidelines
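
The benchmark design guidelines introduced in this series can be sketched as a class like the following. This is a minimal illustration of the prescribed shape (`[MemoryDiagnoser]`, pre-allocation in `[GlobalSetup]`, per-invocation state reset, a clear method name); the class name, data sizes, and the encode loop are hypothetical placeholders, not code from the repository, and assume only the public BenchmarkDotNet attribute API:

```csharp
using System;
using BenchmarkDotNet.Attributes;

// Illustrative only: real benchmarks live in HdrHistogram.Benchmarking/
// and are registered with the BenchmarkSwitcher in Program.cs.
[MemoryDiagnoser] // guideline: surface allocation counts on every benchmark class
public class EncodeRoundTripBenchmark
{
    private long[] _values = Array.Empty<long>();
    private byte[] _buffer = Array.Empty<byte>();

    [GlobalSetup]
    public void Setup()
    {
        // Pre-allocate here so setup cost is excluded from the measurements.
        var rng = new Random(42); // fixed seed for repeatable runs
        _values = new long[10_000];
        for (int i = 0; i < _values.Length; i++)
            _values[i] = rng.Next(1, 1_000_000);
        _buffer = new byte[_values.Length * sizeof(long)];
    }

    [Benchmark]
    public int Encode()
    {
        // Reset position inside the method, matching the BenchmarkDotNet iteration model.
        int position = 0;
        foreach (var v in _values)
        {
            BitConverter.TryWriteBytes(_buffer.AsSpan(position), v);
            position += sizeof(long);
        }
        return position; // returning a result prevents dead-code elimination
    }
}
```

A class of this shape would be run with the filter syntax shown in the standards document, e.g. `dotnet run -c Release --project HdrHistogram.Benchmarking/ -- --filter '*EncodeRoundTripBenchmark*'`.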