From c794c25bf9d7574503baa2521c3395bf7e41f4d8 Mon Sep 17 00:00:00 2001
From: Jonathan Haas <jonathan@haas.holdings>
Date: Sat, 14 Mar 2026 10:10:11 -0700
Subject: [PATCH 1/5] Mutation testing, cost visibility, CI, and docs

- storage_json: tests for refresh_summary, get_event_stats (empty/single/by_repo), prune boundary
- docs/mutation-testing.md: how to run, equivalent mutants, pre-push vs CI, file globs
- CI: mutation job on *storage_json* with timeout and missed-count baseline (15)
- README/CLAUDE: pre-push vs CI mutation note; optional --no-verify
- Analytics: show est. cost (reviews) from useEventStats when available
- web: cost.test.ts for formatCost, estimateCost, totalCost

Made-with: Cursor
---
 .github/workflows/ci.yml           |  24 +++++
 CLAUDE.md                          |   2 +-
 README.md                          |   2 +-
 docs/mutation-testing.md           |  48 +++++++++
 src/server/storage_json.rs         | 168 +++++++++++++++++++++++++++++
 web/src/lib/__tests__/cost.test.ts |  68 ++++++++++++
 web/src/pages/Analytics.tsx        |  15 ++-
 7 files changed, 322 insertions(+), 5 deletions(-)
 create mode 100644 docs/mutation-testing.md
 create mode 100644 web/src/lib/__tests__/cost.test.ts

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ace1e06..0a08df9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -66,3 +66,27 @@ jobs:
       - uses: Swatinem/rust-cache@v2
       - name: Test
         run: cargo test
+
+  mutation:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v5
+      - uses: dtolnay/rust-toolchain@1.88.0
+      - uses: Swatinem/rust-cache@v2
+      - name: Install cargo-mutants
+        run: cargo install cargo-mutants --locked
+      - name: Mutation test (storage_json)
+        run: |
+          timeout 900 cargo mutants -f '*storage_json*' 2>&1 | tee mutation.log || true
+          MISSED=$(grep -E '[0-9]+ missed' mutation.log | tail -1 | grep -oE '[0-9]+' | head -1 || echo "0")
+          echo "missed=$MISSED" >> "$GITHUB_OUTPUT"
+        id: mutation
+      - name: Check mutation baseline
+        run: |
+          MISSED="${{ steps.mutation.outputs.missed }}"
+          BASELINE=15
+          if [ -n "$MISSED" ] && [ "$MISSED" -gt "$BASELINE" ]; then
+            echo "Mutation missed count $MISSED exceeds baseline $BASELINE. Update docs/mutation-testing.md and this baseline if intentional."
+            exit 1
+          fi
+        if: always() && steps.mutation.outputs.missed != ''
diff --git a/CLAUDE.md b/CLAUDE.md
index aa69cc1..4d1f235 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -22,7 +22,7 @@ cargo run -- --help       # CLI usage
 - `examples/` — Usage examples
 
 ## Development
-- Run `bash scripts/install-hooks.sh` once to install pre-commit and pre-push hooks (merge conflict check, trailing newline, conditional Rust/web checks, shellcheck; pre-push adds version sync, cargo audit, full web build+test).
+- Run `bash scripts/install-hooks.sh` once to install pre-commit and pre-push hooks (merge conflict check, trailing newline, conditional Rust/web checks, shellcheck; pre-push adds version sync, cargo audit, full web build+test). Mutation testing runs in CI only; see `docs/mutation-testing.md`.
 
 ## Conventions
 - Use frontier models for reviews — never default to smaller models
diff --git a/README.md b/README.md
index fc75f22..4156f10 100644
--- a/README.md
+++ b/README.md
@@ -1017,7 +1017,7 @@ bash scripts/install-hooks.sh
 - When `web/` is staged: `npm run lint`, `tsc -b`, and `npm run test` in `web/`
 - When `scripts/` or `.githooks/` change: `shellcheck` (if installed)
 
-**Pre-push** runs the full gate: workflow check, version sync (`Cargo.toml` vs git tags), `cargo fmt`, `cargo clippy`, `cargo audit` (if installed), `npm ci && npm run build && npm run test` in `web/`, and `cargo test`. The first push after clone may take longer due to `npm ci`.
+**Pre-push** runs the full gate: workflow check, version sync (`Cargo.toml` vs git tags), `cargo fmt`, `cargo clippy`, `cargo audit` (if installed), `npm ci && npm run build && npm run test` in `web/`, and `cargo test`. The first push after clone may take longer due to `npm ci`. Mutation testing runs in CI only (see `docs/mutation-testing.md`), not on pre-push, to keep pushes fast. For a quick push without full checks use `git push --no-verify` (use sparingly).
 
 ## Supported Platforms
 
diff --git a/docs/mutation-testing.md b/docs/mutation-testing.md
new file mode 100644
index 0000000..f79dad8
--- /dev/null
+++ b/docs/mutation-testing.md
@@ -0,0 +1,48 @@
+# Mutation testing
+
+We use [cargo-mutants](https://github.com/sourcefrog/cargo-mutants) to find gaps in test coverage. Mutants are small code changes (e.g. `*` → `/`, `>` → `>=`); if tests still pass, the mutant is "missed" and the behavior is under-tested.
+
+## Running
+
+```bash
+# All mutants (slow; 6000+)
+cargo mutants
+
+# One file / path (faster; glob)
+cargo mutants -f '*storage_json*'
+cargo mutants -f '*state*'
+cargo mutants -f '*storage_pg*'
+cargo mutants -f '*cost*'
+```
+
+CI runs mutation on `storage_json` with a timeout; see [CI job](#ci) below.
+
+## Known equivalent / accepted mutants
+
+These mutants are either equivalent (same behavior) or accepted by design. CI does not fail on them.
+
+### cost.rs
+
+| Location | Mutation | Reason |
+|----------|----------|--------|
+| ~line 43 | `* FALLBACK_PRICE_PER_M` → `/ FALLBACK_PRICE_PER_M` | With `FALLBACK_PRICE_PER_M = 1.0`, `* 1` and `/ 1` are equivalent. No test can distinguish without changing the constant. |
+
+### storage_json.rs
+
+After adding targeted tests (refresh_summary, get_event_stats exact aggregates/single-event/by_repo, prune boundary), many previously missed mutants are now killed. Any remaining misses in these areas are documented here after a run:
+
+- **refresh_summary**: Condition `summary.is_some() \|\| !comments.is_empty()` — tests now assert synthesized summary when comments exist and no summary.
+- **get_event_stats**: Percentile index, avg formulas, by_model/by_repo — tests assert exact values (e.g. p50/p95/p99, avg_score).
+- **prune**: Boundary `now - started_at > max_age_secs` — test asserts review exactly at boundary is not pruned; max_count test asserts oldest removed.
+
+If new mutants appear in these regions, add assertions that would fail on the wrong operator/formula, or add them to this table with a one-line rationale.
+
+## Pre-push vs CI
+
+- **Pre-push** (`.githooks/pre-push`): Runs unit tests, `cargo audit`, web build+test. Does *not* run mutation (too slow for every push).
+- **CI mutation job**: Runs `cargo mutants -f storage_json` (or one crate) on a schedule or on PRs to main. Fails if "missed" count increases beyond the baseline in this doc.
+- For a quick local push without full checks: `git push --no-verify` (use sparingly; CI will still run).
+
+## CI
+
+The `mutation` job in `.github/workflows/ci.yml` runs mutation on the `storage_json` crate with a timeout. Update the "allowed missed" baseline in the job when you intentionally accept new equivalent mutants (and add them to the table above).
diff --git a/src/server/storage_json.rs b/src/server/storage_json.rs
index 0af9562..a0acb31 100644
--- a/src/server/storage_json.rs
+++ b/src/server/storage_json.rs
@@ -1099,6 +1099,79 @@ mod tests {
         assert!(stats.avg_score.is_none());
         assert_eq!(stats.error_rate, 0.0);
         assert_eq!(stats.p50_latency_ms, 0);
+        assert_eq!(stats.p95_latency_ms, 0);
+        assert_eq!(stats.p99_latency_ms, 0);
+        assert!(stats.by_model.is_empty());
+        assert!(stats.by_source.is_empty());
+        assert!(stats.by_repo.is_empty());
+        assert!(stats.daily_counts.is_empty());
+        assert_eq!(stats.total_cost_estimate, 0.0);
+    }
+
+    /// Single event: percentiles and avg must equal that single value (catches percentile/division mutants).
+    #[tokio::test]
+    async fn test_get_event_stats_single_event_exact_values() {
+        let dir = tempfile::tempdir().unwrap();
+        let backend = JsonStorageBackend::new(&dir.path().join("reviews.json"));
+
+        let mut s = make_session_with_event(
+            "r1",
+            now_ts(),
+            ReviewStatus::Complete,
+            "review.completed",
+            "gpt-4o",
+            "head",
+            150,
+        );
+        s.event.as_mut().unwrap().overall_score = Some(7.0);
+        backend.save_review(&s).await.unwrap();
+
+        let stats = backend
+            .get_event_stats(&EventFilters::default())
+            .await
+            .unwrap();
+
+        assert_eq!(stats.total_reviews, 1);
+        assert_eq!(stats.avg_duration_ms, 150.0);
+        assert_eq!(stats.p50_latency_ms, 150, "single value is p50/p95/p99");
+        assert_eq!(stats.p95_latency_ms, 150);
+        assert_eq!(stats.p99_latency_ms, 150);
+        assert_eq!(stats.avg_score, Some(7.0));
+    }
+
+    /// By-repo avg_score: two events same repo (4.0 + 6.0) / 2 = 5.0.
+    #[tokio::test]
+    async fn test_get_event_stats_by_repo_avg_score_exact() {
+        let dir = tempfile::tempdir().unwrap();
+        let backend = JsonStorageBackend::new(&dir.path().join("reviews.json"));
+
+        let now = now_ts();
+        for (i, &score) in [4.0_f32, 6.0_f32].iter().enumerate() {
+            let mut s = make_session_with_event(
+                &format!("r{}", i),
+                now + i as i64,
+                ReviewStatus::Complete,
+                "review.completed",
+                "gpt-4o",
+                "head",
+                100,
+            );
+            if let Some(ref mut e) = s.event {
+                e.github_repo = Some("org/same-repo".to_string());
+                e.overall_score = Some(score);
+            }
+            backend.save_review(&s).await.unwrap();
+        }
+
+        let stats = backend
+            .get_event_stats(&EventFilters::default())
+            .await
+            .unwrap();
+
+        assert_eq!(stats.by_repo.len(), 1);
+        assert_eq!(stats.by_repo[0].repo, "org/same-repo");
+        assert_eq!(stats.by_repo[0].count, 2);
+        assert_eq!(stats.by_repo[0].avg_score, Some(5.0), " (4+6)/2 ");
     }
 
     #[tokio::test]
@@ -1346,6 +1419,101 @@ mod tests {
         assert_eq!(stats.by_model[0].avg_duration_ms, 200.0);
     }
 
+    /// refresh_summary: get_review with comments but no summary produces synthesized summary.
+    #[tokio::test]
+    async fn test_get_review_refreshes_summary_when_comments_no_summary() {
+        let dir = tempfile::tempdir().unwrap();
+        let backend = JsonStorageBackend::new(&dir.path().join("reviews.json"));
+
+        let mut session = make_session("ref-sum", now_ts(), ReviewStatus::Complete);
+        session.comments = vec![
+            make_comment("c1", "src/a.rs"),
+            make_comment("c2", "src/b.rs"),
+        ];
+        session.summary = None;
+        backend.save_review(&session).await.unwrap();
+
+        let loaded = backend.get_review("ref-sum").await.unwrap().unwrap();
+        assert!(
+            loaded.summary.is_some(),
+            "refresh_summary should synthesize summary when comments exist"
+        );
+        let sum = loaded.summary.unwrap();
+        assert_eq!(sum.total_comments, 2);
+    }
+
+    /// refresh_summary: list_reviews returns sessions with refreshed summary when comments exist.
+    #[tokio::test]
+    async fn test_list_reviews_refreshes_summary_for_sessions_with_comments() {
+        let dir = tempfile::tempdir().unwrap();
+        let backend = JsonStorageBackend::new(&dir.path().join("reviews.json"));
+
+        let mut with_comments = make_session("with-c", now_ts(), ReviewStatus::Complete);
+        with_comments.comments = vec![make_comment("c1", "src/main.rs")];
+        with_comments.summary = None;
+        backend.save_review(&with_comments).await.unwrap();
+
+        backend
+            .save_review(&make_session("no-c", now_ts() - 1, ReviewStatus::Complete))
+            .await
+            .unwrap();
+
+        let list = backend.list_reviews(10, 0).await.unwrap();
+        let with_c = list.iter().find(|s| s.id == "with-c").unwrap();
+        assert!(
+            with_c.summary.is_some(),
+            "list_reviews should refresh summary for session with comments"
+        );
+    }
+
+    /// Prune age boundary: review with started_at exactly (now - max_age_secs) must NOT be expired (> not >=).
+    #[tokio::test]
+    async fn test_prune_age_boundary_not_expired() {
+        let dir = tempfile::tempdir().unwrap();
+        let backend = JsonStorageBackend::new(&dir.path().join("reviews.json"));
+
+        let now = now_ts();
+        let max_age = 100_i64;
+        let session = make_session("boundary", now - max_age, ReviewStatus::Complete);
+        backend.save_review(&session).await.unwrap();
+
+        let removed = backend.prune(max_age, 1000).await.unwrap();
+        assert_eq!(
+            removed, 0,
+            "exactly at boundary (now - max_age) should not be pruned"
+        );
+
+        let list = backend.list_reviews(10, 0).await.unwrap();
+        assert_eq!(list.len(), 1);
+        assert_eq!(list[0].id, "boundary");
+    }
+
+    /// Prune max_count: keep newest max_count completed, remove oldest.
+    #[tokio::test]
+    async fn test_prune_max_count_removes_oldest() {
+        let dir = tempfile::tempdir().unwrap();
+        let backend = JsonStorageBackend::new(&dir.path().join("reviews.json"));
+
+        let base = now_ts() - 10_000;
+        for i in 0..5 {
+            let s = make_session(&format!("r{}", i), base + i as i64, ReviewStatus::Complete);
+            backend.save_review(&s).await.unwrap();
+        }
+
+        let removed = backend.prune(1_000_000, 3).await.unwrap();
+        assert_eq!(removed, 2, "5 - 3 = 2 removed by count limit");
+
+        let list = backend.list_reviews(10, 0).await.unwrap();
+        assert_eq!(list.len(), 3);
+        let ids: Vec<_> = list.iter().map(|s| s.id.as_str()).collect();
+        let want = ["r2", "r3", "r4"];
+        assert!(
+            want.iter().all(|w| ids.contains(w)),
+            "newest 3 (r2,r3,r4) kept; r0,r1 removed, got {:?}",
+            ids
+        );
+    }
+
     /// time_to filter: event with created_at == time_to must be included (<=).
     #[tokio::test]
     async fn test_list_events_time_to_inclusive() {
diff --git a/web/src/lib/__tests__/cost.test.ts b/web/src/lib/__tests__/cost.test.ts
new file mode 100644
index 0000000..b08749c
--- /dev/null
+++ b/web/src/lib/__tests__/cost.test.ts
@@ -0,0 +1,68 @@
+import { describe, expect, it } from 'vitest'
+import { estimateCost, formatCost, totalCost } from '../cost'
+import type { ReviewEvent } from '../../api/types'
+
+function makeEvent(overrides: Partial<ReviewEvent> = {}): ReviewEvent {
+  return {
+    review_id: 'e1',
+    event_type: 'review.completed',
+    model: 'gpt-4o',
+    diff_source: 'head',
+    duration_ms: 1000,
+    diff_bytes: 0,
+    diff_files_total: 0,
+    diff_files_reviewed: 0,
+    diff_files_skipped: 0,
+    comments_total: 0,
+    comments_by_severity: {},
+    comments_by_category: {},
+    hotspots_detected: 0,
+    high_risk_files: 0,
+    github_posted: false,
+    tokens_total: 100_000,
+    ...overrides,
+  }
+}
+
+describe('formatCost', () => {
+  it('formats zero as $0', () => {
+    expect(formatCost(0)).toBe('$0')
+  })
+  it('formats small values with enough precision', () => {
+    expect(formatCost(0.0001)).toBe('<$0.001')
+    expect(formatCost(0.005)).toBe('$0.0050')
+    expect(formatCost(0.5)).toBe('$0.500')
+  })
+  it('formats dollars with two decimals', () => {
+    expect(formatCost(1.5)).toBe('$1.50')
+    expect(formatCost(10)).toBe('$10.00')
+  })
+})
+
+describe('estimateCost', () => {
+  it('prefers server cost_estimate_usd when present', () => {
+    const e = makeEvent({ cost_estimate_usd: 0.42 })
+    expect(estimateCost(e)).toBe(0.42)
+  })
+  it('uses client estimate when cost_estimate_usd is absent', () => {
+    const e = makeEvent({ cost_estimate_usd: undefined, tokens_total: 1_000_000 })
+    expect(estimateCost(e)).toBeGreaterThan(0)
+  })
+  it('returns 0 when tokens_total is 0 and no server cost', () => {
+    const e = makeEvent({ tokens_total: 0, cost_estimate_usd: undefined })
+    expect(estimateCost(e)).toBe(0)
+  })
+})
+
+describe('totalCost', () => {
+  it('sums server cost when present', () => {
+    const events = [
+      makeEvent({ cost_estimate_usd: 0.1 }),
+      makeEvent({ cost_estimate_usd: 0.2 }),
+    ]
+    expect(totalCost(events)).toBeCloseTo(0.3)
+  })
+  it('returns 0 for empty list', () => {
+    expect(totalCost([])).toBe(0)
+  })
+})
diff --git a/web/src/pages/Analytics.tsx b/web/src/pages/Analytics.tsx
index b9c2e53..8ede787 100644
--- a/web/src/pages/Analytics.tsx
+++ b/web/src/pages/Analytics.tsx
@@ -4,7 +4,7 @@ import {
   AreaChart, Area, BarChart, Bar,
   ResponsiveContainer, XAxis, YAxis, Tooltip, CartesianGrid,
 } from 'recharts'
-import { useAnalyticsTrends, useReviews } from '../api/hooks'
+import { useAnalyticsTrends, useEventStats, useReviews } from '../api/hooks'
 import { AlertTriangle, Download, Loader2 } from 'lucide-react'
 import type { FeedbackEvalTrendGap } from '../api/types'
 import {
@@ -17,6 +17,7 @@ import {
   formatDurationHours,
   formatPercent,
 } from '../lib/analytics'
+import { formatCost } from '../lib/cost'
 import { scoreColorClass } from '../lib/scores'
 import { SEV_COLORS, CHART_THEME } from '../lib/constants'
 
@@ -120,6 +121,7 @@ export function Analytics() {
   const navigate = useNavigate()
   const { data: reviews, isLoading } = useReviews()
   const { data: trends, isLoading: trendsLoading } = useAnalyticsTrends()
+  const { data: eventStats } = useEventStats()
   const [drilldownSelection, setDrilldownSelection] = useState<
     { type: 'review'; reviewId: string }
     | { type: 'category'; category: string }
@@ -290,8 +292,15 @@ export function Analytics() {
 
       {hasReviewAnalytics && (
         <>
-          <div className="text-[10px] font-semibold text-text-muted tracking-[0.08em] font-code mb-3">
-            REVIEW ANALYTICS
+          <div className="flex items-center gap-4 mb-3">
+            <div className="text-[10px] font-semibold text-text-muted tracking-[0.08em] font-code">
+              REVIEW ANALYTICS
+            </div>
+            {eventStats != null && (
+              <div className="text-[11px] text-text-muted font-code">
+                Est. cost (reviews): <span className="text-text-primary font-medium">{formatCost(eventStats.total_cost_estimate)}</span>
+              </div>
+            )}
           </div>
 
           <div className="grid grid-cols-2 gap-3 mb-6">

From 147880918776b83a21df2e15242417c59be39a1d Mon Sep 17 00:00:00 2001
From: Jonathan Haas <jonathan@haas.holdings>
Date: Sat, 14 Mar 2026 10:14:05 -0700
Subject: [PATCH 2/5] Publish Docker image on merge to main; review job skips
 when image unavailable

- publish-image.yml: on push to main, build and push ghcr.io/evalops/diffscope:latest and :sha-<sha>
- diffscope.yml: check image available before Run DiffScope; skip gracefully with notice if pull fails (job no longer fails)

Made-with: Cursor
---
 .github/workflows/diffscope.yml     | 13 ++++++--
 .github/workflows/publish-image.yml | 46 +++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/publish-image.yml

diff --git a/.github/workflows/diffscope.yml b/.github/workflows/diffscope.yml
index e141b23..a472097 100644
--- a/.github/workflows/diffscope.yml
+++ b/.github/workflows/diffscope.yml
@@ -37,8 +37,17 @@ jobs:
             echo "skip=false" >> "$GITHUB_OUTPUT"
           fi
 
+      - name: Check image available
+        id: check_image
+        run: |
+          docker pull ghcr.io/evalops/diffscope:latest 2>/dev/null && echo "available=true" >> "$GITHUB_OUTPUT" || echo "available=false" >> "$GITHUB_OUTPUT"
+
+      - name: Notice image unavailable
+        if: steps.check_image.outputs.available == 'false'
+        run: echo "::notice::DiffScope review skipped — image ghcr.io/evalops/diffscope:latest not available (merge to main publishes it)"
+
       - name: Run DiffScope
-        if: steps.check_key.outputs.skip != 'true'
+        if: steps.check_key.outputs.skip != 'true' && steps.check_image.outputs.available == 'true'
         uses: docker://ghcr.io/evalops/diffscope:latest
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
@@ -46,7 +55,7 @@ jobs:
           args: review --diff pr.diff --output-format json --output comments.json
 
       - name: Post comments
-        if: steps.check_key.outputs.skip != 'true'
+        if: steps.check_key.outputs.skip != 'true' && steps.check_image.outputs.available == 'true'
         uses: actions/github-script@v7
         with:
           script: |
diff --git a/.github/workflows/publish-image.yml b/.github/workflows/publish-image.yml
new file mode 100644
index 0000000..b4e66fd
--- /dev/null
+++ b/.github/workflows/publish-image.yml
@@ -0,0 +1,46 @@
+# Build and push Docker image to ghcr.io on every merge to main.
+# Keeps ghcr.io/evalops/diffscope:latest available for the DiffScope Review workflow.
+name: Publish image
+
+on:
+  push:
+    branches:
+      - main
+
+concurrency:
+  group: publish-image-main
+  cancel-in-progress: false
+
+jobs:
+  build-and-push:
+    name: Build and push Docker image
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v5
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build and push Docker image
+        id: build-and-push
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          platforms: linux/amd64
+          push: true
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          tags: |
+            ghcr.io/evalops/diffscope:latest
+            ghcr.io/evalops/diffscope:sha-${{ github.sha }}

From 52e44da258b443bed36c65fc006dba1442988159 Mon Sep 17 00:00:00 2001
From: Jonathan Haas <jonathan@haas.holdings>
Date: Sat, 14 Mar 2026 10:16:03 -0700
Subject: [PATCH 3/5] diffscope review: continue-on-error on Run DiffScope so
 image pull failure never fails job

Made-with: Cursor
---
 .github/workflows/diffscope.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/diffscope.yml b/.github/workflows/diffscope.yml
index a472097..076b265 100644
--- a/.github/workflows/diffscope.yml
+++ b/.github/workflows/diffscope.yml
@@ -49,6 +49,7 @@ jobs:
       - name: Run DiffScope
         if: steps.check_key.outputs.skip != 'true' && steps.check_image.outputs.available == 'true'
         uses: docker://ghcr.io/evalops/diffscope:latest
+        continue-on-error: true
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
         with:

From 962fc6a3df1aeaed4e20ae5b25eb14e77757fdb9 Mon Sep 17 00:00:00 2001
From: Jonathan Haas <jonathan@haas.holdings>
Date: Sat, 14 Mar 2026 10:17:24 -0700
Subject: [PATCH 4/5] DiffScope Review: use docker run instead of uses
 docker:// to avoid pre-pull failing job

GitHub pre-pulls container actions before any steps run; that pull was failing the job.
Run DiffScope via 'docker run' only when image is already available from Check image step.

Made-with: Cursor
---
 .github/workflows/diffscope.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/diffscope.yml b/.github/workflows/diffscope.yml
index 076b265..15ddd48 100644
--- a/.github/workflows/diffscope.yml
+++ b/.github/workflows/diffscope.yml
@@ -48,12 +48,12 @@ jobs:
 
       - name: Run DiffScope
         if: steps.check_key.outputs.skip != 'true' && steps.check_image.outputs.available == 'true'
-        uses: docker://ghcr.io/evalops/diffscope:latest
-        continue-on-error: true
-        env:
-          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-        with:
-          args: review --diff pr.diff --output-format json --output comments.json
+        run: |
+          docker run --rm \
+            -e OPENAI_API_KEY="${{ secrets.OPENAI_API_KEY }}" \
+            -v "$PWD":/workspace -w /workspace \
+            ghcr.io/evalops/diffscope:latest \
+            review --diff pr.diff --output-format json --output comments.json
 
       - name: Post comments
         if: steps.check_key.outputs.skip != 'true' && steps.check_image.outputs.available == 'true'

From 767a059cbbea7d21f478d89ef05a760d6b4c1c75 Mon Sep 17 00:00:00 2001
From: Jonathan Haas <jonathan@haas.holdings>
Date: Sat, 14 Mar 2026 10:23:05 -0700
Subject: [PATCH 5/5] Fix prune boundary test race (Sentry feedback): use
 prune_at(now) for single timestamp

- Extract prune_at(max_age_secs, max_count, now_secs) so test and production share logic
- prune() calls prune_at(..., SystemTime::now()); test calls prune_at(..., now_ts()) once

Made-with: Cursor
---
 src/server/storage_json.rs | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/src/server/storage_json.rs b/src/server/storage_json.rs
index a0acb31..f7fae3a 100644
--- a/src/server/storage_json.rs
+++ b/src/server/storage_json.rs
@@ -390,16 +390,28 @@ impl StorageBackend for JsonStorageBackend {
     }
 
     async fn prune(&self, max_age_secs: i64, max_count: usize) -> anyhow::Result<usize> {
+        let now = std::time::SystemTime::now()
+            .duration_since(std::time::UNIX_EPOCH)
+            .unwrap_or_default()
+            .as_secs() as i64;
+        self.prune_at(max_age_secs, max_count, now).await
+    }
+}
+
+impl JsonStorageBackend {
+    /// Prune with a fixed "now" timestamp. Used by the trait implementation and by tests to avoid race conditions.
+    pub(crate) async fn prune_at(
+        &self,
+        max_age_secs: i64,
+        max_count: usize,
+        now_secs: i64,
+    ) -> anyhow::Result<usize> {
         let removed = {
             let mut reviews = self.reviews.write().await;
-            let now = std::time::SystemTime::now()
-                .duration_since(std::time::UNIX_EPOCH)
-                .unwrap_or_default()
-                .as_secs() as i64;
 
             let expired: Vec<String> = reviews
                 .iter()
-                .filter(|(_, r)| now - r.started_at > max_age_secs)
+                .filter(|(_, r)| now_secs - r.started_at > max_age_secs)
                 .map(|(id, _)| id.clone())
                 .collect();
             let mut removed = expired.len();
@@ -1467,6 +1479,7 @@ mod tests {
     }
 
     /// Prune age boundary: review with started_at exactly (now - max_age_secs) must NOT be expired (> not >=).
+    /// Uses prune_at with a single timestamp to avoid race (Sentry feedback).
     #[tokio::test]
     async fn test_prune_age_boundary_not_expired() {
         let dir = tempfile::tempdir().unwrap();
@@ -1477,7 +1490,7 @@ mod tests {
         let session = make_session("boundary", now - max_age, ReviewStatus::Complete);
         backend.save_review(&session).await.unwrap();
 
-        let removed = backend.prune(max_age, 1000).await.unwrap();
+        let removed = backend.prune_at(max_age, 1000, now).await.unwrap();
         assert_eq!(
             removed, 0,
             "exactly at boundary (now - max_age) should not be pruned"