Merged
24 changes: 24 additions & 0 deletions .github/workflows/ci.yml
@@ -66,3 +66,27 @@ jobs:
- uses: Swatinem/rust-cache@v2
- name: Test
run: cargo test

mutation:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v5
- uses: dtolnay/rust-toolchain@1.88.0
- uses: Swatinem/rust-cache@v2
- name: Install cargo-mutants
run: cargo install cargo-mutants --locked
- name: Mutation test (storage_json)
run: |
timeout 900 cargo mutants -f '*storage_json*' 2>&1 | tee mutation.log || true
MISSED=$(grep -E '[0-9]+ missed' mutation.log | tail -1 | grep -oE '[0-9]+' | head -1 || echo "0")

Grep extracts wrong number from cargo-mutants summary

High Severity

The grep chain parses the wrong number from the cargo-mutants summary line. The summary format is e.g. 14 mutants tested in 0:08: 2 missed, 9 caught, 3 unviable. The first grep -E '[0-9]+ missed' matches the full line, then grep -oE '[0-9]+' extracts all numbers (14, 0, 08, 2, 9, 3), and head -1 picks "14" (total mutants) instead of "2" (the actual missed count). This causes MISSED to be set to the total mutant count, making the baseline check nearly always fail spuriously.
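One way to fix this (a sketch, assuming the cargo-mutants summary keeps the `N missed, M caught` wording) is to anchor the first grep on the `missed` token itself, so only the count directly in front of it survives:

```shell
# Sample summary line in the format cargo-mutants prints (assumed stable):
echo '14 mutants tested in 0:08: 2 missed, 9 caught, 3 unviable' > mutation.log

# Anchor on the "missed" token so only its count is extracted, not the total.
MISSED=$(grep -oE '[0-9]+ missed' mutation.log | tail -1 | grep -oE '[0-9]+' || echo "0")
echo "missed=$MISSED"   # prints "missed=2", not "missed=14"
```

The first `grep -oE '[0-9]+ missed'` emits only `2 missed`, so the second `grep -oE '[0-9]+'` can no longer pick up the total mutant count or the timing digits.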


echo "missed=$MISSED" >> "$GITHUB_OUTPUT"
id: mutation
- name: Check mutation baseline
run: |
MISSED="${{ steps.mutation.outputs.missed }}"
BASELINE=15
if [ -n "$MISSED" ] && [ "$MISSED" -gt "$BASELINE" ]; then
echo "Mutation missed count $MISSED exceeds baseline $BASELINE. Update docs/mutation-testing.md and this baseline if intentional."
exit 1
fi
if: always() && steps.mutation.outputs.missed != ''
24 changes: 17 additions & 7 deletions .github/workflows/diffscope.yml
@@ -37,16 +37,26 @@ jobs:
echo "skip=false" >> "$GITHUB_OUTPUT"
fi

- name: Check image available
id: check_image
run: |
docker pull ghcr.io/evalops/diffscope:latest 2>/dev/null && echo "available=true" >> "$GITHUB_OUTPUT" || echo "available=false" >> "$GITHUB_OUTPUT"

- name: Notice image unavailable
if: steps.check_image.outputs.available == 'false'
run: echo "::notice::DiffScope review skipped — image ghcr.io/evalops/diffscope:latest not available (merge to main publishes it)"

- name: Run DiffScope
if: steps.check_key.outputs.skip != 'true'
uses: docker://ghcr.io/evalops/diffscope:latest
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
with:
args: review --diff pr.diff --output-format json --output comments.json
if: steps.check_key.outputs.skip != 'true' && steps.check_image.outputs.available == 'true'
run: |
docker run --rm \
-e OPENAI_API_KEY="${{ secrets.OPENAI_API_KEY }}" \
-v "$PWD":/workspace -w /workspace \
ghcr.io/evalops/diffscope:latest \
review --diff pr.diff --output-format json --output comments.json

- name: Post comments
if: steps.check_key.outputs.skip != 'true'
if: steps.check_key.outputs.skip != 'true' && steps.check_image.outputs.available == 'true'
uses: actions/github-script@v7
with:
script: |
46 changes: 46 additions & 0 deletions .github/workflows/publish-image.yml
@@ -0,0 +1,46 @@
# Build and push Docker image to ghcr.io on every merge to main.
# Keeps ghcr.io/evalops/diffscope:latest available for the DiffScope Review workflow.
name: Publish image

on:
push:
branches:
- main

concurrency:
group: publish-image-main
cancel-in-progress: false

jobs:
build-and-push:
name: Build and push Docker image
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
steps:
- name: Checkout code
uses: actions/checkout@v5

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Build and push Docker image
id: build-and-push
uses: docker/build-push-action@v6
with:
context: .
platforms: linux/amd64
push: true
cache-from: type=gha
cache-to: type=gha,mode=max
tags: |
ghcr.io/evalops/diffscope:latest
ghcr.io/evalops/diffscope:sha-${{ github.sha }}
2 changes: 1 addition & 1 deletion CLAUDE.md
@@ -22,7 +22,7 @@ cargo run -- --help # CLI usage
- `examples/` — Usage examples

## Development
- Run `bash scripts/install-hooks.sh` once to install pre-commit and pre-push hooks (merge conflict check, trailing newline, conditional Rust/web checks, shellcheck; pre-push adds version sync, cargo audit, full web build+test).
- Run `bash scripts/install-hooks.sh` once to install pre-commit and pre-push hooks (merge conflict check, trailing newline, conditional Rust/web checks, shellcheck; pre-push adds version sync, cargo audit, full web build+test). Mutation testing runs in CI only; see `docs/mutation-testing.md`.

## Conventions
- Use frontier models for reviews — never default to smaller models
2 changes: 1 addition & 1 deletion README.md
@@ -1017,7 +1017,7 @@ bash scripts/install-hooks.sh
- When `web/` is staged: `npm run lint`, `tsc -b`, and `npm run test` in `web/`
- When `scripts/` or `.githooks/` change: `shellcheck` (if installed)

**Pre-push** runs the full gate: workflow check, version sync (`Cargo.toml` vs git tags), `cargo fmt`, `cargo clippy`, `cargo audit` (if installed), `npm ci && npm run build && npm run test` in `web/`, and `cargo test`. The first push after clone may take longer due to `npm ci`.
**Pre-push** runs the full gate: workflow check, version sync (`Cargo.toml` vs git tags), `cargo fmt`, `cargo clippy`, `cargo audit` (if installed), `npm ci && npm run build && npm run test` in `web/`, and `cargo test`. The first push after clone may take longer due to `npm ci`. Mutation testing runs in CI only (see `docs/mutation-testing.md`), not on pre-push, to keep pushes fast. For a quick push without full checks use `git push --no-verify` (use sparingly).

## Supported Platforms

48 changes: 48 additions & 0 deletions docs/mutation-testing.md
@@ -0,0 +1,48 @@
# Mutation testing

We use [cargo-mutants](https://github.com/sourcefrog/cargo-mutants) to find gaps in test coverage. Mutants are small code changes (e.g. `*` → `/`, `>` → `>=`); if tests still pass, the mutant is "missed" and the behavior is under-tested.
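To illustrate with a hypothetical function (not from this codebase): a test that only probes well past a boundary also passes under the `>` → `>=` mutant, so the mutant is reported as missed; asserting the exact boundary kills it.

```rust
// Hypothetical example: an age check using a strict `>`, like prune's
// `now - started_at > max_age_secs`. cargo-mutants would try `>=` here.
fn is_expired(age_secs: i64, max_age_secs: i64) -> bool {
    age_secs > max_age_secs
}

fn main() {
    // This assertion also holds under the `>=` mutant, so on its own the
    // mutant would survive ("missed"):
    assert!(is_expired(200, 100));
    // Asserting the exact boundary kills the mutant: under `>=` this fails.
    assert!(!is_expired(100, 100));
}
```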

## Running

```bash
# All mutants (slow; 6000+)
cargo mutants

# One file / path (faster; glob)
cargo mutants -f '*storage_json*'
cargo mutants -f '*state*'
cargo mutants -f '*storage_pg*'
cargo mutants -f '*cost*'
```

CI runs mutation on `storage_json` with a timeout; see [CI job](#ci) below.

## Known equivalent / accepted mutants

These mutants are either equivalent (same behavior) or accepted by design. CI does not fail on them.

### cost.rs

| Location | Mutation | Reason |
|----------|----------|--------|
| ~line 43 | `* FALLBACK_PRICE_PER_M` → `/ FALLBACK_PRICE_PER_M` | With `FALLBACK_PRICE_PER_M = 1.0`, `* 1` and `/ 1` are equivalent. No test can distinguish without changing the constant. |

### storage_json.rs

After adding targeted tests (refresh_summary, get_event_stats exact aggregates/single-event/by_repo, prune boundary), many previously missed mutants are now killed. Any remaining misses in these areas are documented here after a run:

- **refresh_summary**: Condition `summary.is_some() || !comments.is_empty()` — tests now assert synthesized summary when comments exist and no summary.
- **get_event_stats**: Percentile index, avg formulas, by_model/by_repo — tests assert exact values (e.g. p50/p95/p99, avg_score).
- **prune**: Boundary `now - started_at > max_age_secs` — test asserts review exactly at boundary is not pruned; max_count test asserts oldest removed.

If new mutants appear in these regions, add assertions that would fail on the wrong operator/formula, or add them to this table with a one-line rationale.

## Pre-push vs CI

- **Pre-push** (`.githooks/pre-push`): Runs unit tests, `cargo audit`, web build+test. Does *not* run mutation (too slow for every push).
- **CI mutation job**: Runs `cargo mutants -f '*storage_json*'` (or one crate) on a schedule or on PRs to main. Fails if the "missed" count increases beyond the baseline in this doc.
- For a quick local push without full checks: `git push --no-verify` (use sparingly; CI will still run).

## CI

The `mutation` job in `.github/workflows/ci.yml` runs mutation on the `storage_json` crate with a timeout. Update the "allowed missed" baseline in the job when you intentionally accept new equivalent mutants (and add them to the table above).
191 changes: 186 additions & 5 deletions src/server/storage_json.rs
@@ -390,16 +390,28 @@ impl StorageBackend for JsonStorageBackend {
}

async fn prune(&self, max_age_secs: i64, max_count: usize) -> anyhow::Result<usize> {
let now = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_secs() as i64;
self.prune_at(max_age_secs, max_count, now).await
}
}

impl JsonStorageBackend {
/// Prune with a fixed "now" timestamp. Used by the trait implementation and by tests to avoid race conditions.
pub(crate) async fn prune_at(
&self,
max_age_secs: i64,
max_count: usize,
now_secs: i64,
) -> anyhow::Result<usize> {
let removed = {
let mut reviews = self.reviews.write().await;
let now = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_secs() as i64;

let expired: Vec<String> = reviews
.iter()
.filter(|(_, r)| now - r.started_at > max_age_secs)
.filter(|(_, r)| now_secs - r.started_at > max_age_secs)
.map(|(id, _)| id.clone())
.collect();
let mut removed = expired.len();
@@ -1099,6 +1111,79 @@ mod tests {
assert!(stats.avg_score.is_none());
assert_eq!(stats.error_rate, 0.0);
assert_eq!(stats.p50_latency_ms, 0);
assert_eq!(stats.p95_latency_ms, 0);
assert_eq!(stats.p99_latency_ms, 0);
assert!(stats.by_model.is_empty());
assert!(stats.by_source.is_empty());
assert!(stats.by_repo.is_empty());
assert!(stats.daily_counts.is_empty());
assert_eq!(stats.total_cost_estimate, 0.0);
}

/// Single event: percentiles and avg must equal that single value (catches percentile/division mutants).
#[tokio::test]
async fn test_get_event_stats_single_event_exact_values() {
let dir = tempfile::tempdir().unwrap();
let backend = JsonStorageBackend::new(&dir.path().join("reviews.json"));

let mut s = make_session_with_event(
"r1",
now_ts(),
ReviewStatus::Complete,
"review.completed",
"gpt-4o",
"head",
150,
);
s.event.as_mut().unwrap().overall_score = Some(7.0);
backend.save_review(&s).await.unwrap();

let stats = backend
.get_event_stats(&EventFilters::default())
.await
.unwrap();

assert_eq!(stats.total_reviews, 1);
assert_eq!(stats.avg_duration_ms, 150.0);
assert_eq!(stats.p50_latency_ms, 150, "single value is p50/p95/p99");
assert_eq!(stats.p95_latency_ms, 150);
assert_eq!(stats.p99_latency_ms, 150);
assert_eq!(stats.avg_score, Some(7.0));
}

/// By-repo avg_score: two events same repo (4.0 + 6.0) / 2 = 5.0.
#[tokio::test]
async fn test_get_event_stats_by_repo_avg_score_exact() {
let dir = tempfile::tempdir().unwrap();
let backend = JsonStorageBackend::new(&dir.path().join("reviews.json"));

let now = now_ts();
for (i, &score) in [4.0_f32, 6.0_f32].iter().enumerate() {
let mut s = make_session_with_event(
&format!("r{}", i),
now + i as i64,
ReviewStatus::Complete,
"review.completed",
"gpt-4o",
"head",
100,
);
if let Some(ref mut e) = s.event {
e.github_repo = Some("org/same-repo".to_string());
e.overall_score = Some(score);
}
backend.save_review(&s).await.unwrap();
}

let stats = backend
.get_event_stats(&EventFilters::default())
.await
.unwrap();

assert_eq!(stats.by_repo.len(), 1);
assert_eq!(stats.by_repo[0].repo, "org/same-repo");
assert_eq!(stats.by_repo[0].count, 2);
assert_eq!(stats.by_repo[0].avg_score, Some(5.0), "(4+6)/2 = 5.0");
}

#[tokio::test]
@@ -1346,6 +1431,102 @@ mod tests {
assert_eq!(stats.by_model[0].avg_duration_ms, 200.0);
}

/// refresh_summary: get_review with comments but no summary produces synthesized summary.
#[tokio::test]
async fn test_get_review_refreshes_summary_when_comments_no_summary() {
let dir = tempfile::tempdir().unwrap();
let backend = JsonStorageBackend::new(&dir.path().join("reviews.json"));

let mut session = make_session("ref-sum", now_ts(), ReviewStatus::Complete);
session.comments = vec![
make_comment("c1", "src/a.rs"),
make_comment("c2", "src/b.rs"),
];
session.summary = None;
backend.save_review(&session).await.unwrap();

let loaded = backend.get_review("ref-sum").await.unwrap().unwrap();
assert!(
loaded.summary.is_some(),
"refresh_summary should synthesize summary when comments exist"
);
let sum = loaded.summary.unwrap();
assert_eq!(sum.total_comments, 2);
}

/// refresh_summary: list_reviews returns sessions with refreshed summary when comments exist.
#[tokio::test]
async fn test_list_reviews_refreshes_summary_for_sessions_with_comments() {
let dir = tempfile::tempdir().unwrap();
let backend = JsonStorageBackend::new(&dir.path().join("reviews.json"));

let mut with_comments = make_session("with-c", now_ts(), ReviewStatus::Complete);
with_comments.comments = vec![make_comment("c1", "src/main.rs")];
with_comments.summary = None;
backend.save_review(&with_comments).await.unwrap();

backend
.save_review(&make_session("no-c", now_ts() - 1, ReviewStatus::Complete))
.await
.unwrap();

let list = backend.list_reviews(10, 0).await.unwrap();
let with_c = list.iter().find(|s| s.id == "with-c").unwrap();
assert!(
with_c.summary.is_some(),
"list_reviews should refresh summary for session with comments"
);
}

/// Prune age boundary: review with started_at exactly (now - max_age_secs) must NOT be expired (> not >=).
/// Uses prune_at with a single timestamp to avoid race (Sentry feedback).
#[tokio::test]
async fn test_prune_age_boundary_not_expired() {
let dir = tempfile::tempdir().unwrap();
let backend = JsonStorageBackend::new(&dir.path().join("reviews.json"));

let now = now_ts();
let max_age = 100_i64;
let session = make_session("boundary", now - max_age, ReviewStatus::Complete);
backend.save_review(&session).await.unwrap();

let removed = backend.prune_at(max_age, 1000, now).await.unwrap();
assert_eq!(
removed, 0,
"exactly at boundary (now - max_age) should not be pruned"
);

let list = backend.list_reviews(10, 0).await.unwrap();
assert_eq!(list.len(), 1);
assert_eq!(list[0].id, "boundary");
}
Comment on lines +1492 to +1502

Collaborator Author


Addressed: extracted prune_at(max_age_secs, max_count, now_secs) so the test can pass a single now and avoid the second-boundary race. prune() calls prune_at(..., SystemTime::now()); the boundary test calls backend.prune_at(max_age, 1000, now).await with one now_ts().


/// Prune max_count: keep newest max_count completed, remove oldest.
#[tokio::test]
async fn test_prune_max_count_removes_oldest() {
let dir = tempfile::tempdir().unwrap();
let backend = JsonStorageBackend::new(&dir.path().join("reviews.json"));

let base = now_ts() - 10_000;
for i in 0..5 {
let s = make_session(&format!("r{}", i), base + i as i64, ReviewStatus::Complete);
backend.save_review(&s).await.unwrap();
}

let removed = backend.prune(1_000_000, 3).await.unwrap();
assert_eq!(removed, 2, "5 - 3 = 2 removed by count limit");

let list = backend.list_reviews(10, 0).await.unwrap();
assert_eq!(list.len(), 3);
let ids: Vec<_> = list.iter().map(|s| s.id.as_str()).collect();
let want = ["r2", "r3", "r4"];
assert!(
want.iter().all(|w| ids.contains(w)),
"newest 3 (r2,r3,r4) kept; r0,r1 removed, got {:?}",
ids
);
}

/// time_to filter: event with created_at == time_to must be included (<=).
#[tokio::test]
async fn test_list_events_time_to_inclusive() {