Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 88 additions & 12 deletions typecheck_benchmark/daily_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import re
import signal
import shutil
import statistics
import subprocess
import sys
import tempfile
Expand All @@ -42,13 +43,26 @@
# ---------------------------------------------------------------------------


class RunStats(TypedDict):
    """Statistical summary of multiple benchmark runs.

    Produced by compute_run_stats; every value is rounded to two
    decimal places.
    """

    min: float  # smallest observed value across runs
    max: float  # largest observed value across runs
    mean: float  # arithmetic mean of all runs
    median: float  # median of all runs
    stddev: float  # sample standard deviation; 0.0 when only one run


class TimingMetrics(TypedDict, total=False):
    """Timing metrics for a single type checker run on a single package.

    total=False: only a subset of keys may be present — the *_stats
    fields are only populated when more than one run was requested.
    """

    ok: bool  # True when the checker completed successfully
    execution_time_s: float  # wall-clock seconds (mean across runs when runs > 1)
    peak_memory_mb: float  # peak memory in MB (mean across runs when runs > 1)
    error_message: str | None  # failure description; absent/None on success
    runs: int  # number of runs attempted for this checker
    execution_time_stats: RunStats  # per-run timing spread (multi-run only)
    peak_memory_stats: RunStats  # per-run memory spread (multi-run only)


class PackageResult(TypedDict, total=False):
Expand Down Expand Up @@ -77,14 +91,15 @@ class AggregateStats(TypedDict, total=False):
max_peak_memory_mb: float


class BenchmarkOutput(TypedDict):
class BenchmarkOutput(TypedDict, total=False):
    """Top-level JSON output.

    total=False so that newer keys (e.g. runs_per_package) remain
    optional — presumably so older result files still satisfy this
    type; TODO confirm against the consumers of these JSON files.
    """

    timestamp: str  # full timestamp of the benchmark run (UTC)
    date: str  # calendar date of the run
    type_checkers: list[str]  # names of the checkers that were run
    type_checker_versions: dict[str, str]  # checker name -> version string
    package_count: int  # number of packages benchmarked
    runs_per_package: int  # runs per checker per package
    aggregate: dict[str, AggregateStats]  # per-checker aggregate statistics
    results: list[PackageResult]  # per-package detailed results

Expand Down Expand Up @@ -695,6 +710,17 @@ def compute_percentile(values: Sequence[float | int], percentile: float) -> floa
return sorted_values[lower] + fraction * (sorted_values[upper] - sorted_values[lower])


def compute_run_stats(values: list[float]) -> RunStats:
"""Compute min/max/mean/median/stddev for a list of values."""
return {
"min": round(min(values), 2),
"max": round(max(values), 2),
"mean": round(statistics.mean(values), 2),
"median": round(statistics.median(values), 2),
"stddev": round(statistics.stdev(values), 2) if len(values) > 1 else 0.0,
}


def compute_aggregate_stats(
results: list[PackageResult],
type_checkers: list[str],
Expand Down Expand Up @@ -754,6 +780,7 @@ def run_benchmark(
output_dir: Path | None = None,
os_name: str | None = None,
install_envs_file: Path | None = None,
runs: int = 1,
) -> Path:
"""Run the full benchmark suite.

Expand All @@ -765,6 +792,7 @@ def run_benchmark(
output_dir: Where to write JSON results.
os_name: OS name for filename (ubuntu, macos, windows).
install_envs_file: Path to install_envs.json.
runs: Number of runs per checker per package.

Returns:
Path to the dated output JSON file.
Expand Down Expand Up @@ -797,6 +825,7 @@ def run_benchmark(
print(f"Packages: {len(packages)}")
print(f"Type checkers: {', '.join(type_checkers)}")
print(f"Timeout: {timeout}s per checker")
print(f"Runs per checker: {runs}")
print("=" * 70)

# Versions
Expand All @@ -808,15 +837,15 @@ def run_benchmark(
print()

# Run benchmarks
all_results = _run_all(packages, type_checkers, timeout)
all_results = _run_all(packages, type_checkers, timeout, runs)

# Aggregate
aggregate = compute_aggregate_stats(all_results, type_checkers)

# Save
output_file = _save_results(
all_results, aggregate, type_checkers, versions, len(packages),
output_dir, os_name,
output_dir, os_name, runs,
)

# Print summary
Expand All @@ -833,6 +862,7 @@ def _run_all(
packages: list[dict[str, Any]],
type_checkers: list[str],
timeout: int,
runs: int = 1,
) -> list[PackageResult]:
"""Run benchmarks for all packages."""
all_results: list[PackageResult] = []
Expand All @@ -846,7 +876,7 @@ def _run_all(

print(f"\n[{i}/{len(packages)}] {name}")

result = _benchmark_package(pkg, temp_path, type_checkers, timeout)
result = _benchmark_package(pkg, temp_path, type_checkers, timeout, runs)
all_results.append(result)

return all_results
Expand All @@ -857,6 +887,7 @@ def _benchmark_package(
temp_path: Path,
type_checkers: list[str],
timeout: int,
runs: int = 1,
) -> PackageResult:
"""Benchmark a single package: clone, install deps, run checkers."""
name = pkg["name"]
Expand Down Expand Up @@ -912,16 +943,54 @@ def _benchmark_package(
}
continue

print(f" Running {checker}...")
m = run_checker(checker, package_path, resolved_paths, timeout)
metrics[checker] = m
print(f" Running {checker}... ({runs} run{'s' if runs > 1 else ''})")
times: list[float] = []
memories: list[float] = []
failed_metric: TimingMetrics | None = None

for run_idx in range(runs):
if runs > 1:
print(f" Run {run_idx + 1}/{runs}...", end=" ")
m = run_checker(checker, package_path, resolved_paths, timeout)
if not m.get("ok"):
if runs > 1:
print(f"Failed: {m.get('error_message', 'Unknown')}")
failed_metric = m
break
times.append(m["execution_time_s"])
memories.append(m.get("peak_memory_mb", 0.0))
if runs > 1:
peak = m.get("peak_memory_mb", 0)
mem_str = f", {peak:.0f}MB" if peak > 0 else ""
print(f"{m['execution_time_s']:.1f}s{mem_str}")

if failed_metric is not None:
failed_metric["runs"] = len(times) + 1
metrics[checker] = failed_metric
if runs == 1:
print(f" Failed: {failed_metric.get('error_message', 'Unknown')}")
else:
result_metric: TimingMetrics = {
"ok": True,
"execution_time_s": round(statistics.mean(times), 2),
"peak_memory_mb": round(statistics.mean(memories), 2),
"runs": runs,
}
if runs > 1:
result_metric["execution_time_stats"] = compute_run_stats(times)
result_metric["peak_memory_stats"] = compute_run_stats(memories)

if m.get("ok"):
peak = m.get("peak_memory_mb", 0)
metrics[checker] = result_metric
peak = result_metric["peak_memory_mb"]
mem_str = f", {peak:.0f}MB" if peak > 0 else ""
print(f" {m['execution_time_s']:.1f}s{mem_str}")
else:
print(f" Failed: {m.get('error_message', 'Unknown')}")
if runs > 1:
time_stats = result_metric["execution_time_stats"]
print(
f" Mean: {result_metric['execution_time_s']:.1f}s{mem_str} "
f"(stddev: {time_stats['stddev']:.2f}s)"
)
else:
print(f" {result_metric['execution_time_s']:.1f}s{mem_str}")

# Cleanup cloned repo
shutil.rmtree(package_path, ignore_errors=True)
Expand All @@ -942,6 +1011,7 @@ def _save_results(
package_count: int,
output_dir: Path,
os_name: str | None = None,
runs: int = 1,
) -> Path:
"""Save benchmark results to JSON."""
timestamp = datetime.now(timezone.utc)
Expand All @@ -960,6 +1030,7 @@ def _save_results(
"type_checkers": type_checkers,
"type_checker_versions": {k: v for k, v in versions.items() if k in type_checkers},
"package_count": package_count,
"runs_per_package": runs,
"aggregate": aggregate,
"results": results,
}
Expand Down Expand Up @@ -1039,6 +1110,10 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
"--install-envs", type=Path, default=None,
help="Path to install_envs.json (default: typecheck_benchmark/install_envs.json)",
)
parser.add_argument(
"--runs", "-r", type=int, default=1,
help="Number of runs per checker per package (default: 1)",
)
return parser.parse_args(argv)


Expand All @@ -1053,6 +1128,7 @@ def main(argv: list[str] | None = None) -> int:
output_dir=args.output,
os_name=args.os_name,
install_envs_file=args.install_envs,
runs=args.runs,
)
return 0

Expand Down
Loading