Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 88 additions & 12 deletions typecheck_benchmark/daily_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import re
import signal
import shutil
import statistics
import subprocess
import sys
import tempfile
Expand All @@ -42,13 +43,26 @@
# ---------------------------------------------------------------------------


class RunStats(TypedDict):
    """Statistical summary of multiple benchmark runs.

    Produced by compute_run_stats; every value is rounded to two
    decimal places.
    """

    min: float  # smallest observed value across runs
    max: float  # largest observed value across runs
    mean: float  # arithmetic mean of all runs
    median: float  # median of all runs
    stddev: float  # sample standard deviation; 0.0 when only one run


class TimingMetrics(TypedDict, total=False):
    """Timing metrics for a single type checker run on a single package.

    total=False: only a subset of keys may be present — the *_stats
    fields are only populated when more than one run was requested.
    """

    ok: bool  # True when the checker completed successfully
    execution_time_s: float  # wall-clock seconds (mean across runs when runs > 1)
    peak_memory_mb: float  # peak memory in MB (mean across runs when runs > 1)
    error_message: str | None  # failure description; absent/None on success
    runs: int  # number of runs attempted for this checker
    execution_time_stats: RunStats  # per-run timing spread (multi-run only)
    peak_memory_stats: RunStats  # per-run memory spread (multi-run only)


class PackageResult(TypedDict, total=False):
Expand Down Expand Up @@ -77,14 +91,15 @@ class AggregateStats(TypedDict, total=False):
max_peak_memory_mb: float


class BenchmarkOutput(TypedDict):
class BenchmarkOutput(TypedDict, total=False):
    """Top-level JSON output.

    total=False so that newer keys (e.g. runs_per_package) remain
    optional — presumably so older result files still satisfy this
    type; TODO confirm against the consumers of these JSON files.
    """

    timestamp: str  # full timestamp of the benchmark run (UTC)
    date: str  # calendar date of the run
    type_checkers: list[str]  # names of the checkers that were run
    type_checker_versions: dict[str, str]  # checker name -> version string
    package_count: int  # number of packages benchmarked
    runs_per_package: int  # runs per checker per package
    aggregate: dict[str, AggregateStats]  # per-checker aggregate statistics
    results: list[PackageResult]  # per-package detailed results

Expand Down Expand Up @@ -695,6 +710,17 @@ def compute_percentile(values: Sequence[float | int], percentile: float) -> floa
return sorted_values[lower] + fraction * (sorted_values[upper] - sorted_values[lower])


def compute_run_stats(values: list[float]) -> RunStats:
"""Compute min/max/mean/median/stddev for a list of values."""
return {
"min": round(min(values), 2),
"max": round(max(values), 2),
"mean": round(statistics.mean(values), 2),
"median": round(statistics.median(values), 2),
"stddev": round(statistics.stdev(values), 2) if len(values) > 1 else 0.0,
}


def compute_aggregate_stats(
results: list[PackageResult],
type_checkers: list[str],
Expand Down Expand Up @@ -754,6 +780,7 @@ def run_benchmark(
output_dir: Path | None = None,
os_name: str | None = None,
install_envs_file: Path | None = None,
runs: int = 1,
) -> Path:
"""Run the full benchmark suite.

Expand All @@ -765,6 +792,7 @@ def run_benchmark(
output_dir: Where to write JSON results.
os_name: OS name for filename (ubuntu, macos, windows).
install_envs_file: Path to install_envs.json.
runs: Number of runs per checker per package.

Returns:
Path to the dated output JSON file.
Expand Down Expand Up @@ -797,6 +825,7 @@ def run_benchmark(
print(f"Packages: {len(packages)}")
print(f"Type checkers: {', '.join(type_checkers)}")
print(f"Timeout: {timeout}s per checker")
print(f"Runs per checker: {runs}")
print("=" * 70)

# Versions
Expand All @@ -808,15 +837,15 @@ def run_benchmark(
print()

# Run benchmarks
all_results = _run_all(packages, type_checkers, timeout)
all_results = _run_all(packages, type_checkers, timeout, runs)

# Aggregate
aggregate = compute_aggregate_stats(all_results, type_checkers)

# Save
output_file = _save_results(
all_results, aggregate, type_checkers, versions, len(packages),
output_dir, os_name,
output_dir, os_name, runs,
)

# Print summary
Expand All @@ -833,6 +862,7 @@ def _run_all(
packages: list[dict[str, Any]],
type_checkers: list[str],
timeout: int,
runs: int = 1,
) -> list[PackageResult]:
"""Run benchmarks for all packages."""
all_results: list[PackageResult] = []
Expand All @@ -846,7 +876,7 @@ def _run_all(

print(f"\n[{i}/{len(packages)}] {name}")

result = _benchmark_package(pkg, temp_path, type_checkers, timeout)
result = _benchmark_package(pkg, temp_path, type_checkers, timeout, runs)
all_results.append(result)

return all_results
Expand All @@ -857,6 +887,7 @@ def _benchmark_package(
temp_path: Path,
type_checkers: list[str],
timeout: int,
runs: int = 1,
) -> PackageResult:
"""Benchmark a single package: clone, install deps, run checkers."""
name = pkg["name"]
Expand Down Expand Up @@ -912,16 +943,54 @@ def _benchmark_package(
}
continue

print(f" Running {checker}...")
m = run_checker(checker, package_path, resolved_paths, timeout)
metrics[checker] = m
print(f" Running {checker}... ({runs} run{'s' if runs > 1 else ''})")
times: list[float] = []
memories: list[float] = []
failed_metric: TimingMetrics | None = None

for run_idx in range(runs):
if runs > 1:
print(f" Run {run_idx + 1}/{runs}...", end=" ")
m = run_checker(checker, package_path, resolved_paths, timeout)
if not m.get("ok"):
if runs > 1:
print(f"Failed: {m.get('error_message', 'Unknown')}")
failed_metric = m
break
times.append(m["execution_time_s"])
memories.append(m.get("peak_memory_mb", 0.0))
if runs > 1:
peak = m.get("peak_memory_mb", 0)
mem_str = f", {peak:.0f}MB" if peak > 0 else ""
print(f"{m['execution_time_s']:.1f}s{mem_str}")

if failed_metric is not None:
failed_metric["runs"] = len(times) + 1
metrics[checker] = failed_metric
if runs == 1:
print(f" Failed: {failed_metric.get('error_message', 'Unknown')}")
else:
result_metric: TimingMetrics = {
"ok": True,
"execution_time_s": round(statistics.mean(times), 2),
"peak_memory_mb": round(statistics.mean(memories), 2),
"runs": runs,
}
if runs > 1:
result_metric["execution_time_stats"] = compute_run_stats(times)
result_metric["peak_memory_stats"] = compute_run_stats(memories)

if m.get("ok"):
peak = m.get("peak_memory_mb", 0)
metrics[checker] = result_metric
peak = result_metric["peak_memory_mb"]
mem_str = f", {peak:.0f}MB" if peak > 0 else ""
print(f" {m['execution_time_s']:.1f}s{mem_str}")
else:
print(f" Failed: {m.get('error_message', 'Unknown')}")
if runs > 1:
time_stats = result_metric["execution_time_stats"]
print(
f" Mean: {result_metric['execution_time_s']:.1f}s{mem_str} "
f"(stddev: {time_stats['stddev']:.2f}s)"
)
else:
print(f" {result_metric['execution_time_s']:.1f}s{mem_str}")

# Cleanup cloned repo
shutil.rmtree(package_path, ignore_errors=True)
Expand All @@ -942,6 +1011,7 @@ def _save_results(
package_count: int,
output_dir: Path,
os_name: str | None = None,
runs: int = 1,
) -> Path:
"""Save benchmark results to JSON."""
timestamp = datetime.now(timezone.utc)
Expand All @@ -960,6 +1030,7 @@ def _save_results(
"type_checkers": type_checkers,
"type_checker_versions": {k: v for k, v in versions.items() if k in type_checkers},
"package_count": package_count,
"runs_per_package": runs,
"aggregate": aggregate,
"results": results,
}
Expand Down Expand Up @@ -1039,6 +1110,10 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
"--install-envs", type=Path, default=None,
help="Path to install_envs.json (default: typecheck_benchmark/install_envs.json)",
)
parser.add_argument(
"--runs", "-r", type=int, default=1,
help="Number of runs per checker per package (default: 1)",
)
return parser.parse_args(argv)


Expand All @@ -1053,6 +1128,7 @@ def main(argv: list[str] | None = None) -> int:
output_dir=args.output,
os_name=args.os_name,
install_envs_file=args.install_envs,
runs=args.runs,
)
return 0

Expand Down
Loading