Simplify worker argument handling and remove parallel processing

pablogsal · pablogsal · commit e454271a374c · 2025-06-25T16:40:21.000+01:00
- Change benchmark command to take commit_range as positional argument
- Make repo-path optional via a --repo-path/-r flag; when omitted, CPython is cloned to a temporary directory
- Remove parallel processing options (--max-workers, --batch-size) and the --local-checkout flag
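- Simplify command processing by removing the parallel and per-commit sequential execution paths; all commits now go through a single process_commits call (incremental mode, previously local checkout)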
- Clean up imports and remove multiprocessing dependencies
- Add trailing newline to runs.py and minor formatting cleanup
diff --git a/backend/app/routers/runs.py b/backend/app/routers/runs.py
@@ -41,4 +41,6 @@ async def get_runs(
             timestamp=run.timestamp,
         )
         for run in runs
-    ]
+    ]
+
+
diff --git a/worker/src/memory_tracker_worker/args.py b/worker/src/memory_tracker_worker/args.py
@@ -60,14 +60,13 @@ def parse_args():
         help='Run memory benchmarks on CPython commits'
     )
     benchmark_parser.add_argument(
-        'repo_path',
-        nargs='?',
-        type=Path,
-        help='Path to CPython repository (optional, will clone if not provided)'
+        'commit_range',
+        help='Git commit range to benchmark (e.g., HEAD~5..HEAD, HEAD^, commit_sha)'
     )
     benchmark_parser.add_argument(
-        'commit_range',
-        help='Git commit range to benchmark (e.g., HEAD~5..HEAD)'
+        '--repo-path', '-r',
+        type=Path,
+        help='Path to CPython repository. If not provided, will clone CPython to a temporary directory.'
     )
     benchmark_parser.add_argument(
         '--output-dir', '-o',
@@ -106,18 +105,6 @@ def parse_args():
         action='store_true',
         help='Force overwrite existing output directories for commits'
     )
-    benchmark_parser.add_argument(
-        '--max-workers', '-j',
-        type=int,
-        default=1,
-        help='Maximum number of parallel workers. Creates temporary repo copies for each worker to avoid conflicts. (default: 1 for sequential processing)'
-    )
-    benchmark_parser.add_argument(
-        '--batch-size', '-b',
-        type=int,
-        default=None,
-        help='Number of commits to process in each parallel batch. Useful for memory management with large commit ranges. (default: same as max-workers)'
-    )
     benchmark_parser.add_argument(
         '--auth-token',
         help='Authentication token for uploading results to server. Can also be set via MEMORY_TRACKER_TOKEN environment variable.'
@@ -127,11 +114,6 @@ def parse_args():
         default='http://localhost:8000',
         help='Base URL for the memory tracker API (default: http://localhost:8000)'
     )
-    benchmark_parser.add_argument(
-        '--local-checkout',
-        action='store_true',
-        help='Use local checkout for building. Runs git clean -fxd, configures once, and runs make for each commit. Incompatible with parallel processing (-j > 1).'
-    )
     benchmark_parser.set_defaults(func=benchmark_command)
     
     return parser.parse_args()
diff --git a/worker/src/memory_tracker_worker/benchmarks/__init__.py b/worker/src/memory_tracker_worker/benchmarks/__init__.py
@@ -199,6 +199,7 @@ def list_environments(server_url: str = "http://localhost:8000") -> list:
         raise ValueError(f"Failed to fetch environments: {e}")
 
 
+
 def validate_binary_and_environment(binary_id: str, environment_id: str, server_url: str = "http://localhost:8000") -> None:
     """Validate that binary and environment exist on the server before running benchmarks."""
     logger.info(f"Validating binary_id: {binary_id} and environment_id: {environment_id}")
diff --git a/worker/src/memory_tracker_worker/commands.py b/worker/src/memory_tracker_worker/commands.py
@@ -3,7 +3,6 @@
 import os
 import sys
 import tempfile
-from multiprocessing import cpu_count
 from pathlib import Path
 import git
 
@@ -14,7 +13,7 @@
     check_build_environment,
     get_commits_to_process
 )
-from .processing import process_commit, process_commits_in_parallel, process_commits_local_checkout
+from .processing import process_commits
 
 logger = logging.getLogger(__name__)
 
@@ -148,33 +147,16 @@ def benchmark_command(args):
         logger.error(f"Pre-flight validation failed: {e}")
         sys.exit(1)
     
+    # Use the provided commit range directly
+    commit_range = args.commit_range
+    
     # Get commits to process
     try:
-        commits = get_commits_to_process(repo, args.commit_range)
+        commits = get_commits_to_process(repo, commit_range)
     except ValueError as e:
         logger.error(f"Failed to get commits: {e}")
         sys.exit(1)
     
-    # Validate local-checkout compatibility
-    if hasattr(args, 'local_checkout') and args.local_checkout and args.max_workers > 1:
-        logger.error("--local-checkout is incompatible with parallel processing (-j > 1)")
-        sys.exit(1)
-    
-    # Validate and set defaults for parallel processing arguments
-    if args.max_workers < 1:
-        logger.error(f"Invalid max-workers value: {args.max_workers}. Must be >= 1")
-        sys.exit(1)
-    
-    if args.batch_size is None:
-        args.batch_size = args.max_workers
-    elif args.batch_size < 1:
-        logger.error(f"Invalid batch-size value: {args.batch_size}. Must be >= 1")
-        sys.exit(1)
-    
-    # Warn if using more workers than available CPUs
-    available_cpus = cpu_count()
-    if args.max_workers > available_cpus:
-        logger.warning(f"Using {args.max_workers} workers on a system with {available_cpus} CPUs. This may reduce performance.")
     
     # Get authentication token from CLI or environment variable
     auth_token = args.auth_token or os.getenv('MEMORY_TRACKER_TOKEN')
@@ -189,71 +171,30 @@ def benchmark_command(args):
     logger.info(f"Output directory: {args.output_dir}")
     logger.info(f"Configure flags: {args.configure_flags}")
     logger.info(f"Make flags: {args.make_flags}")
-    logger.info(f"Max workers: {args.max_workers}")
-    logger.info(f"Batch size: {args.batch_size}")
-    logger.info(f"Local checkout mode: {getattr(args, 'local_checkout', False)}")
     logger.info(f"Number of commits to process: {len(commits)}")
     if len(commits) > 0:
         logger.info("Commits to process:")
         for commit in commits:
             logger.info(f"  {commit.hexsha[:8]} - {commit.message.splitlines()[0]}")
     
-    # Process commits (parallel, sequential, or local checkout mode)
-    if args.max_workers > 1:
-        logger.info(f"Using parallel processing with {args.max_workers} workers and batch size {args.batch_size}")
-        results = process_commits_in_parallel(
-            commits,
-            repo_path,
-            args.output_dir,
-            args.configure_flags,
-            args.make_flags,
-            args.verbose,
-            args.binary_id,
-            args.environment_id,
-            args.force,
-            args.max_workers,
-            args.batch_size,
-            auth_token,
-            args.api_base
-        )
-        errors = [(commit, error) for commit, error in results if error is not None]
-    elif getattr(args, 'local_checkout', False):
-        logger.info("Using local checkout mode")
-        errors = []
-        error = process_commits_local_checkout(
-            commits,
-            repo_path,
-            args.output_dir,
-            args.configure_flags,
-            args.make_flags,
-            args.verbose,
-            args.binary_id,
-            args.environment_id,
-            args.force,
-            auth_token,
-            args.api_base
-        )
-        if error:
-            errors.append((None, error))
-    else:
-        logger.info("Using sequential processing")
-        errors = []
-        for commit in commits:
-            error = process_commit(
-                commit,
-                repo_path,
-                args.output_dir,
-                args.configure_flags,
-                args.make_flags,
-                args.verbose,
-                args.binary_id,
-                args.environment_id,
-                args.force,
-                auth_token,
-                args.api_base
-            )
-            if error:
-                errors.append((commit, error))
+    # Process commits using incremental mode (previously local checkout)
+    logger.info("Processing commits using incremental mode")
+    errors = []
+    error = process_commits(
+        commits,
+        repo_path,
+        args.output_dir,
+        args.configure_flags,
+        args.make_flags,
+        args.verbose,
+        args.binary_id,
+        args.environment_id,
+        args.force,
+        auth_token,
+        args.api_base
+    )
+    if error:
+        errors.append((None, error))
     
     # Print final status
     if errors:
diff --git a/worker/src/memory_tracker_worker/processing.py b/worker/src/memory_tracker_worker/processing.py
diff --git a/worker/src/memory_tracker_worker/validation.py b/worker/src/memory_tracker_worker/validation.py

Original file line number	Diff line number	Diff line change
`@@ -41,4 +41,6 @@ async def get_runs(`
`41`	`41`	`timestamp=run.timestamp,`
`42`	`42`	`)`
`43`	`43`	`for run in runs`
`44`		`- ]`
	`44`	`+ ]`
	`45`	`+`
	`46`	`+`