From 57cce29d655be265d87ec5c027e9f8f7fd7e3ff0 Mon Sep 17 00:00:00 2001 From: xyuzh Date: Sun, 22 Feb 2026 20:25:32 -0800 Subject: [PATCH 01/34] Add Miles Qwen3-8B GRPO training example on H100 Single-node RL training of Qwen3-8B with GRPO on 8x H100-80GB using Anyscale. Includes Dockerfile, job config, and entrypoint script that handles model download, weight conversion, and async GRPO training with Megatron backend (TP=2, DP=2) and 3 SGLang rollout engines. --- miles_qwen3_8b_h100/Dockerfile.anyscale | 118 +++++++++++++++++ miles_qwen3_8b_h100/README.md | 77 +++++++++++ miles_qwen3_8b_h100/entrypoint.sh | 168 ++++++++++++++++++++++++ miles_qwen3_8b_h100/job.yaml | 33 +++++ 4 files changed, 396 insertions(+) create mode 100644 miles_qwen3_8b_h100/Dockerfile.anyscale create mode 100644 miles_qwen3_8b_h100/README.md create mode 100755 miles_qwen3_8b_h100/entrypoint.sh create mode 100644 miles_qwen3_8b_h100/job.yaml diff --git a/miles_qwen3_8b_h100/Dockerfile.anyscale b/miles_qwen3_8b_h100/Dockerfile.anyscale new file mode 100644 index 0000000..72640ea --- /dev/null +++ b/miles_qwen3_8b_h100/Dockerfile.anyscale @@ -0,0 +1,118 @@ +FROM anyscale/ray:2.54.0-py312-cu129 + +ARG PATCH_VERSION=latest +ARG MEGATRON_COMMIT=3714d81d418c9f1bca4594fc35f9e8289f652862 +ARG SGLANG_COMMIT=24c91001cf99ba642be791e099d358f4dfe955f5 +ARG MILES_REF=main + +# Anyscale base image runs as non-root; switch to root for system installs. +USER root +WORKDIR /root + +RUN apt-get update && \ + apt-get install -y --no-install-recommends git rsync dnsutils nvtop && \ + rm -rf /var/lib/apt/lists/* + +# Keep pip tooling current and pin numpy to 1.x for Megatron compatibility. +RUN python -m pip install --upgrade pip setuptools wheel && \ + python -m pip install "numpy<2" huggingface_hub + +# Pin PyTorch 2.9.1 — matches sgl_kernel from PyPI (compiled for torch 2.9.x) +# and has a pre-built flash-attn 2.8.3 wheel available. 
+RUN python -m pip install torch==2.9.1 torchvision torchaudio \ + --index-url https://download.pytorch.org/whl/cu128 + +# Pre-built flash-attn wheel for torch 2.9 + cu12 (source compilation +# exceeds Anyscale's ~60 min build timeout). +RUN python -m pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3%2Bcu12torch2.9cxx11abiTRUE-cp312-cp312-linux_x86_64.whl + +# Apex: install Python-only (no CUDA extensions) to stay within Anyscale's +# ~60 min build timeout. Megatron falls back to PyTorch-native kernels. +RUN git clone --filter=blob:none https://github.com/NVIDIA/apex.git /tmp/apex && \ + cd /tmp/apex && \ + git checkout 10417aceddd7d5d05d7cbf7b0fc2daad1105f8b4 && \ + python -m pip install --disable-pip-version-check --no-cache-dir \ + --no-build-isolation . && \ + rm -rf /tmp/apex + +# Install SGLang from source. sgl_kernel comes from PyPI, pre-compiled +# for torch 2.9.x — no need to rebuild from source. +RUN git clone https://github.com/sgl-project/sglang.git /root/sglang && \ + cd /root/sglang && \ + git checkout ${SGLANG_COMMIT} && \ + python -m pip install -e "python[all]" + +# Install Megatron-LM from source. +RUN git clone --recursive https://github.com/NVIDIA/Megatron-LM.git /root/Megatron-LM && \ + cd /root/Megatron-LM && \ + git checkout ${MEGATRON_COMMIT} && \ + python -m pip install -e . + +# Pull Miles source for patches and dependency manifests. +RUN git clone https://github.com/radixark/miles.git /tmp/miles && \ + cd /tmp/miles && \ + git checkout ${MILES_REF} + +# Apply SGLang patch. +RUN cd /root/sglang && \ + cp /tmp/miles/docker/patch/${PATCH_VERSION}/sglang.patch ./sglang.patch && \ + git update-index --refresh && \ + git apply sglang.patch --3way && \ + if grep -R -n '^<<<<<<< ' .; then \ + echo "SGLang patch failed to apply cleanly. Please resolve conflicts." && \ + exit 1; \ + fi && \ + rm sglang.patch + +# Apply Megatron-LM patch. 
+RUN cd /root/Megatron-LM && \ + cp /tmp/miles/docker/patch/${PATCH_VERSION}/megatron.patch ./megatron.patch && \ + git update-index --refresh && \ + git apply megatron.patch --3way && \ + if grep -R -n '^<<<<<<< ' .; then \ + echo "Megatron patch failed to apply cleanly. Please resolve conflicts." && \ + exit 1; \ + fi && \ + rm megatron.patch + +# Install Miles dependencies. +RUN python -m pip install git+https://github.com/ISEEKYAN/mbridge.git@89eb10887887bc74853f89a4de258c0702932a1c --no-deps && \ + python -m pip install git+https://github.com/fzyzcjy/torch_memory_saver.git@dc6876905830430b5054325fa4211ff302169c6b --no-cache-dir --force-reinstall && \ + python -m pip install git+https://github.com/fzyzcjy/Megatron-Bridge.git@dev_rl --no-build-isolation && \ + python -m pip install "nvidia-modelopt[torch]>=0.37.0" --no-build-isolation + +# Make MXFP8 quantizer import conditional — mxfp8_group_quantize was added +# in a newer SGLang than our pinned commit. Not needed for Qwen3-8B training. +RUN python -c "\ +import pathlib; \ +p = pathlib.Path('/tmp/miles/miles/backends/megatron_utils/megatron_to_hf/processors/quantizer_mxfp8.py'); \ +t = p.read_text(); \ +t = t.replace( \ + 'from sglang.srt.layers.quantization.fp8_utils import mxfp8_group_quantize', \ + 'try:\\n from sglang.srt.layers.quantization.fp8_utils import mxfp8_group_quantize\\nexcept ImportError:\\n mxfp8_group_quantize = None' \ +); \ +p.write_text(t)" + +# Install Miles itself. +RUN python -m pip install -r /tmp/miles/requirements.txt && \ + python -m pip install -e /tmp/miles --no-deps && \ + cd /tmp/miles/miles/backends/megatron_utils/kernels/int4_qat && \ + python -m pip install . --no-build-isolation + +# Re-pin PyTorch 2.9.1 and reinstall flash-attn + TE at the end. +# Earlier installs may have upgraded torch, breaking pre-built binary wheels. 
+RUN python -c "import torch; print(f'Before re-pin: PyTorch {torch.__version__}')" +RUN python -m pip install torch==2.9.1 torchvision torchaudio \ + --index-url https://download.pytorch.org/whl/cu128 +RUN python -m pip install --force-reinstall --no-deps \ + https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3%2Bcu12torch2.9cxx11abiTRUE-cp312-cp312-linux_x86_64.whl +RUN python -m pip install --no-build-isolation "transformer_engine[pytorch]==2.10.0" + +# Verify torch + flash-attn ABI compatibility. +# sgl_kernel is skipped here — it requires libcuda.so.1 (GPU hardware) to import. +RUN python -c "\ +import torch; print(f'PyTorch: {torch.__version__}, CUDA: {torch.version.cuda}'); \ +assert torch.__version__.startswith('2.9'), f'Expected 2.9.x, got {torch.__version__}'; \ +from flash_attn import flash_attn_func; print('flash-attn OK')" + +WORKDIR /tmp/miles diff --git a/miles_qwen3_8b_h100/README.md b/miles_qwen3_8b_h100/README.md new file mode 100644 index 0000000..083b7b2 --- /dev/null +++ b/miles_qwen3_8b_h100/README.md @@ -0,0 +1,77 @@ +# Qwen3-8B GRPO Training on Anyscale (H100) + +Single-node RL training of Qwen3-8B with GRPO on **8x H100-80GB** using Anyscale, following the pattern from [anyscale/examples#43](https://github.com/anyscale/examples/pull/43). 
+ +## Cluster Layout + +``` +Head node (m5.2xlarge): driver only, no GPUs +Worker 0 (8x H100-80GB): + GPU 0-3: Training (TP=2, DP=2) + GPU 4-7: Rollout (3 SGLang engines + 1 driver) +``` + +- **Training**: 4 GPUs — TP=2 x DP=2 (Megatron backend) +- **Rollout**: 3 GPUs — disaggregated SGLang inference, 1 GPU per engine (1 GPU reserved for driver) +- **Algorithm**: GRPO with DAPO-style asymmetric clipping +- **Dataset**: DAPO-Math-17k (integer math, deterministic reward) + +## Files + +| File | Description | +|------|-------------| +| `job.yaml` | Anyscale job config (`m5.2xlarge` head + 1x `p5.48xlarge` worker) | +| `Dockerfile.anyscale` | Docker image with Miles, Megatron-LM, SGLang, flash-attn, TE | +| `entrypoint.sh` | Downloads model/data, converts weights, runs async GRPO training | + +## Quick Start + +```bash +pip install -U anyscale +anyscale login + +cd examples/anyscale_qwen3_8b_h100 +anyscale job submit -f job.yaml +``` + +The entrypoint automatically: +1. Downloads `Qwen/Qwen3-8B` and `zhuzilin/dapo-math-17k` to `/mnt/cluster_storage` +2. Converts HF weights to Megatron torch_dist format (on GPU worker) +3. 
Runs async GRPO training with `dapo` reward model via `train_async.py` + +## Key Differences from the Slime A10G Example (PR #43) + +| | Slime A10G (PR #43) | This Example | +|---|---|---| +| GPUs | 2x4 A10G (24GB) | 1x8 H100 (80GB) | +| Model | Qwen3-1.7B | Qwen3-8B | +| Training | `train.py` (sync) | `train_async.py` (pipelined async) | +| Parallelism | TP=2, PP=2 across nodes | TP=2, DP=2, single node | +| A10G patches | sgl_kernel, Triton, multi_platform | Not needed (H100 = SM90) | +| Batch size | 64 (16 prompts x 4 samples) | 256 (32 prompts x 8 samples) | +| Max tokens/GPU | 4096 | 9216 | +| Attention | FA2 only (Ampere) | FA2 (FA3 available with custom image) | + +## Verification + +A successful run shows: +- SGLang engine startup on rollout GPUs +- Weight conversion completes (first run only) +- Training loss values printed each step +- Reward gradually increasing over rollouts +- Weight sync between training and rollout engines + +## If You Hit OOM + +**Training GPUs:** +1. `--max-tokens-per-gpu` -> `4096` +2. `--rollout-max-response-len` -> `4096` +3. `--n-samples-per-prompt` -> `4` and `--global-batch-size` -> `128` + +**Rollout GPUs:** +1. `--sglang-mem-fraction-static` -> `0.5` +2. Add `--sglang-chunked-prefill-size 4096` + +## View the Job + +View the job in the [jobs tab](https://console.anyscale.com/jobs) of the Anyscale console. diff --git a/miles_qwen3_8b_h100/entrypoint.sh b/miles_qwen3_8b_h100/entrypoint.sh new file mode 100755 index 0000000..505ee89 --- /dev/null +++ b/miles_qwen3_8b_h100/entrypoint.sh @@ -0,0 +1,168 @@ +#!/bin/bash +# Anyscale entrypoint: Qwen3-8B GRPO training on 1 worker × 8x H100-80GB +# Downloads model/dataset, converts weights, and runs async RL training. 
+# +# Head node (m5.2xlarge): driver only, no GPUs +# Layout (GPU worker): +# Worker 0 (8x H100): +# GPU 0-3: Training (TP=2, DP=2) +# GPU 4-7: Rollout (4 SGLang engines, 1 GPU each) + +set -ex + +export PYTHONBUFFERED=16 +STORAGE=/mnt/cluster_storage + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" + +# Qwen3-8B model architecture args (from scripts/models/qwen3-8B.sh) +MODEL_ARGS=( + --swiglu + --num-layers 36 + --hidden-size 4096 + --ffn-hidden-size 12288 + --num-attention-heads 32 + --group-query-attention + --num-query-groups 8 + --use-rotary-position-embeddings + --disable-bias-linear + --normalization "RMSNorm" + --norm-epsilon 1e-6 + --rotary-base 1000000 + --vocab-size 151936 + --kv-channels 128 + --qk-layernorm + --untie-embeddings-and-output-weights +) + +# ======================== Step 1: Download model & dataset ======================== + +echo "=== Downloading model ===" +huggingface-cli download Qwen/Qwen3-8B --local-dir ${STORAGE}/Qwen3-8B + +echo "=== Downloading dataset ===" +huggingface-cli download --repo-type dataset zhuzilin/dapo-math-17k --local-dir ${STORAGE}/dapo-math-17k + +# ======================== Step 2: Convert HF weights to torch_dist ======================== + +if [ ! 
-d "${STORAGE}/Qwen3-8B_torch_dist/iter_0000000" ]; then + echo "=== Converting weights (HF -> torch_dist) on GPU worker ===" + CONVERT_ENV_JSON='{ + "env_vars": { + "PYTHONPATH": "/root/Megatron-LM/" + } + }' + ray job submit --address="http://127.0.0.1:8265" \ + --runtime-env-json="${CONVERT_ENV_JSON}" \ + --entrypoint-num-gpus 1 \ + -- python3 /tmp/miles/tools/convert_hf_to_torch_dist.py \ + ${MODEL_ARGS[@]} \ + --no-gradient-accumulation-fusion \ + --hf-checkpoint ${STORAGE}/Qwen3-8B \ + --save ${STORAGE}/Qwen3-8B_torch_dist +else + echo "=== Converted weights already exist, skipping ===" +fi + +# ======================== Step 3: Run training ======================== + +CKPT_ARGS=( + --hf-checkpoint ${STORAGE}/Qwen3-8B + --ref-load ${STORAGE}/Qwen3-8B_torch_dist + --load ${STORAGE}/Qwen3-8B_torch_dist + --save ${STORAGE}/Qwen3-8B_miles/ + --save-interval 20 +) + +ROLLOUT_ARGS=( + --prompt-data ${STORAGE}/dapo-math-17k/dapo-math-17k.jsonl + --input-key prompt + --label-key label + --apply-chat-template + --rollout-shuffle + --balance-data + --rm-type dapo + --reward-key score + --num-rollout 3000 + --rollout-batch-size 32 + --n-samples-per-prompt 8 + --rollout-max-response-len 8192 + --rollout-temperature 1 + --global-batch-size 256 +) + +PERF_ARGS=( + --tensor-model-parallel-size 2 + --sequence-parallel + --pipeline-model-parallel-size 1 + --context-parallel-size 1 + --expert-model-parallel-size 1 + --expert-tensor-parallel-size 1 + + --recompute-granularity full + --recompute-method uniform + --recompute-num-layers 1 + + --use-dynamic-batch-size + --max-tokens-per-gpu 9216 +) + +GRPO_ARGS=( + --advantage-estimator grpo + --use-kl-loss + --kl-loss-coef 0.00 + --kl-loss-type low_var_kl + --entropy-coef 0.00 + --eps-clip 0.2 + --eps-clip-high 0.28 +) + +OPTIMIZER_ARGS=( + --optimizer adam + --lr 1e-6 + --lr-decay-style constant + --weight-decay 0.1 + --adam-beta1 0.9 + --adam-beta2 0.98 +) + +SGLANG_ARGS=( + --rollout-num-gpus-per-engine 1 + 
--sglang-mem-fraction-static 0.7 +) + +MISC_ARGS=( + --no-gradient-accumulation-fusion + --attention-dropout 0.0 + --hidden-dropout 0.0 + --accumulate-allreduce-grads-in-fp32 + --attention-softmax-in-fp32 + --attention-backend flash + --use-tensorboard + --tensorboard-dir ${STORAGE}/tensorboard_logs +) + +RUNTIME_ENV_JSON='{ + "env_vars": { + "PYTHONPATH": "/root/Megatron-LM/", + "CUDA_DEVICE_MAX_CONNECTIONS": "1", + "TENSORBOARD_DIR": "/mnt/cluster_storage/tensorboard_logs" + } +}' + +echo "=== Submitting training job ===" +ray job submit --address="http://127.0.0.1:8265" \ + --runtime-env-json="${RUNTIME_ENV_JSON}" \ + --entrypoint-num-gpus 1 \ + -- python3 /tmp/miles/train_async.py \ + --actor-num-nodes 1 \ + --actor-num-gpus-per-node 4 \ + --rollout-num-gpus 3 \ + ${MODEL_ARGS[@]} \ + ${CKPT_ARGS[@]} \ + ${ROLLOUT_ARGS[@]} \ + ${OPTIMIZER_ARGS[@]} \ + ${GRPO_ARGS[@]} \ + ${PERF_ARGS[@]} \ + ${SGLANG_ARGS[@]} \ + ${MISC_ARGS[@]} diff --git a/miles_qwen3_8b_h100/job.yaml b/miles_qwen3_8b_h100/job.yaml new file mode 100644 index 0000000..10682e1 --- /dev/null +++ b/miles_qwen3_8b_h100/job.yaml @@ -0,0 +1,33 @@ +# Anyscale job config: Miles Qwen3-8B GRPO training on H100 +# Single node × 8x H100-80GB +# +# Layout: +# Head node (m5.2xlarge): driver only, no GPUs +# Worker 0 (8x H100): [GPU 0-3: Training TP=2 DP=2] [GPU 4-7: Rollout (4 engines)] +# +# Submit with: +# cd examples/anyscale_qwen3_8b_h100 +# anyscale job submit -f job.yaml +cloud: anyscale-v2-cloud-us-east-1 + +name: miles-qwen3-8b-grpo-h100 + +containerfile: ./Dockerfile.anyscale + +compute_config: + head_node: + instance_type: m5.2xlarge # CPU-only, runs driver script + worker_nodes: + - instance_type: p5.48xlarge # 8x H100-80GB, 192 vCPU, 2048 GB RAM + min_nodes: 1 + max_nodes: 1 + advanced_instance_config: + CapacityReservationSpecification: + CapacityReservationTarget: + CapacityReservationId: cr-0dfe1157d299ae5fc + +working_dir: . 
+ +entrypoint: bash entrypoint.sh + +max_retries: 0 From 2c18184e689f3297a4779b0fb2971ec3764b2cc4 Mon Sep 17 00:00:00 2001 From: Xinyu Zhang <60529799+xyuzh@users.noreply.github.com> Date: Thu, 26 Feb 2026 10:57:11 -0800 Subject: [PATCH 02/34] Change num-rollout from 3000 to 5 --- miles_qwen3_8b_h100/entrypoint.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/miles_qwen3_8b_h100/entrypoint.sh b/miles_qwen3_8b_h100/entrypoint.sh index 505ee89..1be7221 100755 --- a/miles_qwen3_8b_h100/entrypoint.sh +++ b/miles_qwen3_8b_h100/entrypoint.sh @@ -83,7 +83,7 @@ ROLLOUT_ARGS=( --balance-data --rm-type dapo --reward-key score - --num-rollout 3000 + --num-rollout 5 --rollout-batch-size 32 --n-samples-per-prompt 8 --rollout-max-response-len 8192 From 6ca34bcf972503a6e8a4c847643b1deb8aca6446 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 27 Feb 2026 13:30:57 -0800 Subject: [PATCH 03/34] Polish Qwen3-8B GRPO example - Remove ray job submit, call python directly - Move env vars to appropriate locations (PYTHONPATH in Dockerfile, CUDA_DEVICE_MAX_CONNECTIONS in job.yaml) - Simplify entrypoint.sh (remove unused vars, fix paths) - Add timeout_s to job.yaml - Restructure README to match other examples pattern - Rename Dockerfile.anyscale -> Dockerfile - Change python3 -> python throughout Signed-off-by: Robert Nishihara --- .../{Dockerfile.anyscale => Dockerfile} | 25 +++--- miles_qwen3_8b_h100/README.md | 84 +++++-------------- miles_qwen3_8b_h100/entrypoint.sh | 39 ++------- miles_qwen3_8b_h100/job.yaml | 17 ++-- 4 files changed, 53 insertions(+), 112 deletions(-) rename miles_qwen3_8b_h100/{Dockerfile.anyscale => Dockerfile} (90%) diff --git a/miles_qwen3_8b_h100/Dockerfile.anyscale b/miles_qwen3_8b_h100/Dockerfile similarity index 90% rename from miles_qwen3_8b_h100/Dockerfile.anyscale rename to miles_qwen3_8b_h100/Dockerfile index 72640ea..265e250 100644 --- a/miles_qwen3_8b_h100/Dockerfile.anyscale +++ b/miles_qwen3_8b_h100/Dockerfile @@ -5,13 
+5,12 @@ ARG MEGATRON_COMMIT=3714d81d418c9f1bca4594fc35f9e8289f652862 ARG SGLANG_COMMIT=24c91001cf99ba642be791e099d358f4dfe955f5 ARG MILES_REF=main -# Anyscale base image runs as non-root; switch to root for system installs. -USER root -WORKDIR /root +# Anyscale base image runs as non-root; use sudo for system installs. +WORKDIR /home/ray -RUN apt-get update && \ - apt-get install -y --no-install-recommends git rsync dnsutils nvtop && \ - rm -rf /var/lib/apt/lists/* +RUN sudo apt-get update && \ + sudo apt-get install -y --no-install-recommends git rsync dnsutils nvtop && \ + sudo rm -rf /var/lib/apt/lists/* # Keep pip tooling current and pin numpy to 1.x for Megatron compatibility. RUN python -m pip install --upgrade pip setuptools wheel && \ @@ -37,14 +36,14 @@ RUN git clone --filter=blob:none https://github.com/NVIDIA/apex.git /tmp/apex && # Install SGLang from source. sgl_kernel comes from PyPI, pre-compiled # for torch 2.9.x — no need to rebuild from source. -RUN git clone https://github.com/sgl-project/sglang.git /root/sglang && \ - cd /root/sglang && \ +RUN git clone https://github.com/sgl-project/sglang.git /home/ray/sglang && \ + cd /home/ray/sglang && \ git checkout ${SGLANG_COMMIT} && \ python -m pip install -e "python[all]" # Install Megatron-LM from source. -RUN git clone --recursive https://github.com/NVIDIA/Megatron-LM.git /root/Megatron-LM && \ - cd /root/Megatron-LM && \ +RUN git clone --recursive https://github.com/NVIDIA/Megatron-LM.git /home/ray/Megatron-LM && \ + cd /home/ray/Megatron-LM && \ git checkout ${MEGATRON_COMMIT} && \ python -m pip install -e . @@ -54,7 +53,7 @@ RUN git clone https://github.com/radixark/miles.git /tmp/miles && \ git checkout ${MILES_REF} # Apply SGLang patch. 
-RUN cd /root/sglang && \ +RUN cd /home/ray/sglang && \ cp /tmp/miles/docker/patch/${PATCH_VERSION}/sglang.patch ./sglang.patch && \ git update-index --refresh && \ git apply sglang.patch --3way && \ @@ -65,7 +64,7 @@ RUN cd /root/sglang && \ rm sglang.patch # Apply Megatron-LM patch. -RUN cd /root/Megatron-LM && \ +RUN cd /home/ray/Megatron-LM && \ cp /tmp/miles/docker/patch/${PATCH_VERSION}/megatron.patch ./megatron.patch && \ git update-index --refresh && \ git apply megatron.patch --3way && \ @@ -115,4 +114,6 @@ import torch; print(f'PyTorch: {torch.__version__}, CUDA: {torch.version.cuda}') assert torch.__version__.startswith('2.9'), f'Expected 2.9.x, got {torch.__version__}'; \ from flash_attn import flash_attn_func; print('flash-attn OK')" +ENV PYTHONPATH=/home/ray/Megatron-LM:$PYTHONPATH + WORKDIR /tmp/miles diff --git a/miles_qwen3_8b_h100/README.md b/miles_qwen3_8b_h100/README.md index 083b7b2..fcd12b4 100644 --- a/miles_qwen3_8b_h100/README.md +++ b/miles_qwen3_8b_h100/README.md @@ -1,77 +1,39 @@ -# Qwen3-8B GRPO Training on Anyscale (H100) +# GRPO Training for Qwen3-8B with MILES -Single-node RL training of Qwen3-8B with GRPO on **8x H100-80GB** using Anyscale, following the pattern from [anyscale/examples#43](https://github.com/anyscale/examples/pull/43). +This example demonstrates reinforcement learning fine-tuning of Qwen3-8B using **Group Relative Policy Optimization (GRPO)** on the DAPO-Math-17k dataset. It uses the [MILES](https://github.com/radixark/miles) framework for distributed RL training with disaggregated rollouts on Anyscale. 
-## Cluster Layout +The training runs on a single node with **8x H100-80GB GPUs**, using: +- **4 GPUs for training** (TP=2, DP=2 with Megatron-LM) +- **4 GPUs for rollout inference** (disaggregated SGLang engines) -``` -Head node (m5.2xlarge): driver only, no GPUs -Worker 0 (8x H100-80GB): - GPU 0-3: Training (TP=2, DP=2) - GPU 4-7: Rollout (3 SGLang engines + 1 driver) -``` - -- **Training**: 4 GPUs — TP=2 x DP=2 (Megatron backend) -- **Rollout**: 3 GPUs — disaggregated SGLang inference, 1 GPU per engine (1 GPU reserved for driver) -- **Algorithm**: GRPO with DAPO-style asymmetric clipping -- **Dataset**: DAPO-Math-17k (integer math, deterministic reward) - -## Files - -| File | Description | -|------|-------------| -| `job.yaml` | Anyscale job config (`m5.2xlarge` head + 1x `p5.48xlarge` worker) | -| `Dockerfile.anyscale` | Docker image with Miles, Megatron-LM, SGLang, flash-attn, TE | -| `entrypoint.sh` | Downloads model/data, converts weights, runs async GRPO training | - -## Quick Start +## Install the Anyscale CLI ```bash pip install -U anyscale anyscale login - -cd examples/anyscale_qwen3_8b_h100 -anyscale job submit -f job.yaml ``` -The entrypoint automatically: -1. Downloads `Qwen/Qwen3-8B` and `zhuzilin/dapo-math-17k` to `/mnt/cluster_storage` -2. Converts HF weights to Megatron torch_dist format (on GPU worker) -3. Runs async GRPO training with `dapo` reward model via `train_async.py` +## Submit the job -## Key Differences from the Slime A10G Example (PR #43) +Clone the example from GitHub. 
-| | Slime A10G (PR #43) | This Example | -|---|---|---| -| GPUs | 2x4 A10G (24GB) | 1x8 H100 (80GB) | -| Model | Qwen3-1.7B | Qwen3-8B | -| Training | `train.py` (sync) | `train_async.py` (pipelined async) | -| Parallelism | TP=2, PP=2 across nodes | TP=2, DP=2, single node | -| A10G patches | sgl_kernel, Triton, multi_platform | Not needed (H100 = SM90) | -| Batch size | 64 (16 prompts x 4 samples) | 256 (32 prompts x 8 samples) | -| Max tokens/GPU | 4096 | 9216 | -| Attention | FA2 only (Ampere) | FA2 (FA3 available with custom image) | - -## Verification - -A successful run shows: -- SGLang engine startup on rollout GPUs -- Weight conversion completes (first run only) -- Training loss values printed each step -- Reward gradually increasing over rollouts -- Weight sync between training and rollout engines +```bash +git clone https://github.com/anyscale/examples.git +cd examples/miles_qwen3_8b_h100 +``` -## If You Hit OOM +Submit the job. -**Training GPUs:** -1. `--max-tokens-per-gpu` -> `4096` -2. `--rollout-max-response-len` -> `4096` -3. `--n-samples-per-prompt` -> `4` and `--global-batch-size` -> `128` +```bash +anyscale job submit -f job.yaml +``` -**Rollout GPUs:** -1. `--sglang-mem-fraction-static` -> `0.5` -2. Add `--sglang-chunked-prefill-size 4096` +The entrypoint will automatically download the model and dataset, convert weights to Megatron format, and start training. Training progress can be monitored via TensorBoard logs in `/mnt/cluster_storage/tensorboard_logs`. -## View the Job +## Understanding the example -View the job in the [jobs tab](https://console.anyscale.com/jobs) of the Anyscale console. +- **Algorithm**: This example uses GRPO with DAPO-style asymmetric clipping (ε_low=0.2, ε_high=0.28), which is particularly effective for math reasoning tasks. +- **Dataset**: [DAPO-Math-17k](https://huggingface.co/datasets/zhuzilin/dapo-math-17k) contains 17k integer math problems with deterministic reward signals based on answer correctness. 
+- **Disaggregated architecture**: Training and rollout happen on separate GPUs for maximum throughput. The 4 SGLang rollout engines run inference in parallel while the training GPUs perform gradient updates. +- **Weight conversion**: On the first run, HuggingFace weights are converted to Megatron-LM's `torch_dist` format. Converted weights are cached in `/mnt/cluster_storage/Qwen3-8B_torch_dist` for subsequent runs. +- **Async training**: The pipeline uses `train_async.py` which overlaps rollout generation and policy updates for better GPU utilization. diff --git a/miles_qwen3_8b_h100/entrypoint.sh b/miles_qwen3_8b_h100/entrypoint.sh index 1be7221..79f1a06 100755 --- a/miles_qwen3_8b_h100/entrypoint.sh +++ b/miles_qwen3_8b_h100/entrypoint.sh @@ -13,8 +13,6 @@ set -ex export PYTHONBUFFERED=16 STORAGE=/mnt/cluster_storage -SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" - # Qwen3-8B model architecture args (from scripts/models/qwen3-8B.sh) MODEL_ARGS=( --swiglu @@ -46,20 +44,12 @@ huggingface-cli download --repo-type dataset zhuzilin/dapo-math-17k --local-dir # ======================== Step 2: Convert HF weights to torch_dist ======================== if [ ! 
-d "${STORAGE}/Qwen3-8B_torch_dist/iter_0000000" ]; then - echo "=== Converting weights (HF -> torch_dist) on GPU worker ===" - CONVERT_ENV_JSON='{ - "env_vars": { - "PYTHONPATH": "/root/Megatron-LM/" - } - }' - ray job submit --address="http://127.0.0.1:8265" \ - --runtime-env-json="${CONVERT_ENV_JSON}" \ - --entrypoint-num-gpus 1 \ - -- python3 /tmp/miles/tools/convert_hf_to_torch_dist.py \ - ${MODEL_ARGS[@]} \ - --no-gradient-accumulation-fusion \ - --hf-checkpoint ${STORAGE}/Qwen3-8B \ - --save ${STORAGE}/Qwen3-8B_torch_dist + echo "=== Converting weights (HF -> torch_dist) ===" + python /tmp/miles/tools/convert_hf_to_torch_dist.py \ + ${MODEL_ARGS[@]} \ + --no-gradient-accumulation-fusion \ + --hf-checkpoint ${STORAGE}/Qwen3-8B \ + --save ${STORAGE}/Qwen3-8B_torch_dist else echo "=== Converted weights already exist, skipping ===" fi @@ -142,22 +132,11 @@ MISC_ARGS=( --tensorboard-dir ${STORAGE}/tensorboard_logs ) -RUNTIME_ENV_JSON='{ - "env_vars": { - "PYTHONPATH": "/root/Megatron-LM/", - "CUDA_DEVICE_MAX_CONNECTIONS": "1", - "TENSORBOARD_DIR": "/mnt/cluster_storage/tensorboard_logs" - } -}' - -echo "=== Submitting training job ===" -ray job submit --address="http://127.0.0.1:8265" \ - --runtime-env-json="${RUNTIME_ENV_JSON}" \ - --entrypoint-num-gpus 1 \ - -- python3 /tmp/miles/train_async.py \ +echo "=== Starting training ===" +python /tmp/miles/train_async.py \ --actor-num-nodes 1 \ --actor-num-gpus-per-node 4 \ - --rollout-num-gpus 3 \ + --rollout-num-gpus 4 \ ${MODEL_ARGS[@]} \ ${CKPT_ARGS[@]} \ ${ROLLOUT_ARGS[@]} \ diff --git a/miles_qwen3_8b_h100/job.yaml b/miles_qwen3_8b_h100/job.yaml index 10682e1..7c65bac 100644 --- a/miles_qwen3_8b_h100/job.yaml +++ b/miles_qwen3_8b_h100/job.yaml @@ -6,28 +6,27 @@ # Worker 0 (8x H100): [GPU 0-3: Training TP=2 DP=2] [GPU 4-7: Rollout (4 engines)] # # Submit with: -# cd examples/anyscale_qwen3_8b_h100 +# cd miles_qwen3_8b_h100 # anyscale job submit -f job.yaml -cloud: anyscale-v2-cloud-us-east-1 name: 
miles-qwen3-8b-grpo-h100 -containerfile: ./Dockerfile.anyscale +containerfile: ./Dockerfile compute_config: head_node: - instance_type: m5.2xlarge # CPU-only, runs driver script + instance_type: m5.2xlarge worker_nodes: - - instance_type: p5.48xlarge # 8x H100-80GB, 192 vCPU, 2048 GB RAM + - instance_type: p5.48xlarge # 8x H100-80GB min_nodes: 1 max_nodes: 1 - advanced_instance_config: - CapacityReservationSpecification: - CapacityReservationTarget: - CapacityReservationId: cr-0dfe1157d299ae5fc working_dir: . entrypoint: bash entrypoint.sh +env_vars: + CUDA_DEVICE_MAX_CONNECTIONS: "1" + max_retries: 0 +timeout_s: 7200 From e7aa67c39a51ed3703a1e3c737e3a90c3fd2687c Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 27 Feb 2026 13:42:58 -0800 Subject: [PATCH 04/34] Use declarative compute config instead of hardcoded instance types - Replace instance_type with required_resources and required_labels - Specify H100 accelerator type using ray.io/accelerator-type label - Define resource requirements: 8 CPUs/32Gi for head, 96 CPUs/512Gi/8 GPUs for workers - Allows Anyscale to select optimal H100 instance type (e.g., p5.48xlarge) Signed-off-by: Robert Nishihara --- miles_qwen3_8b_h100/job.yaml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/miles_qwen3_8b_h100/job.yaml b/miles_qwen3_8b_h100/job.yaml index 7c65bac..6f54dc6 100644 --- a/miles_qwen3_8b_h100/job.yaml +++ b/miles_qwen3_8b_h100/job.yaml @@ -15,9 +15,17 @@ containerfile: ./Dockerfile compute_config: head_node: - instance_type: m5.2xlarge + required_resources: + CPU: 8 + memory: 32Gi worker_nodes: - - instance_type: p5.48xlarge # 8x H100-80GB + - name: h100-workers + required_resources: + CPU: 96 + memory: 512Gi + GPU: 8 + required_labels: + ray.io/accelerator-type: H100 min_nodes: 1 max_nodes: 1 From 4c572960cdc0d314833d0f38cac5c23b5575d058 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 27 Feb 2026 16:15:20 -0800 Subject: [PATCH 05/34] Fix declarative compute 
config resource requirements - Update worker resources to match p5.48xlarge specs: 192 vCPUs, 2048Gi memory - Keeps 8 H100 GPUs with H100 accelerator type label Signed-off-by: Robert Nishihara --- miles_qwen3_8b_h100/job.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/miles_qwen3_8b_h100/job.yaml b/miles_qwen3_8b_h100/job.yaml index 6f54dc6..53ec596 100644 --- a/miles_qwen3_8b_h100/job.yaml +++ b/miles_qwen3_8b_h100/job.yaml @@ -21,8 +21,8 @@ compute_config: worker_nodes: - name: h100-workers required_resources: - CPU: 96 - memory: 512Gi + CPU: 192 + memory: 2048Gi GPU: 8 required_labels: ray.io/accelerator-type: H100 From 7e690cdf917c0f26454b81983b48053d907e2e73 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 27 Feb 2026 16:24:36 -0800 Subject: [PATCH 06/34] Use Ray remote to run weight conversion on GPU worker - Add convert_weights_remote.py wrapper with @ray.remote(num_gpus=1) - Ensures weight conversion runs on GPU worker instead of head node - Fixes 'No NVIDIA driver' error when running conversion Signed-off-by: Robert Nishihara --- miles_qwen3_8b_h100/convert_weights_remote.py | 30 +++++++++++++++++++ miles_qwen3_8b_h100/entrypoint.sh | 4 +-- 2 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 miles_qwen3_8b_h100/convert_weights_remote.py diff --git a/miles_qwen3_8b_h100/convert_weights_remote.py b/miles_qwen3_8b_h100/convert_weights_remote.py new file mode 100644 index 0000000..f7aaa4d --- /dev/null +++ b/miles_qwen3_8b_h100/convert_weights_remote.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +"""Ray remote wrapper for weight conversion - ensures it runs on a GPU worker.""" +import sys +import subprocess +import ray + +@ray.remote(num_gpus=1) +def convert_weights(cmd_args): + """Run weight conversion on a GPU worker.""" + result = subprocess.run( + ["python", "/tmp/miles/tools/convert_hf_to_torch_dist.py"] + cmd_args, + capture_output=True, + text=True + ) + return result.returncode, result.stdout, 
result.stderr + +if __name__ == "__main__": + # Pass through all command-line arguments + cmd_args = sys.argv[1:] + + # Run conversion on GPU worker + returncode, stdout, stderr = ray.get(convert_weights.remote(cmd_args)) + + # Print output + if stdout: + print(stdout, end="") + if stderr: + print(stderr, end="", file=sys.stderr) + + sys.exit(returncode) diff --git a/miles_qwen3_8b_h100/entrypoint.sh b/miles_qwen3_8b_h100/entrypoint.sh index 79f1a06..04345e8 100755 --- a/miles_qwen3_8b_h100/entrypoint.sh +++ b/miles_qwen3_8b_h100/entrypoint.sh @@ -44,8 +44,8 @@ huggingface-cli download --repo-type dataset zhuzilin/dapo-math-17k --local-dir # ======================== Step 2: Convert HF weights to torch_dist ======================== if [ ! -d "${STORAGE}/Qwen3-8B_torch_dist/iter_0000000" ]; then - echo "=== Converting weights (HF -> torch_dist) ===" - python /tmp/miles/tools/convert_hf_to_torch_dist.py \ + echo "=== Converting weights (HF -> torch_dist) on GPU worker ===" + python convert_weights_remote.py \ ${MODEL_ARGS[@]} \ --no-gradient-accumulation-fusion \ --hf-checkpoint ${STORAGE}/Qwen3-8B \ From eba802aca5c5fab30e1dd3de0b051ba13222d9a6 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 27 Feb 2026 18:23:18 -0800 Subject: [PATCH 07/34] Add Ray remote wrapper for training script - Create train_remote.py with @ray.remote(num_gpus=4) - Ensures training runs on GPU workers instead of head node - Both weight conversion and training now use Ray remote Signed-off-by: Robert Nishihara --- miles_qwen3_8b_h100/entrypoint.sh | 2 +- miles_qwen3_8b_h100/train_remote.py | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 miles_qwen3_8b_h100/train_remote.py diff --git a/miles_qwen3_8b_h100/entrypoint.sh b/miles_qwen3_8b_h100/entrypoint.sh index 04345e8..eba3188 100755 --- a/miles_qwen3_8b_h100/entrypoint.sh +++ b/miles_qwen3_8b_h100/entrypoint.sh @@ -133,7 +133,7 @@ MISC_ARGS=( ) echo "=== Starting training ===" -python 
/tmp/miles/train_async.py \ +python train_remote.py \ --actor-num-nodes 1 \ --actor-num-gpus-per-node 4 \ --rollout-num-gpus 4 \ diff --git a/miles_qwen3_8b_h100/train_remote.py b/miles_qwen3_8b_h100/train_remote.py new file mode 100644 index 0000000..d95fbaa --- /dev/null +++ b/miles_qwen3_8b_h100/train_remote.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +"""Ray remote wrapper for training - ensures it runs on GPU workers.""" +import sys +import subprocess +import ray + +@ray.remote(num_gpus=4) # Training needs 4 GPUs +def run_training(cmd_args): + """Run training on GPU workers.""" + result = subprocess.run( + ["python", "/tmp/miles/train_async.py"] + cmd_args, + capture_output=False, # Stream output directly + text=True + ) + return result.returncode + +if __name__ == "__main__": + # Pass through all command-line arguments + cmd_args = sys.argv[1:] + + # Run training on GPU workers + returncode = ray.get(run_training.remote(cmd_args)) + + sys.exit(returncode) From 764d8fa64988e2dbc30ffd5a5461e11ee2ee1932 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Thu, 5 Mar 2026 20:52:59 -0800 Subject: [PATCH 08/34] Rename miles_qwen3_8b_h100 to rl_with_miles - Rename directory from miles_qwen3_8b_h100 to rl_with_miles for better clarity - Add Ray remote wrappers (convert_weights_remote.py, train_remote.py) - Update job.yaml to use flexible required_resources instead of fixed instance types - Update all directory references in README.md and job.yaml - Sync latest improvements from main development branch Signed-off-by: Robert Nishihara --- .../Dockerfile | 0 .../README.md | 2 +- rl_with_miles/convert_weights_remote.py | 30 +++++++++++++++++++ .../entrypoint.sh | 6 ++-- .../job.yaml | 14 +++++++-- rl_with_miles/train_remote.py | 24 +++++++++++++++ 6 files changed, 69 insertions(+), 7 deletions(-) rename {miles_qwen3_8b_h100 => rl_with_miles}/Dockerfile (100%) rename {miles_qwen3_8b_h100 => rl_with_miles}/README.md (98%) create mode 100644 
rl_with_miles/convert_weights_remote.py rename {miles_qwen3_8b_h100 => rl_with_miles}/entrypoint.sh (96%) rename {miles_qwen3_8b_h100 => rl_with_miles}/job.yaml (69%) create mode 100644 rl_with_miles/train_remote.py diff --git a/miles_qwen3_8b_h100/Dockerfile b/rl_with_miles/Dockerfile similarity index 100% rename from miles_qwen3_8b_h100/Dockerfile rename to rl_with_miles/Dockerfile diff --git a/miles_qwen3_8b_h100/README.md b/rl_with_miles/README.md similarity index 98% rename from miles_qwen3_8b_h100/README.md rename to rl_with_miles/README.md index fcd12b4..9019c09 100644 --- a/miles_qwen3_8b_h100/README.md +++ b/rl_with_miles/README.md @@ -19,7 +19,7 @@ Clone the example from GitHub. ```bash git clone https://github.com/anyscale/examples.git -cd examples/miles_qwen3_8b_h100 +cd examples/rl_with_miles ``` Submit the job. diff --git a/rl_with_miles/convert_weights_remote.py b/rl_with_miles/convert_weights_remote.py new file mode 100644 index 0000000..f7aaa4d --- /dev/null +++ b/rl_with_miles/convert_weights_remote.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +"""Ray remote wrapper for weight conversion - ensures it runs on a GPU worker.""" +import sys +import subprocess +import ray + +@ray.remote(num_gpus=1) +def convert_weights(cmd_args): + """Run weight conversion on a GPU worker.""" + result = subprocess.run( + ["python", "/tmp/miles/tools/convert_hf_to_torch_dist.py"] + cmd_args, + capture_output=True, + text=True + ) + return result.returncode, result.stdout, result.stderr + +if __name__ == "__main__": + # Pass through all command-line arguments + cmd_args = sys.argv[1:] + + # Run conversion on GPU worker + returncode, stdout, stderr = ray.get(convert_weights.remote(cmd_args)) + + # Print output + if stdout: + print(stdout, end="") + if stderr: + print(stderr, end="", file=sys.stderr) + + sys.exit(returncode) diff --git a/miles_qwen3_8b_h100/entrypoint.sh b/rl_with_miles/entrypoint.sh similarity index 96% rename from miles_qwen3_8b_h100/entrypoint.sh rename 
to rl_with_miles/entrypoint.sh index 79f1a06..eba3188 100755 --- a/miles_qwen3_8b_h100/entrypoint.sh +++ b/rl_with_miles/entrypoint.sh @@ -44,8 +44,8 @@ huggingface-cli download --repo-type dataset zhuzilin/dapo-math-17k --local-dir # ======================== Step 2: Convert HF weights to torch_dist ======================== if [ ! -d "${STORAGE}/Qwen3-8B_torch_dist/iter_0000000" ]; then - echo "=== Converting weights (HF -> torch_dist) ===" - python /tmp/miles/tools/convert_hf_to_torch_dist.py \ + echo "=== Converting weights (HF -> torch_dist) on GPU worker ===" + python convert_weights_remote.py \ ${MODEL_ARGS[@]} \ --no-gradient-accumulation-fusion \ --hf-checkpoint ${STORAGE}/Qwen3-8B \ @@ -133,7 +133,7 @@ MISC_ARGS=( ) echo "=== Starting training ===" -python /tmp/miles/train_async.py \ +python train_remote.py \ --actor-num-nodes 1 \ --actor-num-gpus-per-node 4 \ --rollout-num-gpus 4 \ diff --git a/miles_qwen3_8b_h100/job.yaml b/rl_with_miles/job.yaml similarity index 69% rename from miles_qwen3_8b_h100/job.yaml rename to rl_with_miles/job.yaml index 7c65bac..5c5f156 100644 --- a/miles_qwen3_8b_h100/job.yaml +++ b/rl_with_miles/job.yaml @@ -6,7 +6,7 @@ # Worker 0 (8x H100): [GPU 0-3: Training TP=2 DP=2] [GPU 4-7: Rollout (4 engines)] # # Submit with: -# cd miles_qwen3_8b_h100 +# cd rl_with_miles # anyscale job submit -f job.yaml name: miles-qwen3-8b-grpo-h100 @@ -15,9 +15,17 @@ containerfile: ./Dockerfile compute_config: head_node: - instance_type: m5.2xlarge + required_resources: + CPU: 8 + memory: 32Gi worker_nodes: - - instance_type: p5.48xlarge # 8x H100-80GB + - name: h100-workers + required_resources: + CPU: 192 + memory: 2048Gi + GPU: 8 + required_labels: + ray.io/accelerator-type: H100 min_nodes: 1 max_nodes: 1 diff --git a/rl_with_miles/train_remote.py b/rl_with_miles/train_remote.py new file mode 100644 index 0000000..d95fbaa --- /dev/null +++ b/rl_with_miles/train_remote.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +"""Ray remote wrapper for training 
- ensures it runs on GPU workers.""" +import sys +import subprocess +import ray + +@ray.remote(num_gpus=4) # Training needs 4 GPUs +def run_training(cmd_args): + """Run training on GPU workers.""" + result = subprocess.run( + ["python", "/tmp/miles/train_async.py"] + cmd_args, + capture_output=False, # Stream output directly + text=True + ) + return result.returncode + +if __name__ == "__main__": + # Pass through all command-line arguments + cmd_args = sys.argv[1:] + + # Run training on GPU workers + returncode = ray.get(run_training.remote(cmd_args)) + + sys.exit(returncode) From 97ace474055f106b20105f6847114d6bdcfcb7b9 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Thu, 5 Mar 2026 21:03:05 -0800 Subject: [PATCH 09/34] Add explanation for Ray remote wrappers in README Explain why MILES scripts are wrapped in Ray remote functions to ensure GPU operations run on worker nodes, not the CPU-only head node. Signed-off-by: Robert Nishihara --- rl_with_miles/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/rl_with_miles/README.md b/rl_with_miles/README.md index 9019c09..4041f81 100644 --- a/rl_with_miles/README.md +++ b/rl_with_miles/README.md @@ -37,3 +37,4 @@ The entrypoint will automatically download the model and dataset, convert weight - **Disaggregated architecture**: Training and rollout happen on separate GPUs for maximum throughput. The 4 SGLang rollout engines run inference in parallel while the training GPUs perform gradient updates. - **Weight conversion**: On the first run, HuggingFace weights are converted to Megatron-LM's `torch_dist` format. Converted weights are cached in `/mnt/cluster_storage/Qwen3-8B_torch_dist` for subsequent runs. - **Async training**: The pipeline uses `train_async.py` which overlaps rollout generation and policy updates for better GPU utilization. 
+- **Ray remote wrappers**: The MILES scripts are wrapped in Ray remote functions (`convert_weights_remote.py` and `train_remote.py`) to ensure they execute on GPU worker nodes rather than the CPU-only head node. In Anyscale jobs, the head node typically has no GPUs, so GPU-requiring operations must be explicitly scheduled on worker nodes using `@ray.remote(num_gpus=N)`. From 5d6ff19a8c5f2ec3bd6791cd4fe5f24a8fc9758e Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Thu, 5 Mar 2026 21:07:05 -0800 Subject: [PATCH 10/34] Fix GPU allocation in train_remote.py Reserve all 8 GPUs (not just 4) in the Ray remote wrapper to match the training requirements: 4 GPUs for training + 4 GPUs for rollout. Previously, the wrapper only reserved 4 GPUs but the subprocess needed 8, which would cause CUDA_VISIBLE_DEVICES to hide the rollout GPUs. Signed-off-by: Robert Nishihara --- rl_with_miles/train_remote.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rl_with_miles/train_remote.py b/rl_with_miles/train_remote.py index d95fbaa..fb9b978 100644 --- a/rl_with_miles/train_remote.py +++ b/rl_with_miles/train_remote.py @@ -4,7 +4,7 @@ import subprocess import ray -@ray.remote(num_gpus=4) # Training needs 4 GPUs +@ray.remote(num_gpus=8) # Reserves all 8 GPUs (4 for training + 4 for rollout) def run_training(cmd_args): """Run training on GPU workers.""" result = subprocess.run( From 0e270ae4f00cbd963f703e1779006131772c953d Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Thu, 5 Mar 2026 21:08:55 -0800 Subject: [PATCH 11/34] Fix: Remove GPU reservation from train_remote wrapper Don't reserve GPUs in the wrapper because the MILES training subprocess internally uses Ray to allocate GPUs. Reserving GPUs in both the wrapper and the subprocess creates resource conflicts. The wrapper now only ensures execution on a worker node, while the subprocess handles all GPU allocation through its own Ray calls. 
Signed-off-by: Robert Nishihara --- rl_with_miles/train_remote.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/rl_with_miles/train_remote.py b/rl_with_miles/train_remote.py index fb9b978..091413d 100644 --- a/rl_with_miles/train_remote.py +++ b/rl_with_miles/train_remote.py @@ -4,9 +4,15 @@ import subprocess import ray -@ray.remote(num_gpus=8) # Reserves all 8 GPUs (4 for training + 4 for rollout) +@ray.remote def run_training(cmd_args): - """Run training on GPU workers.""" + """Run training on GPU workers. + + Note: We don't reserve GPUs here because the MILES training script + internally uses Ray to allocate GPUs for training and rollout. + Reserving GPUs in the wrapper would conflict with the subprocess's + GPU allocation. + """ result = subprocess.run( ["python", "/tmp/miles/train_async.py"] + cmd_args, capture_output=False, # Stream output directly From 9260e654f126658f653ba95aa4a096fea3fa328b Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Thu, 5 Mar 2026 21:18:32 -0800 Subject: [PATCH 12/34] Use accelerator type label to ensure GPU node placement Request accelerator_type_H100 resource to explicitly schedule on H100 GPU nodes, matching the compute_config in job.yaml. This ensures the wrapper runs on the correct node type without reserving actual GPUs. Signed-off-by: Robert Nishihara --- rl_with_miles/train_remote.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/rl_with_miles/train_remote.py b/rl_with_miles/train_remote.py index 091413d..a91b229 100644 --- a/rl_with_miles/train_remote.py +++ b/rl_with_miles/train_remote.py @@ -4,11 +4,14 @@ import subprocess import ray -@ray.remote +@ray.remote(resources={"accelerator_type_H100": 0.001}) def run_training(cmd_args): """Run training on GPU workers. - Note: We don't reserve GPUs here because the MILES training script + Uses accelerator_type_H100 resource to ensure scheduling on H100 GPU nodes. 
+ This must match the accelerator-type label in job.yaml compute_config. + + Note: We don't reserve GPUs (num_gpus) because the MILES training script internally uses Ray to allocate GPUs for training and rollout. Reserving GPUs in the wrapper would conflict with the subprocess's GPU allocation. From 8e1f17d921b7599e574a3b6cf23d391de98a80da Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Thu, 5 Mar 2026 21:19:16 -0800 Subject: [PATCH 13/34] Use Ray node affinity scheduling for GPU placement Replace resource-based scheduling with NodeAffinitySchedulingStrategy to explicitly select GPU nodes. Queries ray.nodes() to find nodes with GPUs and schedules the training wrapper on those nodes. This uses Ray's native label/scheduling features rather than custom resources, providing clearer intent and better control over placement. Signed-off-by: Robert Nishihara --- rl_with_miles/train_remote.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/rl_with_miles/train_remote.py b/rl_with_miles/train_remote.py index a91b229..4435bc5 100644 --- a/rl_with_miles/train_remote.py +++ b/rl_with_miles/train_remote.py @@ -3,13 +3,14 @@ import sys import subprocess import ray +from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy -@ray.remote(resources={"accelerator_type_H100": 0.001}) +@ray.remote def run_training(cmd_args): """Run training on GPU workers. - Uses accelerator_type_H100 resource to ensure scheduling on H100 GPU nodes. - This must match the accelerator-type label in job.yaml compute_config. + Uses node label scheduling to ensure placement on H100 GPU nodes. + The label must match the accelerator-type in job.yaml compute_config. Note: We don't reserve GPUs (num_gpus) because the MILES training script internally uses Ray to allocate GPUs for training and rollout. 
@@ -27,7 +28,20 @@ def run_training(cmd_args): # Pass through all command-line arguments cmd_args = sys.argv[1:] + # Get a GPU worker node (matches ray.io/accelerator-type: H100 from job.yaml) + gpu_nodes = [node for node in ray.nodes() if node.get("Resources", {}).get("GPU", 0) > 0] + if not gpu_nodes: + raise RuntimeError("No GPU nodes available") + + # Schedule on a GPU node with H100 accelerators + scheduling_strategy = NodeAffinitySchedulingStrategy( + node_id=gpu_nodes[0]["NodeID"], + soft=False, + ) + # Run training on GPU workers - returncode = ray.get(run_training.remote(cmd_args)) + returncode = ray.get( + run_training.options(scheduling_strategy=scheduling_strategy).remote(cmd_args) + ) sys.exit(returncode) From 1951ae177704a7d3b6d6ed51a9a876e671cb3201 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Thu, 5 Mar 2026 21:20:25 -0800 Subject: [PATCH 14/34] Use label_selector for H100 node placement Use Ray's label_selector API to schedule on H100 GPU nodes: @ray.remote(label_selector={"ray.io/accelerator-type": "H100"}) This matches the accelerator-type label in job.yaml compute_config and uses Ray's native label-based scheduling feature. Signed-off-by: Robert Nishihara --- rl_with_miles/train_remote.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/rl_with_miles/train_remote.py b/rl_with_miles/train_remote.py index 4435bc5..4ab1c15 100644 --- a/rl_with_miles/train_remote.py +++ b/rl_with_miles/train_remote.py @@ -3,13 +3,12 @@ import sys import subprocess import ray -from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy -@ray.remote +@ray.remote(label_selector={"ray.io/accelerator-type": "H100"}) def run_training(cmd_args): """Run training on GPU workers. - Uses node label scheduling to ensure placement on H100 GPU nodes. + Uses label selector to ensure placement on H100 GPU nodes. The label must match the accelerator-type in job.yaml compute_config. 
Note: We don't reserve GPUs (num_gpus) because the MILES training script @@ -28,20 +27,7 @@ def run_training(cmd_args): # Pass through all command-line arguments cmd_args = sys.argv[1:] - # Get a GPU worker node (matches ray.io/accelerator-type: H100 from job.yaml) - gpu_nodes = [node for node in ray.nodes() if node.get("Resources", {}).get("GPU", 0) > 0] - if not gpu_nodes: - raise RuntimeError("No GPU nodes available") - - # Schedule on a GPU node with H100 accelerators - scheduling_strategy = NodeAffinitySchedulingStrategy( - node_id=gpu_nodes[0]["NodeID"], - soft=False, - ) - # Run training on GPU workers - returncode = ray.get( - run_training.options(scheduling_strategy=scheduling_strategy).remote(cmd_args) - ) + returncode = ray.get(run_training.remote(cmd_args)) sys.exit(returncode) From 8d0fdba0da6e2caaf534ea88aaec43b0cca8b005 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Thu, 5 Mar 2026 21:21:49 -0800 Subject: [PATCH 15/34] Add label_selector to convert_weights_remote.py Use label selector to ensure weight conversion runs on H100 GPU nodes: @ray.remote(num_gpus=1, label_selector={"ray.io/accelerator-type": "H100"}) Unlike the training wrapper, this keeps num_gpus=1 because the Megatron weight conversion tool needs actual GPU access. Signed-off-by: Robert Nishihara --- rl_with_miles/convert_weights_remote.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/rl_with_miles/convert_weights_remote.py b/rl_with_miles/convert_weights_remote.py index f7aaa4d..25d3dbf 100644 --- a/rl_with_miles/convert_weights_remote.py +++ b/rl_with_miles/convert_weights_remote.py @@ -4,9 +4,15 @@ import subprocess import ray -@ray.remote(num_gpus=1) +@ray.remote(num_gpus=1, label_selector={"ray.io/accelerator-type": "H100"}) def convert_weights(cmd_args): - """Run weight conversion on a GPU worker.""" + """Run weight conversion on a GPU worker. + + Uses label selector to ensure placement on H100 GPU nodes. 
+ The label must match the accelerator-type in job.yaml compute_config. + + Reserves 1 GPU for the Megatron weight conversion process. + """ result = subprocess.run( ["python", "/tmp/miles/tools/convert_hf_to_torch_dist.py"] + cmd_args, capture_output=True, From 87d98fd340191946d921fcd9c88a668d965e6226 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Thu, 5 Mar 2026 21:23:52 -0800 Subject: [PATCH 16/34] Simplify wrapper scripts Remove if __name__ == "__main__" blocks and intermediate variables. Use raise SystemExit() instead of sys.exit() for cleaner code. Reduces ~15 lines of boilerplate while maintaining functionality. Signed-off-by: Robert Nishihara --- rl_with_miles/convert_weights_remote.py | 20 ++++++-------------- rl_with_miles/train_remote.py | 9 +-------- 2 files changed, 7 insertions(+), 22 deletions(-) diff --git a/rl_with_miles/convert_weights_remote.py b/rl_with_miles/convert_weights_remote.py index 25d3dbf..e43f9bb 100644 --- a/rl_with_miles/convert_weights_remote.py +++ b/rl_with_miles/convert_weights_remote.py @@ -20,17 +20,9 @@ def convert_weights(cmd_args): ) return result.returncode, result.stdout, result.stderr -if __name__ == "__main__": - # Pass through all command-line arguments - cmd_args = sys.argv[1:] - - # Run conversion on GPU worker - returncode, stdout, stderr = ray.get(convert_weights.remote(cmd_args)) - - # Print output - if stdout: - print(stdout, end="") - if stderr: - print(stderr, end="", file=sys.stderr) - - sys.exit(returncode) +returncode, stdout, stderr = ray.get(convert_weights.remote(sys.argv[1:])) +if stdout: + print(stdout, end="") +if stderr: + print(stderr, end="", file=sys.stderr) +raise SystemExit(returncode) diff --git a/rl_with_miles/train_remote.py b/rl_with_miles/train_remote.py index 4ab1c15..01bafb0 100644 --- a/rl_with_miles/train_remote.py +++ b/rl_with_miles/train_remote.py @@ -23,11 +23,4 @@ def run_training(cmd_args): ) return result.returncode -if __name__ == "__main__": - # Pass through all 
command-line arguments - cmd_args = sys.argv[1:] - - # Run training on GPU workers - returncode = ray.get(run_training.remote(cmd_args)) - - sys.exit(returncode) +raise SystemExit(ray.get(run_training.remote(sys.argv[1:]))) From 6cf53a3e73c0f08ed9d63adcdb73ec25ab792c2d Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Thu, 5 Mar 2026 21:24:25 -0800 Subject: [PATCH 17/34] Update README to explain label_selector usage Update the Ray remote wrappers section to: - Explain label_selector usage for H100 GPU node placement - Clarify why it must match job.yaml accelerator-type - Document the difference between the two wrappers (convert_weights reserves 1 GPU, training wrapper doesn't) - Explain why training wrapper avoids GPU reservation (conflict with MILES internal allocation) Signed-off-by: Robert Nishihara --- rl_with_miles/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rl_with_miles/README.md b/rl_with_miles/README.md index 4041f81..a5b793d 100644 --- a/rl_with_miles/README.md +++ b/rl_with_miles/README.md @@ -37,4 +37,4 @@ The entrypoint will automatically download the model and dataset, convert weight - **Disaggregated architecture**: Training and rollout happen on separate GPUs for maximum throughput. The 4 SGLang rollout engines run inference in parallel while the training GPUs perform gradient updates. - **Weight conversion**: On the first run, HuggingFace weights are converted to Megatron-LM's `torch_dist` format. Converted weights are cached in `/mnt/cluster_storage/Qwen3-8B_torch_dist` for subsequent runs. - **Async training**: The pipeline uses `train_async.py` which overlaps rollout generation and policy updates for better GPU utilization. -- **Ray remote wrappers**: The MILES scripts are wrapped in Ray remote functions (`convert_weights_remote.py` and `train_remote.py`) to ensure they execute on GPU worker nodes rather than the CPU-only head node. 
In Anyscale jobs, the head node typically has no GPUs, so GPU-requiring operations must be explicitly scheduled on worker nodes using `@ray.remote(num_gpus=N)`. +- **Ray remote wrappers**: The MILES scripts are wrapped in Ray remote functions (`convert_weights_remote.py` and `train_remote.py`) to ensure they execute on GPU worker nodes rather than the CPU-only head node. Both wrappers use `label_selector={"ray.io/accelerator-type": "H100"}` to match the accelerator type specified in `job.yaml`, ensuring placement on H100 GPU nodes. The weight conversion wrapper reserves 1 GPU (`num_gpus=1`) for the actual conversion process, while the training wrapper does not reserve GPUs to avoid conflicts with MILES's internal GPU allocation for training and rollout. From 24b106f2107dea505149a4e39261524dc1ef6bcc Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Thu, 5 Mar 2026 21:24:48 -0800 Subject: [PATCH 18/34] Revert to sys.exit() instead of raise SystemExit() Use sys.exit() which is more conventional and clearer than raise SystemExit(). 
Signed-off-by: Robert Nishihara --- rl_with_miles/convert_weights_remote.py | 2 +- rl_with_miles/train_remote.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rl_with_miles/convert_weights_remote.py b/rl_with_miles/convert_weights_remote.py index e43f9bb..8e932c3 100644 --- a/rl_with_miles/convert_weights_remote.py +++ b/rl_with_miles/convert_weights_remote.py @@ -25,4 +25,4 @@ def convert_weights(cmd_args): print(stdout, end="") if stderr: print(stderr, end="", file=sys.stderr) -raise SystemExit(returncode) +sys.exit(returncode) diff --git a/rl_with_miles/train_remote.py b/rl_with_miles/train_remote.py index 01bafb0..5cf7ed8 100644 --- a/rl_with_miles/train_remote.py +++ b/rl_with_miles/train_remote.py @@ -23,4 +23,4 @@ def run_training(cmd_args): ) return result.returncode -raise SystemExit(ray.get(run_training.remote(sys.argv[1:]))) +sys.exit(ray.get(run_training.remote(sys.argv[1:]))) From 5b1ca54898ac85b4fd66a907f2c7c7d4b4150184 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Thu, 5 Mar 2026 21:25:25 -0800 Subject: [PATCH 19/34] Remove GPU reservation from conversion wrapper Don't reserve GPUs in convert_weights_remote.py, allowing the conversion script to manage its own GPU allocation. This matches the training wrapper pattern and avoids resource conflicts. Both wrappers now only use label_selector for H100 node placement without reserving GPU resources. Signed-off-by: Robert Nishihara --- rl_with_miles/README.md | 2 +- rl_with_miles/convert_weights_remote.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/rl_with_miles/README.md b/rl_with_miles/README.md index a5b793d..794f251 100644 --- a/rl_with_miles/README.md +++ b/rl_with_miles/README.md @@ -37,4 +37,4 @@ The entrypoint will automatically download the model and dataset, convert weight - **Disaggregated architecture**: Training and rollout happen on separate GPUs for maximum throughput. 
The 4 SGLang rollout engines run inference in parallel while the training GPUs perform gradient updates. - **Weight conversion**: On the first run, HuggingFace weights are converted to Megatron-LM's `torch_dist` format. Converted weights are cached in `/mnt/cluster_storage/Qwen3-8B_torch_dist` for subsequent runs. - **Async training**: The pipeline uses `train_async.py` which overlaps rollout generation and policy updates for better GPU utilization. -- **Ray remote wrappers**: The MILES scripts are wrapped in Ray remote functions (`convert_weights_remote.py` and `train_remote.py`) to ensure they execute on GPU worker nodes rather than the CPU-only head node. Both wrappers use `label_selector={"ray.io/accelerator-type": "H100"}` to match the accelerator type specified in `job.yaml`, ensuring placement on H100 GPU nodes. The weight conversion wrapper reserves 1 GPU (`num_gpus=1`) for the actual conversion process, while the training wrapper does not reserve GPUs to avoid conflicts with MILES's internal GPU allocation for training and rollout. +- **Ray remote wrappers**: The MILES scripts are wrapped in Ray remote functions (`convert_weights_remote.py` and `train_remote.py`) to ensure they execute on GPU worker nodes rather than the CPU-only head node. Both wrappers use `label_selector={"ray.io/accelerator-type": "H100"}` to match the accelerator type specified in `job.yaml`, ensuring placement on H100 GPU nodes. Neither wrapper reserves GPUs (`num_gpus`) to avoid conflicts with the scripts' internal GPU management. 
diff --git a/rl_with_miles/convert_weights_remote.py b/rl_with_miles/convert_weights_remote.py index 8e932c3..b78e8e8 100644 --- a/rl_with_miles/convert_weights_remote.py +++ b/rl_with_miles/convert_weights_remote.py @@ -4,14 +4,15 @@ import subprocess import ray -@ray.remote(num_gpus=1, label_selector={"ray.io/accelerator-type": "H100"}) +@ray.remote(label_selector={"ray.io/accelerator-type": "H100"}) def convert_weights(cmd_args): """Run weight conversion on a GPU worker. Uses label selector to ensure placement on H100 GPU nodes. The label must match the accelerator-type in job.yaml compute_config. - Reserves 1 GPU for the Megatron weight conversion process. + Note: We don't reserve GPUs (num_gpus) to allow the conversion script + to manage GPU allocation as needed. """ result = subprocess.run( ["python", "/tmp/miles/tools/convert_hf_to_torch_dist.py"] + cmd_args, From 25d089de37d5dfab34c1226e5b75989b872a44bf Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Thu, 5 Mar 2026 21:27:04 -0800 Subject: [PATCH 20/34] Fix PYTHONBUFFERED typo in entrypoint.sh Change PYTHONBUFFERED=16 to PYTHONUNBUFFERED=1 for proper Python output buffering control. Signed-off-by: Robert Nishihara --- rl_with_miles/entrypoint.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rl_with_miles/entrypoint.sh b/rl_with_miles/entrypoint.sh index eba3188..be56e6f 100755 --- a/rl_with_miles/entrypoint.sh +++ b/rl_with_miles/entrypoint.sh @@ -10,7 +10,7 @@ set -ex -export PYTHONBUFFERED=16 +export PYTHONUNBUFFERED=1 STORAGE=/mnt/cluster_storage # Qwen3-8B model architecture args (from scripts/models/qwen3-8B.sh) From b68e7e376d4acdc708844465ab3d54cf9b00e693 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Thu, 5 Mar 2026 21:29:58 -0800 Subject: [PATCH 21/34] Fix PEP 8 spacing in wrapper scripts Add two blank lines before and after top-level functions per PEP 8. 
Signed-off-by: Robert Nishihara --- rl_with_miles/convert_weights_remote.py | 2 ++ rl_with_miles/train_remote.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/rl_with_miles/convert_weights_remote.py b/rl_with_miles/convert_weights_remote.py index b78e8e8..7a3db73 100644 --- a/rl_with_miles/convert_weights_remote.py +++ b/rl_with_miles/convert_weights_remote.py @@ -4,6 +4,7 @@ import subprocess import ray + @ray.remote(label_selector={"ray.io/accelerator-type": "H100"}) def convert_weights(cmd_args): """Run weight conversion on a GPU worker. @@ -21,6 +22,7 @@ def convert_weights(cmd_args): ) return result.returncode, result.stdout, result.stderr + returncode, stdout, stderr = ray.get(convert_weights.remote(sys.argv[1:])) if stdout: print(stdout, end="") diff --git a/rl_with_miles/train_remote.py b/rl_with_miles/train_remote.py index 5cf7ed8..d45b302 100644 --- a/rl_with_miles/train_remote.py +++ b/rl_with_miles/train_remote.py @@ -4,6 +4,7 @@ import subprocess import ray + @ray.remote(label_selector={"ray.io/accelerator-type": "H100"}) def run_training(cmd_args): """Run training on GPU workers. @@ -23,4 +24,5 @@ def run_training(cmd_args): ) return result.returncode + sys.exit(ray.get(run_training.remote(sys.argv[1:]))) From a8f5d8262d1721f3b3e20fa53a25a637fa42b781 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Thu, 5 Mar 2026 21:42:36 -0800 Subject: [PATCH 22/34] Use spot instances for H100 workers Add market_type: SPOT to improve availability and reduce cost. 
Signed-off-by: Robert Nishihara --- rl_with_miles/job.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rl_with_miles/job.yaml b/rl_with_miles/job.yaml index 5c5f156..93bee7d 100644 --- a/rl_with_miles/job.yaml +++ b/rl_with_miles/job.yaml @@ -19,13 +19,13 @@ compute_config: CPU: 8 memory: 32Gi worker_nodes: - - name: h100-workers - required_resources: + - required_resources: CPU: 192 memory: 2048Gi GPU: 8 required_labels: ray.io/accelerator-type: H100 + market_type: SPOT min_nodes: 1 max_nodes: 1 From 65d7cdda06a31617c01e4d8f906b8b1f7aa51ced Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Thu, 5 Mar 2026 23:40:43 -0800 Subject: [PATCH 23/34] Fix GPU access in convert_weights_remote.py Reserve 1 GPU in the weight conversion wrapper so that CUDA_VISIBLE_DEVICES is set for the subprocess. This allows the conversion script to access GPUs. The label_selector alone doesn't provide GPU access to subprocesses. This doesn't conflict with training since conversion completes before training starts. Co-Authored-By: Claude Sonnet 4.5 --- rl_with_miles/README.md | 4 +++- rl_with_miles/convert_weights_remote.py | 10 +++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/rl_with_miles/README.md b/rl_with_miles/README.md index 794f251..c260296 100644 --- a/rl_with_miles/README.md +++ b/rl_with_miles/README.md @@ -37,4 +37,6 @@ The entrypoint will automatically download the model and dataset, convert weight - **Disaggregated architecture**: Training and rollout happen on separate GPUs for maximum throughput. The 4 SGLang rollout engines run inference in parallel while the training GPUs perform gradient updates. - **Weight conversion**: On the first run, HuggingFace weights are converted to Megatron-LM's `torch_dist` format. Converted weights are cached in `/mnt/cluster_storage/Qwen3-8B_torch_dist` for subsequent runs. 
- **Async training**: The pipeline uses `train_async.py` which overlaps rollout generation and policy updates for better GPU utilization. -- **Ray remote wrappers**: The MILES scripts are wrapped in Ray remote functions (`convert_weights_remote.py` and `train_remote.py`) to ensure they execute on GPU worker nodes rather than the CPU-only head node. Both wrappers use `label_selector={"ray.io/accelerator-type": "H100"}` to match the accelerator type specified in `job.yaml`, ensuring placement on H100 GPU nodes. Neither wrapper reserves GPUs (`num_gpus`) to avoid conflicts with the scripts' internal GPU management. +- **Ray remote wrappers**: The MILES scripts are wrapped in Ray remote functions (`convert_weights_remote.py` and `train_remote.py`) to ensure they execute on GPU worker nodes rather than the CPU-only head node. Both wrappers use `label_selector={"ray.io/accelerator-type": "H100"}` to match the accelerator type specified in `job.yaml`, ensuring placement on H100 GPU nodes. + - `convert_weights_remote.py`: Reserves 1 GPU (`num_gpus=1`) so that `CUDA_VISIBLE_DEVICES` is set for the subprocess, enabling GPU access. This doesn't conflict with training since conversion completes before training starts. + - `train_remote.py`: Does NOT reserve GPUs to avoid conflicts with the MILES training script, which internally uses Ray to allocate all 8 GPUs (4 for training, 4 for rollout). The label selector ensures placement on the GPU worker node. diff --git a/rl_with_miles/convert_weights_remote.py b/rl_with_miles/convert_weights_remote.py index 7a3db73..0bb46a6 100644 --- a/rl_with_miles/convert_weights_remote.py +++ b/rl_with_miles/convert_weights_remote.py @@ -5,15 +5,19 @@ import ray -@ray.remote(label_selector={"ray.io/accelerator-type": "H100"}) +@ray.remote( + num_gpus=1, + label_selector={"ray.io/accelerator-type": "H100"} +) def convert_weights(cmd_args): """Run weight conversion on a GPU worker. Uses label selector to ensure placement on H100 GPU nodes. 
The label must match the accelerator-type in job.yaml compute_config. - Note: We don't reserve GPUs (num_gpus) to allow the conversion script - to manage GPU allocation as needed. + Reserves 1 GPU (num_gpus=1) so that CUDA_VISIBLE_DEVICES is set for + the subprocess, enabling GPU access. This doesn't conflict with training + since conversion runs before training starts. """ result = subprocess.run( ["python", "/tmp/miles/tools/convert_hf_to_torch_dist.py"] + cmd_args, From 922bebd817f4157ba2524c91ced9eb55c85b5246 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Thu, 5 Mar 2026 23:44:28 -0800 Subject: [PATCH 24/34] Set CUDA_VISIBLE_DEVICES explicitly for all GPUs Set CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 in both wrapper subprocesses so they have explicit access to all 8 GPUs on the H100 worker node. Co-Authored-By: Claude Sonnet 4.5 --- rl_with_miles/convert_weights_remote.py | 13 +++++++++---- rl_with_miles/train_remote.py | 14 +++++++++----- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/rl_with_miles/convert_weights_remote.py b/rl_with_miles/convert_weights_remote.py index 0bb46a6..2e002f6 100644 --- a/rl_with_miles/convert_weights_remote.py +++ b/rl_with_miles/convert_weights_remote.py @@ -2,6 +2,7 @@ """Ray remote wrapper for weight conversion - ensures it runs on a GPU worker.""" import sys import subprocess +import os import ray @@ -15,14 +16,18 @@ def convert_weights(cmd_args): Uses label selector to ensure placement on H100 GPU nodes. The label must match the accelerator-type in job.yaml compute_config. - Reserves 1 GPU (num_gpus=1) so that CUDA_VISIBLE_DEVICES is set for - the subprocess, enabling GPU access. This doesn't conflict with training - since conversion runs before training starts. + Reserves 1 GPU (num_gpus=1) for scheduling and explicitly sets + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 so the subprocess can access + all GPUs on the worker node. 
""" + env = os.environ.copy() + env["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7" + result = subprocess.run( ["python", "/tmp/miles/tools/convert_hf_to_torch_dist.py"] + cmd_args, capture_output=True, - text=True + text=True, + env=env ) return result.returncode, result.stdout, result.stderr diff --git a/rl_with_miles/train_remote.py b/rl_with_miles/train_remote.py index d45b302..ade4f33 100644 --- a/rl_with_miles/train_remote.py +++ b/rl_with_miles/train_remote.py @@ -2,6 +2,7 @@ """Ray remote wrapper for training - ensures it runs on GPU workers.""" import sys import subprocess +import os import ray @@ -12,15 +13,18 @@ def run_training(cmd_args): Uses label selector to ensure placement on H100 GPU nodes. The label must match the accelerator-type in job.yaml compute_config. - Note: We don't reserve GPUs (num_gpus) because the MILES training script - internally uses Ray to allocate GPUs for training and rollout. - Reserving GPUs in the wrapper would conflict with the subprocess's - GPU allocation. + Explicitly sets CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 so the subprocess + can access all GPUs on the worker node. Does not reserve GPUs (num_gpus) + to avoid conflicts with the MILES training script's internal GPU allocation. """ + env = os.environ.copy() + env["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7" + result = subprocess.run( ["python", "/tmp/miles/train_async.py"] + cmd_args, capture_output=False, # Stream output directly - text=True + text=True, + env=env ) return result.returncode From 390fcbee28b7aca951f4a3ca062897ea81a8ec58 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Thu, 5 Mar 2026 23:45:01 -0800 Subject: [PATCH 25/34] Remove num_gpus reservation from convert wrapper Don't reserve GPUs in the wrapper - use label_selector for placement and explicit CUDA_VISIBLE_DEVICES for GPU access. This gives the subprocess full control over GPU allocation. 
Co-Authored-By: Claude Sonnet 4.5 --- rl_with_miles/README.md | 4 +--- rl_with_miles/convert_weights_remote.py | 11 ++++------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/rl_with_miles/README.md b/rl_with_miles/README.md index c260296..5a59bd9 100644 --- a/rl_with_miles/README.md +++ b/rl_with_miles/README.md @@ -37,6 +37,4 @@ The entrypoint will automatically download the model and dataset, convert weight - **Disaggregated architecture**: Training and rollout happen on separate GPUs for maximum throughput. The 4 SGLang rollout engines run inference in parallel while the training GPUs perform gradient updates. - **Weight conversion**: On the first run, HuggingFace weights are converted to Megatron-LM's `torch_dist` format. Converted weights are cached in `/mnt/cluster_storage/Qwen3-8B_torch_dist` for subsequent runs. - **Async training**: The pipeline uses `train_async.py` which overlaps rollout generation and policy updates for better GPU utilization. -- **Ray remote wrappers**: The MILES scripts are wrapped in Ray remote functions (`convert_weights_remote.py` and `train_remote.py`) to ensure they execute on GPU worker nodes rather than the CPU-only head node. Both wrappers use `label_selector={"ray.io/accelerator-type": "H100"}` to match the accelerator type specified in `job.yaml`, ensuring placement on H100 GPU nodes. - - `convert_weights_remote.py`: Reserves 1 GPU (`num_gpus=1`) so that `CUDA_VISIBLE_DEVICES` is set for the subprocess, enabling GPU access. This doesn't conflict with training since conversion completes before training starts. - - `train_remote.py`: Does NOT reserve GPUs to avoid conflicts with the MILES training script, which internally uses Ray to allocate all 8 GPUs (4 for training, 4 for rollout). The label selector ensures placement on the GPU worker node. 
+- **Ray remote wrappers**: The MILES scripts are wrapped in Ray remote functions (`convert_weights_remote.py` and `train_remote.py`) to ensure they execute on GPU worker nodes rather than the CPU-only head node. Both wrappers use `label_selector={"ray.io/accelerator-type": "H100"}` to match the accelerator type specified in `job.yaml`, ensuring placement on H100 GPU nodes. Both wrappers explicitly set `CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` in the subprocess environment to provide access to all 8 GPUs. Neither wrapper reserves GPUs with `num_gpus` to allow the subprocesses to manage GPU allocation internally. diff --git a/rl_with_miles/convert_weights_remote.py b/rl_with_miles/convert_weights_remote.py index 2e002f6..315a048 100644 --- a/rl_with_miles/convert_weights_remote.py +++ b/rl_with_miles/convert_weights_remote.py @@ -6,19 +6,16 @@ import ray -@ray.remote( - num_gpus=1, - label_selector={"ray.io/accelerator-type": "H100"} -) +@ray.remote(label_selector={"ray.io/accelerator-type": "H100"}) def convert_weights(cmd_args): """Run weight conversion on a GPU worker. Uses label selector to ensure placement on H100 GPU nodes. The label must match the accelerator-type in job.yaml compute_config. - Reserves 1 GPU (num_gpus=1) for scheduling and explicitly sets - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 so the subprocess can access - all GPUs on the worker node. + Explicitly sets CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 so the subprocess + can access all GPUs on the worker node. Does not reserve GPUs (num_gpus) + to allow flexible GPU allocation. """ env = os.environ.copy() env["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7" From 395f24fe02483e8fbf19287c6f4c1083cf455641 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 6 Mar 2026 11:52:14 -0800 Subject: [PATCH 26/34] Fix TensorBoard configuration Set TENSORBOARD_DIR as environment variable instead of passing as command-line argument. MILES expects this to be set in the environment. 
Co-Authored-By: Claude Sonnet 4.5 --- rl_with_miles/entrypoint.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rl_with_miles/entrypoint.sh b/rl_with_miles/entrypoint.sh index be56e6f..8ff1612 100755 --- a/rl_with_miles/entrypoint.sh +++ b/rl_with_miles/entrypoint.sh @@ -12,6 +12,7 @@ set -ex export PYTHONUNBUFFERED=1 STORAGE=/mnt/cluster_storage +export TENSORBOARD_DIR=${STORAGE}/tensorboard_logs # Qwen3-8B model architecture args (from scripts/models/qwen3-8B.sh) MODEL_ARGS=( @@ -129,7 +130,6 @@ MISC_ARGS=( --attention-softmax-in-fp32 --attention-backend flash --use-tensorboard - --tensorboard-dir ${STORAGE}/tensorboard_logs ) echo "=== Starting training ===" From f9cbc9c1500088ec09c328881a6730a87db429cf Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 6 Mar 2026 12:14:15 -0800 Subject: [PATCH 27/34] Set TENSORBOARD_DIR in job env_vars Set TENSORBOARD_DIR as a cluster-wide environment variable in job.yaml so it's available to all Ray actors, including the RolloutManager. Co-Authored-By: Claude Sonnet 4.5 --- rl_with_miles/job.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/rl_with_miles/job.yaml b/rl_with_miles/job.yaml index 93bee7d..5517457 100644 --- a/rl_with_miles/job.yaml +++ b/rl_with_miles/job.yaml @@ -35,6 +35,7 @@ entrypoint: bash entrypoint.sh env_vars: CUDA_DEVICE_MAX_CONNECTIONS: "1" + TENSORBOARD_DIR: "/mnt/cluster_storage/tensorboard_logs" max_retries: 0 timeout_s: 7200 From 05f7b4b083b3969ffd9ebbf800121b1033a7b7c1 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 6 Mar 2026 13:49:26 -0800 Subject: [PATCH 28/34] Remove redundant TENSORBOARD_DIR from entrypoint TENSORBOARD_DIR is now set at the job level in job.yaml, so remove the redundant export from entrypoint.sh. 
Co-Authored-By: Claude Sonnet 4.5 --- rl_with_miles/entrypoint.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/rl_with_miles/entrypoint.sh b/rl_with_miles/entrypoint.sh index 8ff1612..34d465b 100755 --- a/rl_with_miles/entrypoint.sh +++ b/rl_with_miles/entrypoint.sh @@ -12,7 +12,6 @@ set -ex export PYTHONUNBUFFERED=1 STORAGE=/mnt/cluster_storage -export TENSORBOARD_DIR=${STORAGE}/tensorboard_logs # Qwen3-8B model architecture args (from scripts/models/qwen3-8B.sh) MODEL_ARGS=( From 4d9c8607960bfaafb9a4dfe2b16bdcf61f5db375 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 6 Mar 2026 17:03:45 -0800 Subject: [PATCH 29/34] Use on-demand instances instead of spot Remove market_type: SPOT to use on-demand instances for better availability and reliability. Co-Authored-By: Claude Sonnet 4.5 --- rl_with_miles/job.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/rl_with_miles/job.yaml b/rl_with_miles/job.yaml index 5517457..70fdad6 100644 --- a/rl_with_miles/job.yaml +++ b/rl_with_miles/job.yaml @@ -25,7 +25,6 @@ compute_config: GPU: 8 required_labels: ray.io/accelerator-type: H100 - market_type: SPOT min_nodes: 1 max_nodes: 1 From 6355c4abf4d473b234a2250f863c3befe78ef2fd Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 6 Mar 2026 17:05:43 -0800 Subject: [PATCH 30/34] Clean up formatting - Remove shebang lines from Python wrapper scripts - Remove comment header from job.yaml Co-Authored-By: Claude Sonnet 4.5 --- rl_with_miles/convert_weights_remote.py | 1 - rl_with_miles/job.yaml | 11 ----------- rl_with_miles/train_remote.py | 1 - 3 files changed, 13 deletions(-) diff --git a/rl_with_miles/convert_weights_remote.py b/rl_with_miles/convert_weights_remote.py index 315a048..6b90b07 100644 --- a/rl_with_miles/convert_weights_remote.py +++ b/rl_with_miles/convert_weights_remote.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Ray remote wrapper for weight conversion - ensures it runs on a GPU worker.""" import sys import subprocess diff --git 
a/rl_with_miles/job.yaml b/rl_with_miles/job.yaml index 70fdad6..9844650 100644 --- a/rl_with_miles/job.yaml +++ b/rl_with_miles/job.yaml @@ -1,14 +1,3 @@ -# Anyscale job config: Miles Qwen3-8B GRPO training on H100 -# Single node × 8x H100-80GB -# -# Layout: -# Head node (m5.2xlarge): driver only, no GPUs -# Worker 0 (8x H100): [GPU 0-3: Training TP=2 DP=2] [GPU 4-7: Rollout (4 engines)] -# -# Submit with: -# cd rl_with_miles -# anyscale job submit -f job.yaml - name: miles-qwen3-8b-grpo-h100 containerfile: ./Dockerfile diff --git a/rl_with_miles/train_remote.py b/rl_with_miles/train_remote.py index ade4f33..1dfa95e 100644 --- a/rl_with_miles/train_remote.py +++ b/rl_with_miles/train_remote.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Ray remote wrapper for training - ensures it runs on GPU workers.""" import sys import subprocess From b499b61afc9c776c99e00b861329e98b73d9b68a Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 6 Mar 2026 18:50:47 -0800 Subject: [PATCH 31/34] Fix wrapper scripts with correct versions from polish-qwen3-example - Remove shebang lines - Use label_selector without num_gpus reservation - Add explicit CUDA_VISIBLE_DEVICES for GPU access - Simplify script structure without if __name__ blocks Co-Authored-By: Claude Sonnet 4.5 --- rl_with_miles/convert_weights_remote.py | 40 ++++++++++++++----------- rl_with_miles/train_remote.py | 29 +++++++++++------- 2 files changed, 41 insertions(+), 28 deletions(-) diff --git a/rl_with_miles/convert_weights_remote.py b/rl_with_miles/convert_weights_remote.py index f7aaa4d..6b90b07 100644 --- a/rl_with_miles/convert_weights_remote.py +++ b/rl_with_miles/convert_weights_remote.py @@ -1,30 +1,36 @@ -#!/usr/bin/env python """Ray remote wrapper for weight conversion - ensures it runs on a GPU worker.""" import sys import subprocess +import os import ray -@ray.remote(num_gpus=1) + +@ray.remote(label_selector={"ray.io/accelerator-type": "H100"}) def convert_weights(cmd_args): - """Run weight conversion 
on a GPU worker.""" + """Run weight conversion on a GPU worker. + + Uses label selector to ensure placement on H100 GPU nodes. + The label must match the accelerator-type in job.yaml compute_config. + + Explicitly sets CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 so the subprocess + can access all GPUs on the worker node. Does not reserve GPUs (num_gpus) + to allow flexible GPU allocation. + """ + env = os.environ.copy() + env["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7" + result = subprocess.run( ["python", "/tmp/miles/tools/convert_hf_to_torch_dist.py"] + cmd_args, capture_output=True, - text=True + text=True, + env=env ) return result.returncode, result.stdout, result.stderr -if __name__ == "__main__": - # Pass through all command-line arguments - cmd_args = sys.argv[1:] - - # Run conversion on GPU worker - returncode, stdout, stderr = ray.get(convert_weights.remote(cmd_args)) - - # Print output - if stdout: - print(stdout, end="") - if stderr: - print(stderr, end="", file=sys.stderr) - sys.exit(returncode) +returncode, stdout, stderr = ray.get(convert_weights.remote(sys.argv[1:])) +if stdout: + print(stdout, end="") +if stderr: + print(stderr, end="", file=sys.stderr) +sys.exit(returncode) diff --git a/rl_with_miles/train_remote.py b/rl_with_miles/train_remote.py index d95fbaa..1dfa95e 100644 --- a/rl_with_miles/train_remote.py +++ b/rl_with_miles/train_remote.py @@ -1,24 +1,31 @@ -#!/usr/bin/env python """Ray remote wrapper for training - ensures it runs on GPU workers.""" import sys import subprocess +import os import ray -@ray.remote(num_gpus=4) # Training needs 4 GPUs + +@ray.remote(label_selector={"ray.io/accelerator-type": "H100"}) def run_training(cmd_args): - """Run training on GPU workers.""" + """Run training on GPU workers. + + Uses label selector to ensure placement on H100 GPU nodes. + The label must match the accelerator-type in job.yaml compute_config. 
+ + Explicitly sets CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 so the subprocess + can access all GPUs on the worker node. Does not reserve GPUs (num_gpus) + to avoid conflicts with the MILES training script's internal GPU allocation. + """ + env = os.environ.copy() + env["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7" + result = subprocess.run( ["python", "/tmp/miles/train_async.py"] + cmd_args, capture_output=False, # Stream output directly - text=True + text=True, + env=env ) return result.returncode -if __name__ == "__main__": - # Pass through all command-line arguments - cmd_args = sys.argv[1:] - - # Run training on GPU workers - returncode = ray.get(run_training.remote(cmd_args)) - sys.exit(returncode) +sys.exit(ray.get(run_training.remote(sys.argv[1:]))) From f7f9d452fa24085292ff150e002ff8dc9740b81c Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Sat, 7 Mar 2026 13:26:51 -0800 Subject: [PATCH 32/34] Remove specific memory size from H100 GPUs Remove "80GB" specification as the example works with standard H100 GPUs regardless of memory variant. Co-Authored-By: Claude Sonnet 4.5 --- rl_with_miles/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rl_with_miles/README.md b/rl_with_miles/README.md index 5a59bd9..460a590 100644 --- a/rl_with_miles/README.md +++ b/rl_with_miles/README.md @@ -2,7 +2,7 @@ This example demonstrates reinforcement learning fine-tuning of Qwen3-8B using **Group Relative Policy Optimization (GRPO)** on the DAPO-Math-17k dataset. It uses the [MILES](https://github.com/radixark/miles) framework for distributed RL training with disaggregated rollouts on Anyscale. 
-The training runs on a single node with **8x H100-80GB GPUs**, using: +The training runs on a single node with **8x H100 GPUs**, using: - **4 GPUs for training** (TP=2, DP=2 with Megatron-LM) - **4 GPUs for rollout inference** (disaggregated SGLang engines) From 551421c5537842c6ff238aaf2819e38df998c4b6 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Sat, 7 Mar 2026 13:30:01 -0800 Subject: [PATCH 33/34] Add explanation for CUDA_DEVICE_MAX_CONNECTIONS setting Document that CUDA_DEVICE_MAX_CONNECTIONS=1 is standard for Megatron-LM with tensor parallelism on H100 GPUs. Co-Authored-By: Claude Sonnet 4.5 --- rl_with_miles/job.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rl_with_miles/job.yaml b/rl_with_miles/job.yaml index 9844650..286eeb7 100644 --- a/rl_with_miles/job.yaml +++ b/rl_with_miles/job.yaml @@ -22,6 +22,9 @@ working_dir: . entrypoint: bash entrypoint.sh env_vars: + # Standard setting for Megatron-LM with tensor parallelism on H100 GPUs. + # Limits concurrent CUDA kernel launches to prevent deadlocks with NCCL + # collective operations during distributed training. 
CUDA_DEVICE_MAX_CONNECTIONS: "1" TENSORBOARD_DIR: "/mnt/cluster_storage/tensorboard_logs" From b03c4789b32dded0d287b80e91c46ac2b9e9d81f Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Sat, 7 Mar 2026 15:36:14 -0800 Subject: [PATCH 34/34] Update example to 2-node configuration with spot instances - Scale to 2 worker nodes (16 GPUs total) - Training: TP=2, DP=4 across 2 nodes (8 training GPUs) - Rollout: 8 SGLang engines (4 per node) - Document MILES placement groups behavior (dedicated nodes) - Enable spot instances for better availability Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Robert Nishihara --- rl_with_miles/README.md | 8 ++++---- rl_with_miles/entrypoint.sh | 13 ++++++------- rl_with_miles/job.yaml | 4 ++-- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/rl_with_miles/README.md b/rl_with_miles/README.md index 460a590..f45d0c0 100644 --- a/rl_with_miles/README.md +++ b/rl_with_miles/README.md @@ -2,9 +2,9 @@ This example demonstrates reinforcement learning fine-tuning of Qwen3-8B using **Group Relative Policy Optimization (GRPO)** on the DAPO-Math-17k dataset. It uses the [MILES](https://github.com/radixark/miles) framework for distributed RL training with disaggregated rollouts on Anyscale. -The training runs on a single node with **8x H100 GPUs**, using: -- **4 GPUs for training** (TP=2, DP=2 with Megatron-LM) -- **4 GPUs for rollout inference** (disaggregated SGLang engines) +The training runs on **2 nodes with 8x H100 GPUs each** (16 GPUs total), using: +- **8 GPUs for training** (TP=2, DP=4 with Megatron-LM across 2 nodes) +- **8 GPUs for rollout inference** (disaggregated SGLang engines, 8 total) ## Install the Anyscale CLI @@ -34,7 +34,7 @@ The entrypoint will automatically download the model and dataset, convert weight - **Algorithm**: This example uses GRPO with DAPO-style asymmetric clipping (ε_low=0.2, ε_high=0.28), which is particularly effective for math reasoning tasks.
- **Dataset**: [DAPO-Math-17k](https://huggingface.co/datasets/zhuzilin/dapo-math-17k) contains 17k integer math problems with deterministic reward signals based on answer correctness. -- **Disaggregated architecture**: Training and rollout happen on separate GPUs for maximum throughput. The 4 SGLang rollout engines run inference in parallel while the training GPUs perform gradient updates. +- **Disaggregated architecture**: Training and rollout happen on separate GPUs for maximum throughput. GPU placement is handled automatically by MILES using Ray placement groups, which — per `--actor-num-nodes 2 --actor-num-gpus-per-node 4` and `--rollout-num-gpus 8` — allocates 4 training GPUs and 4 rollout GPUs on each of the two worker nodes. - **Weight conversion**: On the first run, HuggingFace weights are converted to Megatron-LM's `torch_dist` format. Converted weights are cached in `/mnt/cluster_storage/Qwen3-8B_torch_dist` for subsequent runs. - **Async training**: The pipeline uses `train_async.py` which overlaps rollout generation and policy updates for better GPU utilization. - **Ray remote wrappers**: The MILES scripts are wrapped in Ray remote functions (`convert_weights_remote.py` and `train_remote.py`) to ensure they execute on GPU worker nodes rather than the CPU-only head node. Both wrappers use `label_selector={"ray.io/accelerator-type": "H100"}` to match the accelerator type specified in `job.yaml`, ensuring placement on H100 GPU nodes. Both wrappers explicitly set `CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` in the subprocess environment to provide access to all 8 GPUs. Neither wrapper reserves GPUs with `num_gpus` to allow the subprocesses to manage GPU allocation internally.
diff --git a/rl_with_miles/entrypoint.sh b/rl_with_miles/entrypoint.sh index 34d465b..3e171e5 100755 --- a/rl_with_miles/entrypoint.sh +++ b/rl_with_miles/entrypoint.sh @@ -1,12 +1,11 @@ #!/bin/bash -# Anyscale entrypoint: Qwen3-8B GRPO training on 1 worker × 8x H100-80GB +# Anyscale entrypoint: Qwen3-8B GRPO training on 2 workers × 8x H100 # Downloads model/dataset, converts weights, and runs async RL training. # # Head node (m5.2xlarge): driver only, no GPUs -# Layout (GPU worker): -# Worker 0 (8x H100): -# GPU 0-3: Training (TP=2, DP=2) -# GPU 4-7: Rollout (4 SGLang engines, 1 GPU each) +# GPU Placement (determined by MILES using Ray Placement Groups with PACK strategy): +# Each worker node (8x H100): 4 GPUs for training (TP=2, DP=4 across both nodes) +# and 4 GPUs for rollout (4 SGLang engines per node, 1 GPU each) set -ex @@ -133,9 +132,9 @@ MISC_ARGS=( echo "=== Starting training ===" python train_remote.py \ - --actor-num-nodes 1 \ + --actor-num-nodes 2 \ --actor-num-gpus-per-node 4 \ - --rollout-num-gpus 4 \ + --rollout-num-gpus 8 \ ${MODEL_ARGS[@]} \ ${CKPT_ARGS[@]} \ ${ROLLOUT_ARGS[@]} \ diff --git a/rl_with_miles/job.yaml b/rl_with_miles/job.yaml index 286eeb7..18de47d 100644 --- a/rl_with_miles/job.yaml +++ b/rl_with_miles/job.yaml @@ -14,8 +14,8 @@ compute_config: GPU: 8 required_labels: ray.io/accelerator-type: H100 - min_nodes: 1 - max_nodes: 1 + min_nodes: 2 + max_nodes: 2 working_dir: .