From 6d050ff2e3dcfa3c50515197ceffc0b746a8d1ac Mon Sep 17 00:00:00 2001 From: zhangyue Date: Fri, 13 Mar 2026 01:42:34 +0000 Subject: [PATCH 1/6] fix: remove uint16 test from test_add.py - Removed `torch.uint16` from the list of integer data types in the `_INT_DTYPES` tuple to streamline the code and eliminate redundancy. --- tests/test_add.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_add.py b/tests/test_add.py index afbce0d..900c4a1 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -6,7 +6,6 @@ _INT_DTYPES = ( torch.int16, - torch.uint16, torch.int32, torch.uint32, torch.int64, @@ -39,7 +38,6 @@ (torch.float16, 1e-3, 1e-3), (torch.bfloat16, 1e-2, 5e-3), (torch.int16, 0, 0), - (torch.uint16, 0, 0), (torch.int32, 0, 0), (torch.uint32, 0, 0), (torch.int64, 0, 0), From 252abb286bf46a2ea18dddc38811aa7a4d648d45 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Fri, 13 Mar 2026 01:53:20 +0000 Subject: [PATCH 2/6] refactor: enhance dtype handling in test_add.py --- tests/test_add.py | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/tests/test_add.py b/tests/test_add.py index 900c4a1..77ebb3d 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -4,12 +4,16 @@ from tests.utils import Payload, empty_strided, randint_strided, randn_strided -_INT_DTYPES = ( - torch.int16, - torch.int32, - torch.uint32, - torch.int64, - torch.uint64, +_INT_DTYPES = tuple( + d + for d in ( + torch.int16, + torch.int32, + getattr(torch, "uint32", None), + torch.int64, + getattr(torch, "uint64", None), + ) + if d is not None ) @@ -31,19 +35,21 @@ ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), ), ) -@pytest.mark.parametrize( - ("dtype", "rtol", "atol"), - ( +def _dtype_parametrize(): + candidates = [ (torch.float32, 1e-7, 1e-7), (torch.float16, 1e-3, 1e-3), (torch.bfloat16, 1e-2, 5e-3), (torch.int16, 0, 0), (torch.int32, 0, 0), - (torch.uint32, 0, 0), + (getattr(torch, "uint32", None), 0, 0), (torch.int64, 0, 0), - (torch.uint64, 0, 0), - ), -) + (getattr(torch, "uint64", None), 0, 0), + ] + return tuple((d, r, a) for (d, r, a) in candidates if d is not None) + + +@pytest.mark.parametrize(("dtype", "rtol", "atol"), _dtype_parametrize()) def test_add(shape, input_strides, other_strides, out_strides, dtype, device, rtol, atol): if dtype in _INT_DTYPES: input = randint_strided(0, 100, shape, input_strides, dtype=dtype, device=device) @@ -63,11 +69,16 @@ def _add(input, other, out): return out +_UINT_DTYPES = tuple( + d for name in ("uint16", "uint32", "uint64") if (d := getattr(torch, name, None)) is not None +) + + def _torch_add(input, other, out): - if input.dtype in (torch.uint16, torch.uint32, torch.uint64): + if input.dtype in _UINT_DTYPES: input = input.to(torch.int64) - if other.dtype in (torch.uint16, torch.uint32, torch.uint64): + if other.dtype in _UINT_DTYPES: other = other.to(torch.int64) res = torch.add(input, other) From 031e928a1765b3dd441e41c2af15a8c4487e373a Mon Sep 17 00:00:00 2001 From: zhangyue Date: Fri, 13 Mar 2026 02:06:25 +0000 Subject: [PATCH 3/6] refactor: streamline dtype parameterization in test_add.py and enhance rms_norm fallback handling in test_rms_norm.py --- tests/test_add.py | 28 ++++++++++++++-------------- tests/test_rms_norm.py | 11 ++++++++++- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/tests/test_add.py b/tests/test_add.py index 77ebb3d..fe57652 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -17,6 +17,20 @@ ) +def _dtype_parametrize(): + candidates = [ + (torch.float32, 1e-7, 1e-7), + (torch.float16, 1e-3, 1e-3), + (torch.bfloat16, 1e-2, 5e-3), + (torch.int16, 0, 0), + (torch.int32, 0, 0), + (getattr(torch, "uint32", None), 0, 0), + (torch.int64, 0, 0), + (getattr(torch, "uint64", None), 0, 0), + ] + return tuple((d, r, a) for (d, r, a) in candidates if d is not None) + + @pytest.mark.auto_act_and_assert @pytest.mark.parametrize( "shape, input_strides, other_strides, out_strides", @@ -35,20 +49,6 @@ ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), ), ) -def _dtype_parametrize(): - candidates = [ - (torch.float32, 1e-7, 1e-7), - (torch.float16, 1e-3, 1e-3), - (torch.bfloat16, 1e-2, 5e-3), - (torch.int16, 0, 0), - (torch.int32, 0, 0), - (getattr(torch, "uint32", None), 0, 0), - (torch.int64, 0, 0), - (getattr(torch, "uint64", None), 0, 0), - ] - return tuple((d, r, a) for (d, r, a) in candidates if d is not None) - - @pytest.mark.parametrize(("dtype", "rtol", "atol"), _dtype_parametrize()) def test_add(shape, input_strides, other_strides, out_strides, dtype, device, rtol, atol): if dtype in _INT_DTYPES: diff --git a/tests/test_rms_norm.py b/tests/test_rms_norm.py index f447091..b0c9c5d 100644 --- a/tests/test_rms_norm.py +++ b/tests/test_rms_norm.py @@ -59,4 +59,13 @@ def _rms_norm(input, weight, *, eps=1e-6, out=None): def _torch_rms_norm(input, weight, *, eps=1e-6, out=None): - return torch.nn.functional.rms_norm(input, input.shape[-1:], weight=weight, eps=eps) + rms_norm_fn = getattr(torch.nn.functional, "rms_norm", None) + if rms_norm_fn is not None: + return rms_norm_fn(input, input.shape[-1:], weight=weight, eps=eps) + # Fallback for PyTorch < 2.3: RMS norm = (x / sqrt(mean(x^2) + eps)) * weight + rms = torch.sqrt(torch.mean(input * input, dim=-1, keepdim=True) + eps) + result = (input / rms) * weight + if out is not None: + out.copy_(result) + return out + return result From 6a27f695e6eab604bb2506cea23bd501b2e34149 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Fri, 13 Mar 2026 02:39:02 +0000 Subject: [PATCH 4/6] refactor: add unsigned integer data types to test_add.py for enhanced dtype handling --- tests/test_add.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/tests/test_add.py b/tests/test_add.py index fe57652..61d6715 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -9,13 +9,20 @@ for d in ( torch.int16, torch.int32, - getattr(torch, "uint32", None), torch.int64, - getattr(torch, "uint64", None), ) if d is not None ) +_UINT_DTYPES = tuple( + d + for d in ( + getattr(torch, "uint16", None), + getattr(torch, "uint32", None), + getattr(torch, "uint64", None), + ) + if d is not None +) def _dtype_parametrize(): candidates = [ @@ -51,7 +58,7 @@ def _dtype_parametrize(): ) @pytest.mark.parametrize(("dtype", "rtol", "atol"), _dtype_parametrize()) def test_add(shape, input_strides, other_strides, out_strides, dtype, device, rtol, atol): - if dtype in _INT_DTYPES: + if dtype in _INT_DTYPES or dtype in _UINT_DTYPES: input = randint_strided(0, 100, shape, input_strides, dtype=dtype, device=device) other = randint_strided(0, 100, shape, other_strides, dtype=dtype, device=device) else: @@ -69,11 +76,6 @@ def _add(input, other, out): return out -_UINT_DTYPES = tuple( - d for name in ("uint16", "uint32", "uint64") if (d := getattr(torch, name, None)) is not None -) - - def _torch_add(input, other, out): if input.dtype in _UINT_DTYPES: input = input.to(torch.int64) From 373fa409d48fc7ebdfd4770ccdec70b889957b5c Mon Sep 17 00:00:00 2001 From: zhangyue Date: Fri, 13 Mar 2026 05:41:23 +0000 Subject: [PATCH 5/6] feat: add CI configuration and build scripts for Docker images - Introduced a new CI directory containing scripts for building and running Docker images. - Added `build.py` for building images based on configuration changes. - Created `run.py` for executing CI jobs with specified stages. - Included `config.yaml` for centralized configuration of registry, images, and jobs. - Updated `pyproject.toml` to include `pyyaml` in the development dependencies. --- .ci/README.md | 159 ++++++++++++++++++++++++++ .ci/build.py | 210 +++++++++++++++++++++++++++++++++++ .ci/config.yaml | 36 ++++++ .ci/images/ascend/Dockerfile | 31 ++++++ .ci/images/nvidia/Dockerfile | 26 +++++ .ci/run.py | 195 ++++++++++++++++++++++++++++++++ pyproject.toml | 2 +- 7 files changed, 658 insertions(+), 1 deletion(-) create mode 100644 .ci/README.md create mode 100644 .ci/build.py create mode 100644 .ci/config.yaml create mode 100644 .ci/images/ascend/Dockerfile create mode 100644 .ci/images/nvidia/Dockerfile create mode 100644 .ci/run.py diff --git a/.ci/README.md b/.ci/README.md new file mode 100644 index 0000000..afc6f53 --- /dev/null +++ b/.ci/README.md @@ -0,0 +1,159 @@ +# .ci — CI 镜像与流水线 + +本目录管理 CI 所用的 Docker 镜像构建与测试流水线执行。 + +## 目录结构 + +``` +.ci/ +├── config.yaml # 统一配置(registry、镜像、job 定义) +├── build.py # 镜像构建脚本 +├── run.py # CI 流水线执行脚本 +├── README.md +└── images/ + ├── nvidia/Dockerfile # NVIDIA 平台镜像 + └── ascend/Dockerfile # 昇腾平台镜像 +``` + +## 前置依赖 + +- Docker +- Python 3.10+ +- pyyaml (`pip install pyyaml`) + +## 配置文件 `config.yaml` + +```yaml +registry: + url: "" # Harbor 地址,本地开发时留空 + project: infiniops + +images: + nvidia: + dockerfile: .ci/images/nvidia/ + build_args: + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 + +jobs: + nvidia_gpu: + image: stable # stable | latest | 具体 commit hash + platform: nvidia + resources: + gpu_id: "0" # GPU 设备 ID,如 "0" "0,2" "all" + memory: 32GB + timeout: 3600 + setup: pip install .[dev] + stages: + - name: test + run: pytest tests/ -v --tb=short +``` + +- **`registry.url`** 为空时镜像仅保存在本地,tag 格式为 `-ci/:`。 +- **`images..build_args`** 会作为 `--build-arg` 传入 `docker build`。 +- **`jobs..image`** 支持 `stable`、`latest` 或具体 commit hash。 +- **`resources.gpu_id`** 指定 GPU 设备 ID,支持 `"0"`、`"0,2"`、`"all"` 等格式,映射为 `docker run --gpus "device=..."`。也可保留 `gpu_count` 按数量分配。 + +## 镜像构建 `build.py` + +```bash +python .ci/build.py [options] +``` + +| 参数 | 默认值 | 说明 | +|---|---|---| +| `--platform` | `all` | 构建平台:`nvidia`、`ascend` 或 `all` | +| `--commit` | `HEAD` | 用于镜像 tag 的 git ref | +| `--push` | — | 构建后推送到 registry | +| `--force` | — | 跳过变更检测,强制构建 | +| `--dry-run` | — | 仅打印命令,不执行 | +| `--config` | `.ci/config.yaml` | 配置文件路径 | + +### 示例 + +```bash +# 构建 nvidia 镜像(自动检测 Dockerfile 变更,无变更则跳过) +python .ci/build.py --platform nvidia + +# 强制构建 +python .ci/build.py --platform nvidia --force + +# 构建全部平台并推送到 registry +python .ci/build.py --push --force + +# 预览实际执行的 docker 命令 +python .ci/build.py --platform nvidia --force --dry-run +``` + +### 构建流程 + +1. 通过 `git diff HEAD~1` 检测 Dockerfile 目录是否有变更(`--force` 跳过此步) +2. `docker build` 构建镜像,同时打 `` 和 `latest` 两个 tag +3. 自动透传宿主机的 `http_proxy`/`https_proxy`/`no_proxy` 到构建容器 +4. 若指定 `--push`,将两个 tag 推送到 registry + +### 产物 + +| Tag | 说明 | +|---|---| +| `infiniops-ci/:` | 精确追溯到某次构建 | +| `infiniops-ci/:latest` | 最近一次构建 | + +## 流水线执行 `run.py` + +```bash +python .ci/run.py [options] +``` + +| 参数 | 默认值 | 说明 | +|---|---|---| +| `--job` | 配置中第一个 job | 要执行的 job 名称 | +| `--branch` | `config.yaml` 中的 `repo.branch` | 覆盖克隆分支 | +| `--stage` | 全部 | 仅运行指定 stage | +| `--image-tag` | job 中的 `image` 字段 | 覆盖镜像版本 | +| `--gpu-id` | config 中的 `gpu_id` | GPU 设备 ID,如 `0`、`0,2`、`all` | +| `--dry-run` | — | 仅打印 docker 命令,不执行 | +| `--config` | `.ci/config.yaml` | 配置文件路径 | + +### 示例 + +```bash +# 运行默认 job +python .ci/run.py + +# 指定分支和镜像版本 +python .ci/run.py --branch feature-xxx --image-tag latest + +# 只用 GPU 0 运行 +python .ci/run.py --gpu-id 0 + +# 用 GPU 0 和 2 运行 +python .ci/run.py --gpu-id 0,2 + +# 使用全部 GPU +python .ci/run.py --gpu-id all + +# 只跑 test stage +python .ci/run.py --stage test + +# 预览 docker 命令 +python .ci/run.py --dry-run +``` + +### 执行流程 + +1. 解析 job 配置,拉取对应镜像 +2. `docker run` 启动容器(自动挂载 GPU、限制内存) +3. 容器内 `git clone` → `checkout` → 执行 `setup` 命令 +4. 依次执行各 stage,汇总结果 + +## 代理配置 + +如果网络环境需要代理,在宿主机设置环境变量后即可: + +```bash +export http_proxy=http://localhost:9991 +export https_proxy=http://localhost:9991 +``` + +- **`build.py`** 会自动透传代理到 `docker build`(通过 `--build-arg` + `--network host`)。 +- **`run.py`** 使用 `--network host`,容器内可直接访问宿主机代理。 diff --git a/.ci/build.py b/.ci/build.py new file mode 100644 index 0000000..489ebf0 --- /dev/null +++ b/.ci/build.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python3 +"""CI image builder: detect changes, build, tag, and optionally push Docker images.""" + +import argparse +import json +import os +import subprocess +import sys +from pathlib import Path + +try: + import yaml +except ImportError: + print( + "error: pyyaml is required. Install with: pip install pyyaml", file=sys.stderr + ) + sys.exit(1) + + +def load_config(path): + with open(path, encoding="utf-8") as f: + return yaml.safe_load(f) + + +def get_git_commit(ref="HEAD"): + result = subprocess.run( + ["git", "rev-parse", "--short", ref], + capture_output=True, + text=True, + ) + if result.returncode != 0: + print(f"error: failed to get commit hash for `{ref}`", file=sys.stderr) + sys.exit(1) + + return result.stdout.strip() + + +def has_dockerfile_changed(dockerfile_dir, base_ref="HEAD~1"): + """Check if any file under `dockerfile_dir` changed since `base_ref`.""" + result = subprocess.run( + ["git", "diff", "--name-only", base_ref, "--", dockerfile_dir], + capture_output=True, + text=True, + ) + + return bool(result.stdout.strip()) + + +def build_image_tag(registry_url, project, platform, tag): + if registry_url: + return f"{registry_url}/{project}/{platform}:{tag}" + + return f"{project}-ci/{platform}:{tag}" + + +def build_image(platform, platform_cfg, registry_cfg, commit, push, dry_run): + """Build a single platform image. Returns True on success.""" + registry_url = registry_cfg.get("url", "") + project = registry_cfg.get("project", "infiniops") + dockerfile_dir = platform_cfg["dockerfile"] + + commit_tag = build_image_tag(registry_url, project, platform, commit) + latest_tag = build_image_tag(registry_url, project, platform, "latest") + + build_args_cfg = platform_cfg.get("build_args", {}) + build_cmd = ["docker", "build", "--network", "host"] + for key, value in build_args_cfg.items(): + build_cmd.extend(["--build-arg", f"{key}={value}"]) + + for proxy_var in ("http_proxy", "https_proxy", "no_proxy"): + proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.upper()) + if proxy_val: + build_cmd.extend(["--build-arg", f"{proxy_var}={proxy_val}"]) + + private_sdk = platform_cfg.get("private_sdk", {}) + if private_sdk: + sdk_url = private_sdk.get("source", "") + if sdk_url.startswith("${") and sdk_url.endswith("}"): + env_var = sdk_url[2:-1] + sdk_url = os.environ.get(env_var, "") + if sdk_url: + build_cmd.extend(["--build-arg", f"PRIVATE_SDK_URL={sdk_url}"]) + + build_cmd.extend(["-t", commit_tag, "-t", latest_tag, dockerfile_dir]) + + if dry_run: + print(f"[dry-run] {' '.join(build_cmd)}") + if push: + print(f"[dry-run] docker push {commit_tag}") + print(f"[dry-run] docker push {latest_tag}") + + return True + + print(f"==> building {platform}: {commit_tag}", file=sys.stderr) + result = subprocess.run(build_cmd) + if result.returncode != 0: + error = { + "stage": "build", + "platform": platform, + "tag": commit_tag, + "exit_code": result.returncode, + } + print(json.dumps(error), file=sys.stderr) + + return False + + if push: + for tag in (commit_tag, latest_tag): + print(f"==> pushing {tag}", file=sys.stderr) + push_result = subprocess.run(["docker", "push", tag]) + if push_result.returncode != 0: + error = { + "stage": "push", + "platform": platform, + "tag": tag, + "exit_code": push_result.returncode, + } + print(json.dumps(error), file=sys.stderr) + + return False + + return True + + +def main(): + parser = argparse.ArgumentParser(description="Build CI Docker images") + parser.add_argument( + "--platform", + type=str, + default="all", + help="Platform to build: nvidia, ascend, or all (default: all)", + ) + parser.add_argument( + "--config", + type=Path, + default=Path(__file__).resolve().parent / "config.yaml", + help="Path to config.yaml", + ) + parser.add_argument( + "--commit", + type=str, + default="HEAD", + help="Git ref for tagging the image (default: HEAD)", + ) + parser.add_argument( + "--push", + action="store_true", + help="Push images to registry after building", + ) + parser.add_argument( + "--force", + action="store_true", + help="Skip change detection and force build", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print commands without executing", + ) + args = parser.parse_args() + + config = load_config(args.config) + registry_cfg = config.get("registry", {}) + images_cfg = config.get("images", {}) + + if not images_cfg: + print("error: no `images` section in config", file=sys.stderr) + sys.exit(1) + + if args.platform == "all": + platforms = list(images_cfg.keys()) + else: + if args.platform not in images_cfg: + print( + f"error: platform `{args.platform}` not found in config", + file=sys.stderr, + ) + sys.exit(1) + platforms = [args.platform] + + commit = get_git_commit(args.commit) + failed = False + + for platform in platforms: + platform_cfg = images_cfg[platform] + dockerfile_dir = platform_cfg["dockerfile"] + + if not Path(dockerfile_dir).is_dir(): + print( + f"warning: dockerfile directory `{dockerfile_dir}` does not exist, skipping {platform}", + file=sys.stderr, + ) + continue + + if not args.force and not has_dockerfile_changed(dockerfile_dir): + print(f"==> {platform}: no changes detected, skipping", file=sys.stderr) + continue + + ok = build_image( + platform, platform_cfg, registry_cfg, commit, args.push, args.dry_run + ) + if not ok: + failed = True + + if failed: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.ci/config.yaml b/.ci/config.yaml new file mode 100644 index 0000000..84cddf4 --- /dev/null +++ b/.ci/config.yaml @@ -0,0 +1,36 @@ +repo: + url: https://github.com/InfiniTensor/InfiniOps.git + branch: master + +registry: + url: "" # TODO: Harbor not ready yet + project: infiniops + credentials_env: REGISTRY_TOKEN + +images: + nvidia: + dockerfile: .ci/images/nvidia/ + build_args: + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 + ascend: # TODO: Ascend image is not ready yet + dockerfile: .ci/images/ascend/ + build_args: + BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 + private_sdk: + source: "${PRIVATE_SDK_URL}" + +jobs: + nvidia_gpu: + image: stable + platform: nvidia + resources: + gpu_id: "0" # 指定 GPU ID,如 "0" "0,2" "all" + gpu_type: A100 + memory: 32GB + timeout: 3600 + + setup: pip install .[dev] + + stages: + - name: test + run: pytest tests/ -v --tb=short --junitxml=/workspace/test-results.xml diff --git a/.ci/images/ascend/Dockerfile b/.ci/images/ascend/Dockerfile new file mode 100644 index 0000000..87f7c91 --- /dev/null +++ b/.ci/images/ascend/Dockerfile @@ -0,0 +1,31 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + cmake \ + ninja-build \ + curl \ + libclang-dev \ + && rm -rf /var/lib/apt/lists/* + +ARG PRIVATE_SDK_URL +RUN if [ -n "$PRIVATE_SDK_URL" ]; then \ + curl -fSL "$PRIVATE_SDK_URL" -o /tmp/sdk.run && \ + chmod +x /tmp/sdk.run && /tmp/sdk.run --quiet && \ + rm /tmp/sdk.run; \ + fi + +RUN pip install --no-cache-dir \ + scikit-build-core \ + pybind11 \ + libclang \ + pytest \ + pytest-cov \ + pytest-xdist \ + pyyaml + +WORKDIR /workspace diff --git a/.ci/images/nvidia/Dockerfile b/.ci/images/nvidia/Dockerfile new file mode 100644 index 0000000..d89ea91 --- /dev/null +++ b/.ci/images/nvidia/Dockerfile @@ -0,0 +1,26 @@ +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +ENV DEBIAN_FRONTEND=noninteractive + +ARG http_proxy +ARG https_proxy + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + git \ + cmake \ + ninja-build \ + libclang-dev \ + && rm -rf /var/lib/apt/lists/* + +RUN pip install --no-cache-dir \ + scikit-build-core \ + pybind11 \ + libclang \ + pytest \ + pytest-cov \ + pytest-xdist \ + pyyaml + +WORKDIR /workspace diff --git a/.ci/run.py b/.ci/run.py new file mode 100644 index 0000000..7bc8249 --- /dev/null +++ b/.ci/run.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +"""Standalone Docker CI runner: clone repo, setup, run stages. Output to stdout.""" + +import argparse +import subprocess +import sys +from pathlib import Path + +try: + import yaml +except ImportError: + print( + "error: pyyaml is required. Install with: pip install pyyaml", file=sys.stderr + ) + sys.exit(1) + + +def load_config(path): + with open(path, encoding="utf-8") as f: + return yaml.safe_load(f) + + +def resolve_image(config, platform, image_tag): + """Resolve an image reference ('stable', 'latest', or commit hash) to a full URL.""" + registry = config.get("registry", {}) + registry_url = registry.get("url", "") + project = registry.get("project", "infiniops") + + if not registry_url: + return f"{project}-ci/{platform}:{image_tag}" + + return f"{registry_url}/{project}/{platform}:{image_tag}" + + +def build_runner_script(): + return r""" +export https_proxy=http://localhost:9991 +set -e +cd /workspace +git clone "$REPO_URL" repo +cd repo +git checkout "$BRANCH" +echo "========== Setup ==========" +eval "$SETUP_CMD" +set +e +failed=0 +for i in $(seq 1 "$NUM_STAGES"); do + name_var="STAGE_${i}_NAME" + cmd_var="STAGE_${i}_CMD" + name="${!name_var}" + cmd="${!cmd_var}" + echo "========== Stage: $name ==========" + eval "$cmd" || failed=1 +done +echo "========== Summary ==========" +exit $failed +""" + + +def build_docker_args( + config, job_name, repo_url, branch, stages, workdir, image_tag_override, + gpu_id_override=None, +): + job = config["jobs"][job_name] + platform = job.get("platform", "nvidia") + image_tag = image_tag_override or job.get("image", "stable") + image = resolve_image(config, platform, image_tag) + resources = job.get("resources", {}) + setup_cmd = job.get("setup", "pip install .[dev]") + + args = [ + "docker", + "run", + "--rm", + "--network", + "host", + "-i", + "-w", + workdir, + "-e", + f"REPO_URL={repo_url}", + "-e", + f"BRANCH={branch}", + "-e", + f"SETUP_CMD={setup_cmd}", + "-e", + f"NUM_STAGES={len(stages)}", + ] + for i, s in enumerate(stages): + args.append("-e") + args.append(f"STAGE_{i + 1}_NAME={s['name']}") + args.append("-e") + args.append(f"STAGE_{i + 1}_CMD={s['run']}") + + gpu_id = gpu_id_override or str(resources.get("gpu_id", "")) + gpu_count = resources.get("gpu_count", 0) + if gpu_id: + if gpu_id == "all": + args.extend(["--gpus", "all"]) + else: + args.extend(["--gpus", f'"device={gpu_id}"']) + elif gpu_count and gpu_count > 0: + args.extend(["--gpus", f"count={gpu_count}"]) + + memory = resources.get("memory") + if memory: + mem = str(memory).upper().replace("GB", "g").replace("MB", "m") + if not mem.endswith("g") and not mem.endswith("m"): + mem = f"{mem}g" + args.extend(["--memory", mem]) + + timeout_sec = resources.get("timeout") + if timeout_sec: + args.extend(["--stop-timeout", str(timeout_sec)]) + + args.append(image) + args.append("bash") + args.append("-c") + args.append(build_runner_script().strip()) + + return args + + +def main(): + parser = argparse.ArgumentParser(description="Run Docker CI pipeline") + parser.add_argument( + "--config", + type=Path, + default=Path(__file__).resolve().parent / "config.yaml", + help="Path to config.yaml", + ) + parser.add_argument("--branch", type=str, help="Override repo branch") + parser.add_argument("--job", type=str, help="Job name to run (default: first job)") + parser.add_argument( + "--stage", + type=str, + help="Run only this stage name (still runs setup first)", + ) + parser.add_argument( + "--image-tag", + type=str, + help="Override image tag (stable, latest, or commit hash)", + ) + parser.add_argument( + "--gpu-id", + type=str, + help='GPU device IDs to use, e.g. "0", "0,2", "all"', + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print docker command and exit", + ) + args = parser.parse_args() + + config = load_config(args.config) + repo = config.get("repo", {}) + repo_url = repo.get("url", "https://github.com/InfiniTensor/InfiniOps.git") + branch = args.branch or repo.get("branch", "dev-infra") + + jobs = config.get("jobs", {}) + if not jobs: + print("error: no jobs in config", file=sys.stderr) + sys.exit(1) + job_name = args.job or next(iter(jobs)) + if job_name not in jobs: + print(f"error: job {job_name!r} not in config", file=sys.stderr) + sys.exit(1) + + job = jobs[job_name] + all_stages = job.get("stages", []) + if args.stage: + stages = [s for s in all_stages if s["name"] == args.stage] + if not stages: + print(f"error: stage {args.stage!r} not found", file=sys.stderr) + sys.exit(1) + else: + stages = all_stages + + workdir = "/workspace" + docker_args = build_docker_args( + config, job_name, repo_url, branch, stages, workdir, args.image_tag, + gpu_id_override=args.gpu_id, + ) + + if args.dry_run: + print(" ".join(docker_args)) + + return + + sys.exit(subprocess.run(docker_args).returncode) + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 765b90a..3dbc186 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "InfiniOps" version = "0.1.0" [project.optional-dependencies] -dev = ["pytest", "pytest-cov", "pytest-xdist", "ruff", "torch"] +dev = ["pytest", "pytest-cov", "pytest-xdist", "ruff", "torch", "pyyaml"] [tool.scikit-build.wheel] install-dir = "infini" From d884069ce662063c81ad6ba16ebda3f31286788d Mon Sep 17 00:00:00 2001 From: zhangyue Date: Fri, 13 Mar 2026 05:51:20 +0000 Subject: [PATCH 6/6] fix: update GPU configuration keys in CI files - Changed `gpu_id` to `gpu_ids` in `config.yaml` and related documentation for consistency. - Updated `run.py` to reflect the new key for GPU ID retrieval. - Enhanced README to clarify the usage of the updated GPU configuration. --- .ci/README.md | 24 ++++++++++++++++++------ .ci/config.yaml | 2 +- .ci/run.py | 2 +- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/.ci/README.md b/.ci/README.md index afc6f53..59ee101 100644 --- a/.ci/README.md +++ b/.ci/README.md @@ -24,34 +24,46 @@ ## 配置文件 `config.yaml` ```yaml +repo: + url: https://github.com/InfiniTensor/InfiniOps.git + branch: master + registry: - url: "" # Harbor 地址,本地开发时留空 + url: "" # Harbor 地址,本地开发时留空 project: infiniops + credentials_env: REGISTRY_TOKEN images: nvidia: dockerfile: .ci/images/nvidia/ build_args: BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 + ascend: + dockerfile: .ci/images/ascend/ + build_args: + BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 + private_sdk: + source: "${PRIVATE_SDK_URL}" jobs: nvidia_gpu: - image: stable # stable | latest | 具体 commit hash + image: stable # stable | latest | 具体 commit hash platform: nvidia resources: - gpu_id: "0" # GPU 设备 ID,如 "0" "0,2" "all" + gpu_ids: "0" # GPU 设备 ID,如 "0" "0,2" "all" + gpu_type: A100 memory: 32GB timeout: 3600 setup: pip install .[dev] stages: - name: test - run: pytest tests/ -v --tb=short + run: pytest tests/ -v --tb=short --junitxml=/workspace/test-results.xml ``` - **`registry.url`** 为空时镜像仅保存在本地,tag 格式为 `-ci/:`。 - **`images..build_args`** 会作为 `--build-arg` 传入 `docker build`。 - **`jobs..image`** 支持 `stable`、`latest` 或具体 commit hash。 -- **`resources.gpu_id`** 指定 GPU 设备 ID,支持 `"0"`、`"0,2"`、`"all"` 等格式,映射为 `docker run --gpus "device=..."`。也可保留 `gpu_count` 按数量分配。 +- **`resources.gpu_ids`** 指定 GPU 设备 ID,支持 `"0"`、`"0,2"`、`"all"` 等格式,映射为 `docker run --gpus "device=..."`。也可保留 `gpu_count` 按数量分配。 ## 镜像构建 `build.py` @@ -110,7 +122,7 @@ python .ci/run.py [options] | `--branch` | `config.yaml` 中的 `repo.branch` | 覆盖克隆分支 | | `--stage` | 全部 | 仅运行指定 stage | | `--image-tag` | job 中的 `image` 字段 | 覆盖镜像版本 | -| `--gpu-id` | config 中的 `gpu_id` | GPU 设备 ID,如 `0`、`0,2`、`all` | +| `--gpu-id` | config 中的 `gpu_ids` | GPU 设备 ID,如 `0`、`0,2`、`all` | | `--dry-run` | — | 仅打印 docker 命令,不执行 | | `--config` | `.ci/config.yaml` | 配置文件路径 | diff --git a/.ci/config.yaml b/.ci/config.yaml index 84cddf4..fea3f7c 100644 --- a/.ci/config.yaml +++ b/.ci/config.yaml @@ -24,7 +24,7 @@ jobs: image: stable platform: nvidia resources: - gpu_id: "0" # 指定 GPU ID,如 "0" "0,2" "all" + gpu_ids: "0" # 指定 GPU ID,如 "0" "0,2" "all" gpu_type: A100 memory: 32GB timeout: 3600 diff --git a/.ci/run.py b/.ci/run.py index 7bc8249..0421a56 100644 --- a/.ci/run.py +++ b/.ci/run.py @@ -92,7 +92,7 @@ def build_docker_args( args.append("-e") args.append(f"STAGE_{i + 1}_CMD={s['run']}") - gpu_id = gpu_id_override or str(resources.get("gpu_id", "")) + gpu_id = gpu_id_override or str(resources.get("gpu_ids", "")) gpu_count = resources.get("gpu_count", 0) if gpu_id: if gpu_id == "all":