diff --git a/.ci/README.md b/.ci/README.md new file mode 100644 index 0000000..59ee101 --- /dev/null +++ b/.ci/README.md @@ -0,0 +1,171 @@ +# .ci — CI 镜像与流水线 + +本目录管理 CI 所用的 Docker 镜像构建与测试流水线执行。 + +## 目录结构 + +``` +.ci/ +├── config.yaml # 统一配置(registry、镜像、job 定义) +├── build.py # 镜像构建脚本 +├── run.py # CI 流水线执行脚本 +├── README.md +└── images/ + ├── nvidia/Dockerfile # NVIDIA 平台镜像 + └── ascend/Dockerfile # 昇腾平台镜像 +``` + +## 前置依赖 + +- Docker +- Python 3.10+ +- pyyaml (`pip install pyyaml`) + +## 配置文件 `config.yaml` + +```yaml +repo: + url: https://github.com/InfiniTensor/InfiniOps.git + branch: master + +registry: + url: "" # Harbor 地址,本地开发时留空 + project: infiniops + credentials_env: REGISTRY_TOKEN + +images: + nvidia: + dockerfile: .ci/images/nvidia/ + build_args: + BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 + ascend: + dockerfile: .ci/images/ascend/ + build_args: + BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0 + private_sdk: + source: "${PRIVATE_SDK_URL}" + +jobs: + nvidia_gpu: + image: stable # stable | latest | 具体 commit hash + platform: nvidia + resources: + gpu_ids: "0" # GPU 设备 ID,如 "0" "0,2" "all" + gpu_type: A100 + memory: 32GB + timeout: 3600 + setup: pip install .[dev] + stages: + - name: test + run: pytest tests/ -v --tb=short --junitxml=/workspace/test-results.xml +``` + +- **`registry.url`** 为空时镜像仅保存在本地,tag 格式为 `{project}-ci/{platform}:{tag}`。 +- **`images.{platform}.build_args`** 会作为 `--build-arg` 传入 `docker build`。 +- **`jobs.{job}.image`** 支持 `stable`、`latest` 或具体 commit hash。 +- **`resources.gpu_ids`** 指定 GPU 设备 ID,支持 `"0"`、`"0,2"`、`"all"` 等格式,映射为 `docker run --gpus "device=..."`。也可保留 `gpu_count` 按数量分配。 + +## 镜像构建 `build.py` + +```bash +python .ci/build.py [options] +``` + +| 参数 | 默认值 | 说明 | +|---|---|---| +| `--platform` | `all` | 构建平台:`nvidia`、`ascend` 或 `all` | +| `--commit` | `HEAD` | 用于镜像 tag 的 git ref | +| `--push` | — | 构建后推送到 registry | +| `--force` | — | 跳过变更检测,强制构建 | +| `--dry-run` | — | 仅打印命令,不执行 | +| `--config` | `.ci/config.yaml` | 配置文件路径 | + +### 示例 + 
+```bash +# 构建 nvidia 镜像(自动检测 Dockerfile 变更,无变更则跳过) +python .ci/build.py --platform nvidia + +# 强制构建 +python .ci/build.py --platform nvidia --force + +# 构建全部平台并推送到 registry +python .ci/build.py --push --force + +# 预览实际执行的 docker 命令 +python .ci/build.py --platform nvidia --force --dry-run +``` + +### 构建流程 + +1. 通过 `git diff HEAD~1` 检测 Dockerfile 目录是否有变更(`--force` 跳过此步) +2. `docker build` 构建镜像,同时打 `{commit}`(短 commit hash)和 `latest` 两个 tag +3. 自动透传宿主机的 `http_proxy`/`https_proxy`/`no_proxy` 到构建容器 +4. 若指定 `--push`,将两个 tag 推送到 registry + +### 产物 + +| Tag | 说明 | +|---|---| +| `infiniops-ci/{platform}:{commit}` | 精确追溯到某次构建 | +| `infiniops-ci/{platform}:latest` | 最近一次构建 | + +## 流水线执行 `run.py` + +```bash +python .ci/run.py [options] +``` + +| 参数 | 默认值 | 说明 | +|---|---|---| +| `--job` | 配置中第一个 job | 要执行的 job 名称 | +| `--branch` | `config.yaml` 中的 `repo.branch` | 覆盖克隆分支 | +| `--stage` | 全部 | 仅运行指定 stage | +| `--image-tag` | job 中的 `image` 字段 | 覆盖镜像版本 | +| `--gpu-id` | config 中的 `gpu_ids` | GPU 设备 ID,如 `0`、`0,2`、`all` | +| `--dry-run` | — | 仅打印 docker 命令,不执行 | +| `--config` | `.ci/config.yaml` | 配置文件路径 | + +### 示例 + +```bash +# 运行默认 job +python .ci/run.py + +# 指定分支和镜像版本 +python .ci/run.py --branch feature-xxx --image-tag latest + +# 只用 GPU 0 运行 +python .ci/run.py --gpu-id 0 + +# 用 GPU 0 和 2 运行 +python .ci/run.py --gpu-id 0,2 + +# 使用全部 GPU +python .ci/run.py --gpu-id all + +# 只跑 test stage +python .ci/run.py --stage test + +# 预览 docker 命令 +python .ci/run.py --dry-run +``` + +### 执行流程 + +1. 解析 job 配置,拉取对应镜像 +2. `docker run` 启动容器(自动挂载 GPU、限制内存) +3. 容器内 `git clone` → `checkout` → 执行 `setup` 命令 +4. 
def has_dockerfile_changed(dockerfile_dir, base_ref="HEAD~1"):
    """Report whether any file under `dockerfile_dir` differs from `base_ref`.

    Runs ``git diff --name-only <base_ref> -- <dockerfile_dir>`` and treats any
    listed path as a change.

    Args:
        dockerfile_dir: Directory to inspect, as passed to ``git diff``.
        base_ref: Git ref to compare against (default: ``HEAD~1``).

    Returns:
        True when git lists at least one changed path, or when the comparison
        could not be performed at all; False only when git ran successfully
        and listed nothing.
    """
    diff_cmd = ["git", "diff", "--name-only", base_ref, "--", dockerfile_dir]
    try:
        result = subprocess.run(diff_cmd, capture_output=True, text=True)
    except OSError as exc:
        # git could not be launched: we cannot prove "unchanged", so rebuild.
        print(
            f"warning: cannot run git to detect changes ({exc}); assuming changed",
            file=sys.stderr,
        )
        return True

    if result.returncode != 0:
        # A failed diff (e.g. `base_ref` does not resolve in a shallow clone or
        # on the first commit) prints nothing on stdout. Treating that as
        # "no change" would silently skip the build, so assume changed instead.
        print(
            f"warning: `git diff` against `{base_ref}` failed "
            f"(exit {result.returncode}); assuming `{dockerfile_dir}` changed",
            file=sys.stderr,
        )
        return True

    return bool(result.stdout.strip())


def build_image_tag(registry_url, project, platform, tag):
    """Return the full image reference for `platform` at `tag`.

    With a registry URL the form is ``<registry>/<project>/<platform>:<tag>``;
    without one the image is local-only and named
    ``<project>-ci/<platform>:<tag>``.
    """
    if registry_url:
        return f"{registry_url}/{project}/{platform}:{tag}"

    return f"{project}-ci/{platform}:{tag}"


def build_image(platform, platform_cfg, registry_cfg, commit, push, dry_run):
    """Build a single platform image and optionally push it.

    The image is tagged twice: once with `commit` and once with ``latest``.

    Args:
        platform: Platform name used in the image reference.
        platform_cfg: Mapping with ``dockerfile`` (build context directory) and
            optional ``build_args`` / ``private_sdk`` entries.
        registry_cfg: Mapping with optional ``url`` and ``project`` keys.
        commit: Tag to apply next to ``latest``.
        push: Push both tags after a successful build.
        dry_run: Print the commands instead of running them.

    Returns:
        True on success (always True for a dry run); False when
        ``docker build`` or any ``docker push`` exits non-zero. Failures are
        reported on stderr as a JSON object.
    """
    # Local import: only the dry-run rendering needs it and the file does not
    # import shlex at module level.
    import shlex

    registry_url = registry_cfg.get("url", "")
    project = registry_cfg.get("project", "infiniops")
    dockerfile_dir = platform_cfg["dockerfile"]

    commit_tag = build_image_tag(registry_url, project, platform, commit)
    latest_tag = build_image_tag(registry_url, project, platform, "latest")

    build_cmd = ["docker", "build", "--network", "host"]

    # An empty `build_args:` key loads from YAML as None, hence `or {}`.
    for key, value in (platform_cfg.get("build_args") or {}).items():
        build_cmd.extend(["--build-arg", f"{key}={value}"])

    # Forward the host proxy settings; the lower-case name wins over upper-case.
    for proxy_var in ("http_proxy", "https_proxy", "no_proxy"):
        proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.upper())
        if proxy_val:
            build_cmd.extend(["--build-arg", f"{proxy_var}={proxy_val}"])

    # `private_sdk.source` may be a literal URL or a `${ENV_VAR}` reference
    # that is resolved from the host environment.
    private_sdk = platform_cfg.get("private_sdk") or {}
    sdk_url = private_sdk.get("source") or ""
    if sdk_url.startswith("${") and sdk_url.endswith("}"):
        sdk_url = os.environ.get(sdk_url[2:-1], "")
    if sdk_url:
        build_cmd.extend(["--build-arg", f"PRIVATE_SDK_URL={sdk_url}"])

    build_cmd.extend(["-t", commit_tag, "-t", latest_tag, dockerfile_dir])

    if dry_run:
        # shlex.join keeps the printed command copy-pasteable even when a
        # build argument contains spaces or shell metacharacters.
        print(f"[dry-run] {shlex.join(build_cmd)}")
        if push:
            for tag in (commit_tag, latest_tag):
                print(f"[dry-run] {shlex.join(['docker', 'push', tag])}")

        return True

    print(f"==> building {platform}: {commit_tag}", file=sys.stderr)
    result = subprocess.run(build_cmd)
    if result.returncode != 0:
        error = {
            "stage": "build",
            "platform": platform,
            "tag": commit_tag,
            "exit_code": result.returncode,
        }
        print(json.dumps(error), file=sys.stderr)

        return False

    if push:
        for tag in (commit_tag, latest_tag):
            print(f"==> pushing {tag}", file=sys.stderr)
            push_result = subprocess.run(["docker", "push", tag])
            if push_result.returncode != 0:
                error = {
                    "stage": "push",
                    "platform": platform,
                    "tag": tag,
                    "exit_code": push_result.returncode,
                }
                print(json.dumps(error), file=sys.stderr)

                return False

    return True
# Repository that run.py clones inside the CI container.
repo:
  url: https://github.com/InfiniTensor/InfiniOps.git
  branch: master

# Image registry. With an empty `url`, images stay local and are named
# `<project>-ci/<platform>:<tag>`.
registry:
  url: "" # TODO: Harbor not ready yet
  project: infiniops
  # NOTE(review): neither build.py nor run.py reads this key — confirm who
  # consumes it before relying on it.
  credentials_env: REGISTRY_TOKEN

# One entry per platform image. `dockerfile` is the docker build context
# directory; every `build_args` entry is passed as `--build-arg KEY=VALUE`.
images:
  nvidia:
    dockerfile: .ci/images/nvidia/
    build_args:
      BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3
  ascend: # TODO: Ascend image is not ready yet
    dockerfile: .ci/images/ascend/
    build_args:
      BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0
    # A `${VAR}` value is resolved from the host environment by build.py and
    # passed to the build as the PRIVATE_SDK_URL build argument.
    private_sdk:
      source: "${PRIVATE_SDK_URL}"

jobs:
  nvidia_gpu:
    # Image tag to run. NOTE(review): build.py only tags `<commit>` and
    # `latest`; confirm where a `stable` tag is published.
    image: stable
    platform: nvidia
    resources:
      gpu_ids: "0" # GPU IDs to expose, e.g. "0", "0,2", "all"
      # NOTE(review): not read by run.py — presumably informational; confirm.
      gpu_type: A100
      memory: 32GB
      # Passed to `docker run --stop-timeout` by run.py.
      timeout: 3600

    # Command run once inside the cloned repository before any stage.
    setup: pip install .[dev]

    # Stages run in order; a failing stage does not stop the following ones.
    stages:
      - name: test
        run: pytest tests/ -v --tb=short --junitxml=/workspace/test-results.xml
def resolve_image(config, platform, image_tag):
    """Resolve an image tag ('stable', 'latest', or commit hash) to a full reference.

    Without a configured registry URL the local form
    ``<project>-ci/<platform>:<tag>`` is returned; otherwise
    ``<registry>/<project>/<platform>:<tag>``.
    """
    registry = config.get("registry", {})
    registry_url = registry.get("url", "")
    project = registry.get("project", "infiniops")

    if not registry_url:
        return f"{project}-ci/{platform}:{image_tag}"

    return f"{registry_url}/{project}/{platform}:{image_tag}"


def build_runner_script():
    """Return the bash script executed inside the CI container.

    The script clones ``$REPO_URL`` into ``/workspace/repo``, checks out
    ``$BRANCH``, runs ``$SETUP_CMD`` (aborting on failure), then runs stages
    ``1..$NUM_STAGES`` read from ``STAGE_<i>_NAME`` / ``STAGE_<i>_CMD``. A
    failing stage does not stop later stages; the exit status is 1 if any
    stage failed, else 0.

    No proxy is configured here: proxy variables come from the container
    environment, which `build_docker_args` populates from the host.
    """
    return r"""
set -e
cd /workspace
git clone "$REPO_URL" repo
cd repo
git checkout "$BRANCH"
echo "========== Setup =========="
eval "$SETUP_CMD"
set +e
failed=0
for i in $(seq 1 "$NUM_STAGES"); do
    name_var="STAGE_${i}_NAME"
    cmd_var="STAGE_${i}_CMD"
    name="${!name_var}"
    cmd="${!cmd_var}"
    echo "========== Stage: $name =========="
    eval "$cmd" || failed=1
done
echo "========== Summary =========="
exit $failed
"""


# Proxy variables forwarded from the host into the CI container.
_PROXY_ENV_VARS = (
    "http_proxy",
    "https_proxy",
    "no_proxy",
    "HTTP_PROXY",
    "HTTPS_PROXY",
    "NO_PROXY",
)

# Accepted memory suffixes mapped to the unit letter passed to `--memory`.
# Longer suffixes come first so "gb" is not mistaken for a bare "b".
_MEMORY_SUFFIXES = (
    ("gib", "g"),
    ("mib", "m"),
    ("kib", "k"),
    ("gb", "g"),
    ("mb", "m"),
    ("kb", "k"),
    ("g", "g"),
    ("m", "m"),
    ("k", "k"),
    ("b", "b"),
)


def _normalize_memory(memory):
    """Convert a config memory value to the form passed to ``docker run --memory``.

    Accepts e.g. ``"32GB"``, ``"512MB"``, ``"32g"``, ``"32 GB"`` or a bare
    number; a value without a unit is interpreted as gigabytes.
    """
    text = str(memory).strip().lower()
    for suffix, unit in _MEMORY_SUFFIXES:
        if text.endswith(suffix):
            return f"{text[: -len(suffix)].strip()}{unit}"

    return f"{text}g"


def build_docker_args(
    config, job_name, repo_url, branch, stages, workdir, image_tag_override,
    gpu_id_override=None,
):
    """Build the ``docker run`` argument list for one CI job.

    Args:
        config: Parsed config mapping; ``config["jobs"][job_name]`` is used.
        job_name: Key of the job to run.
        repo_url: Repository URL exported to the container as ``REPO_URL``.
        branch: Branch/ref exported as ``BRANCH``.
        stages: Sequence of mappings with ``name`` and ``run`` keys.
        workdir: Working directory inside the container (``-w``).
        image_tag_override: Image tag replacing the job's ``image`` when truthy.
        gpu_id_override: GPU ID string replacing ``resources.gpu_ids`` when truthy.

    Returns:
        The argv list, ending with ``<image> bash -c <runner script>``.
    """
    job = config["jobs"][job_name]
    platform = job.get("platform", "nvidia")
    image_tag = image_tag_override or job.get("image", "stable")
    image = resolve_image(config, platform, image_tag)
    resources = job.get("resources") or {}
    setup_cmd = job.get("setup", "pip install .[dev]")

    args = [
        "docker",
        "run",
        "--rm",
        "--network",
        "host",
        "-i",
        "-w",
        workdir,
        "-e",
        f"REPO_URL={repo_url}",
        "-e",
        f"BRANCH={branch}",
        "-e",
        f"SETUP_CMD={setup_cmd}",
        "-e",
        f"NUM_STAGES={len(stages)}",
    ]
    for index, stage in enumerate(stages, start=1):
        args.extend(["-e", f"STAGE_{index}_NAME={stage['name']}"])
        args.extend(["-e", f"STAGE_{index}_CMD={stage['run']}"])

    # `-e NAME` without a value forwards the host's variable only when it is
    # set, so the container uses whatever proxy the host has configured (none
    # if none) and proxy URLs never appear in the printed command.
    for proxy_var in _PROXY_ENV_VARS:
        args.extend(["-e", proxy_var])

    # An empty `gpu_ids:` key loads as None; do not turn it into "None".
    configured_gpu_ids = resources.get("gpu_ids")
    gpu_id = gpu_id_override or (
        "" if configured_gpu_ids is None else str(configured_gpu_ids)
    )
    gpu_count = resources.get("gpu_count", 0)
    if gpu_id:
        if gpu_id == "all":
            args.extend(["--gpus", "all"])
        else:
            # The literal double quotes are intentional: docker parses the
            # --gpus value as CSV, so a device list containing commas must
            # arrive quoted.
            args.extend(["--gpus", f'"device={gpu_id}"'])
    elif gpu_count and gpu_count > 0:
        args.extend(["--gpus", f"count={gpu_count}"])

    memory = resources.get("memory")
    if memory:
        args.extend(["--memory", _normalize_memory(memory)])

    timeout_sec = resources.get("timeout")
    if timeout_sec:
        # NOTE(review): --stop-timeout only sets the grace period before the
        # container is killed when it is being stopped; it does not bound how
        # long the job may run. Confirm whether a real job timeout is wanted.
        args.extend(["--stop-timeout", str(timeout_sec)])

    args.extend([image, "bash", "-c", build_runner_script().strip()])

    return args
+ parser.add_argument( + "--image-tag", + type=str, + help="Override image tag (stable, latest, or commit hash)", + ) + parser.add_argument( + "--gpu-id", + type=str, + help='GPU device IDs to use, e.g. "0", "0,2", "all"', + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print docker command and exit", + ) + args = parser.parse_args() + + config = load_config(args.config) + repo = config.get("repo", {}) + repo_url = repo.get("url", "https://github.com/InfiniTensor/InfiniOps.git") + branch = args.branch or repo.get("branch", "dev-infra") + + jobs = config.get("jobs", {}) + if not jobs: + print("error: no jobs in config", file=sys.stderr) + sys.exit(1) + job_name = args.job or next(iter(jobs)) + if job_name not in jobs: + print(f"error: job {job_name!r} not in config", file=sys.stderr) + sys.exit(1) + + job = jobs[job_name] + all_stages = job.get("stages", []) + if args.stage: + stages = [s for s in all_stages if s["name"] == args.stage] + if not stages: + print(f"error: stage {args.stage!r} not found", file=sys.stderr) + sys.exit(1) + else: + stages = all_stages + + workdir = "/workspace" + docker_args = build_docker_args( + config, job_name, repo_url, branch, stages, workdir, args.image_tag, + gpu_id_override=args.gpu_id, + ) + + if args.dry_run: + print(" ".join(docker_args)) + + return + + sys.exit(subprocess.run(docker_args).returncode) + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 765b90a..3dbc186 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "InfiniOps" version = "0.1.0" [project.optional-dependencies] -dev = ["pytest", "pytest-cov", "pytest-xdist", "ruff", "torch"] +dev = ["pytest", "pytest-cov", "pytest-xdist", "ruff", "torch", "pyyaml"] [tool.scikit-build.wheel] install-dir = "infini" diff --git a/tests/test_add.py b/tests/test_add.py index afbce0d..61d6715 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -4,15 +4,39 @@ from tests.utils import Payload, 
# Signed integer dtypes; inputs of these dtypes are generated with randint.
_INT_DTYPES = (torch.int16, torch.int32, torch.int64)

# Unsigned 16/32/64-bit dtypes are looked up by name so that a torch build
# without them is tolerated (missing entries are dropped).
_UINT_DTYPES = tuple(
    dtype
    for dtype in (
        getattr(torch, name, None) for name in ("uint16", "uint32", "uint64")
    )
    if dtype is not None
)


def _dtype_parametrize():
    """Return the ``(dtype, rtol, atol)`` cases for ``test_add``.

    Integer dtypes use exact comparison (zero tolerances). Unsigned dtypes the
    installed torch does not provide are left out, so the result covers every
    dtype in `_INT_DTYPES` and `_UINT_DTYPES` plus the three float dtypes.
    """
    candidates = [
        (torch.float32, 1e-7, 1e-7),
        (torch.float16, 1e-3, 1e-3),
        (torch.bfloat16, 1e-2, 5e-3),
        (torch.int16, 0, 0),
        (getattr(torch, "uint16", None), 0, 0),
        (torch.int32, 0, 0),
        (getattr(torch, "uint32", None), 0, 0),
        (torch.int64, 0, 0),
        (getattr(torch, "uint64", None), 0, 0),
    ]

    return tuple(case for case in candidates if case[0] is not None)
def _torch_rms_norm(input, weight, *, eps=1e-6, out=None):
    """Reference RMS normalization over the last dimension.

    Uses ``torch.nn.functional.rms_norm`` when the installed torch provides
    it; otherwise computes ``input * rsqrt(mean(input^2, dim=-1) + eps) *
    weight`` directly.

    Args:
        input: Tensor to normalize along its last dimension.
        weight: Scale multiplied into the normalized values.
        eps: Value added to the mean square before the reciprocal square root.
        out: Only used by the fallback path, which copies the result into it
            and returns it. NOTE(review): the native path ignores `out`;
            confirm callers rely only on the return value.

    Returns:
        The normalized tensor (or `out` on the fallback path when given).
    """
    native_rms_norm = getattr(torch.nn.functional, "rms_norm", None)
    if native_rms_norm is not None:
        return native_rms_norm(input, input.shape[-1:], weight=weight, eps=eps)

    # Fallback for torch builds without `rms_norm`. Half-precision inputs are
    # accumulated in float32: squaring and averaging in fp16/bf16 loses
    # precision and can overflow, which would make this reference less
    # accurate than the implementation under test.
    compute = input.float() if input.dtype in (torch.float16, torch.bfloat16) else input
    inv_rms = torch.rsqrt(torch.mean(compute * compute, dim=-1, keepdim=True) + eps)
    result = (compute * inv_rms).to(input.dtype) * weight

    if out is not None:
        out.copy_(result)
        return out

    return result