diff --git a/.ci/README.md b/.ci/README.md
new file mode 100644
index 0000000..59ee101
--- /dev/null
+++ b/.ci/README.md
@@ -0,0 +1,171 @@
+# .ci — CI 镜像与流水线
+
+本目录管理 CI 所用的 Docker 镜像构建与测试流水线执行。
+
+## 目录结构
+
+```
+.ci/
+├── config.yaml              # 统一配置（registry、镜像、job 定义）
+├── build.py                 # 镜像构建脚本
+├── run.py                   # CI 流水线执行脚本
+├── README.md
+└── images/
+    ├── nvidia/Dockerfile    # NVIDIA 平台镜像
+    └── ascend/Dockerfile    # 昇腾平台镜像
+```
+
+## 前置依赖
+
+- Docker
+- Python 3.10+
+- pyyaml (`pip install pyyaml`)
+
+## 配置文件 `config.yaml`
+
+```yaml
+repo:
+  url: https://github.com/InfiniTensor/InfiniOps.git
+  branch: master
+
+registry:
+  url: ""                    # Harbor 地址，本地开发时留空
+  project: infiniops
+  credentials_env: REGISTRY_TOKEN
+
+images:
+  nvidia:
+    dockerfile: .ci/images/nvidia/
+    build_args:
+      BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3
+  ascend:
+    dockerfile: .ci/images/ascend/
+    build_args:
+      BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0
+    private_sdk:
+      source: "${PRIVATE_SDK_URL}"
+
+jobs:
+  nvidia_gpu:
+    image: stable            # stable | latest | 具体 commit hash
+    platform: nvidia
+    resources:
+      gpu_ids: "0"           # GPU 设备 ID，如 "0" "0,2" "all"
+      gpu_type: A100
+      memory: 32GB
+      timeout: 3600
+    setup: pip install .[dev]
+    stages:
+      - name: test
+        run: pytest tests/ -v --tb=short --junitxml=/workspace/test-results.xml
+```
+
+- **`registry.url`** 为空时镜像仅保存在本地，tag 格式为 `<project>-ci/<platform>:<tag>`。
+- **`images.<platform>.build_args`** 会作为 `--build-arg` 传入 `docker build`。
+- **`jobs.<name>.image`** 支持 `stable`、`latest` 或具体 commit hash。注意：`build.py` 只会生成 `<commit-hash>` 和 `latest` 两个 tag，`stable` 需要手动打 tag（例如使用 `docker tag`）后才能使用。
+- **`resources.gpu_ids`** 指定 GPU 设备 ID，支持 `"0"`、`"0,2"`、`"all"` 等格式：`"all"` 映射为 `docker run --gpus all`，其余映射为 `docker run --gpus "device=..."`。仅当未设置 `gpu_ids`（且未传入 `--gpu-id`）时，才会使用 `gpu_count` 按数量分配（`--gpus count=N`）。
+
+## 镜像构建 `build.py`
+
+```bash
+python .ci/build.py [options]
+```
+
+| 参数 | 默认值 | 说明 |
+|---|---|---|
+| `--platform` | `all` | 构建平台：`nvidia`、`ascend` 或 `all` |
+| `--commit` | `HEAD` | 用于镜像 tag 的 git ref |
+| `--push` | — | 构建后推送到 registry |
+| `--force` | — | 跳过变更检测，强制构建 |
+| `--dry-run` | — | 仅打印命令，不执行 |
+| `--config` | `.ci/config.yaml` | 配置文件路径 |
+
+### 示例
+
+```bash
+# 构建 nvidia 镜像（自动检测 Dockerfile 变更，无变更则跳过）
+python .ci/build.py --platform nvidia
+
+# 强制构建
+python .ci/build.py --platform nvidia --force
+
+# 构建全部平台并推送到 registry
+python .ci/build.py --push --force
+
+# 预览实际执行的 docker 命令
+python .ci/build.py --platform nvidia --force --dry-run
+```
+
+### 构建流程
+
+1. 通过 `git diff HEAD~1` 检测 Dockerfile 目录是否有变更（`--force` 跳过此步）
+2. `docker build` 构建镜像，同时打 `<commit-hash>` 和 `latest` 两个 tag
+3. 自动透传宿主机的 `http_proxy`/`https_proxy`/`no_proxy` 到构建容器
+4. 若指定 `--push`，将两个 tag 推送到 registry
+
+### 产物
+
+| Tag | 说明 |
+|---|---|
+| `infiniops-ci/<platform>:<commit-hash>` | 精确追溯到某次构建 |
+| `infiniops-ci/<platform>:latest` | 最近一次构建 |
+
+## 流水线执行 `run.py`
+
+```bash
+python .ci/run.py [options]
+```
+
+| 参数 | 默认值 | 说明 |
+|---|---|---|
+| `--job` | 配置中第一个 job | 要执行的 job 名称 |
+| `--branch` | `config.yaml` 中的 `repo.branch` | 覆盖克隆分支 |
+| `--stage` | 全部 | 仅运行指定 stage |
+| `--image-tag` | job 中的 `image` 字段 | 覆盖镜像版本 |
+| `--gpu-id` | config 中的 `gpu_ids` | GPU 设备 ID，如 `0`、`0,2`、`all` |
+| `--dry-run` | — | 仅打印 docker 命令，不执行 |
+| `--config` | `.ci/config.yaml` | 配置文件路径 |
+
+### 示例
+
+```bash
+# 运行默认 job
+python .ci/run.py
+
+# 指定分支和镜像版本
+python .ci/run.py --branch feature-xxx --image-tag latest
+
+# 只用 GPU 0 运行
+python .ci/run.py --gpu-id 0
+
+# 用 GPU 0 和 2 运行
+python .ci/run.py --gpu-id 0,2
+
+# 使用全部 GPU
+python .ci/run.py --gpu-id all
+
+# 只跑 test stage
+python .ci/run.py --stage test
+
+# 预览 docker 命令
+python .ci/run.py --dry-run
+```
+
+### 执行流程
+
+1. 解析 job 配置，拉取对应镜像
+2. `docker run` 启动容器（自动挂载 GPU、限制内存）
+3. 容器内 `git clone` → `checkout` → 执行 `setup` 命令
+4. 依次执行各 stage，汇总结果
+
+## 代理配置
+
+如果网络环境需要代理，在宿主机设置环境变量后即可：
+
+```bash
+export http_proxy=http://localhost:9991
+export https_proxy=http://localhost:9991
+```
+
+- **`build.py`** 会自动透传代理到 `docker build`（通过 `--build-arg` + `--network host`）。
+- **`run.py`** 使用 `--network host`，容器内可直接访问宿主机代理。
diff --git a/.ci/build.py b/.ci/build.py
new file mode 100644
index 0000000..489ebf0
--- /dev/null
+++ b/.ci/build.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python3
+"""CI image builder: detect changes, build, tag, and optionally push Docker images."""
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+try:
+    import yaml
+except ImportError:
+    print(
+        "error: pyyaml is required. Install with: pip install pyyaml", file=sys.stderr
+    )
+    sys.exit(1)
+
+
+def load_config(path):
+    """Parse the YAML file at `path` (read as UTF-8) and return its content."""
+    return yaml.safe_load(Path(path).read_text(encoding="utf-8"))
+
+
+def get_git_commit(ref="HEAD"):
+    """Return the abbreviated commit hash for `ref` via `git rev-parse --short`.
+
+    Prints an error to stderr and exits with status 1 when git fails.
+    """
+    cmd = ["git", "rev-parse", "--short", ref]
+    proc = subprocess.run(cmd, capture_output=True, text=True)
+    if proc.returncode == 0:
+        return proc.stdout.strip()
+    print(f"error: failed to get commit hash for `{ref}`", file=sys.stderr)
+    sys.exit(1)
+
+
+def has_dockerfile_changed(dockerfile_dir, base_ref="HEAD~1"):
+    """Return True if any file under `dockerfile_dir` differs from `base_ref`.
+
+    A failing `git diff` (e.g. `base_ref` missing in a shallow clone or on the
+    first commit) is reported as changed, so the build is not silently skipped.
+    """
+    cmd = ["git", "diff", "--name-only", base_ref, "--", dockerfile_dir]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    return result.returncode != 0 or bool(result.stdout.strip())
+
+
+def build_image_tag(registry_url, project, platform, tag):
+    """Return the image reference for `platform`:`tag`, local if no registry is set."""
+    if not registry_url:
+        return f"{project}-ci/{platform}:{tag}"
+    return f"{registry_url}/{project}/{platform}:{tag}"
+
+
+def build_image(platform, platform_cfg, registry_cfg, commit, push, dry_run):
+    """Build one platform image, tag it, and optionally push it.
+
+    Args:
+        platform: Key of the image under `images` in the config.
+        platform_cfg: Mapping with `dockerfile` (build context directory) and
+            optional `build_args` / `private_sdk` entries.
+        registry_cfg: Mapping providing the optional `url` and `project`.
+        commit: Tag applied to the image next to `latest`.
+        push: Push both tags after a successful build.
+        dry_run: Print the commands instead of running them.
+
+    Returns:
+        True on success (always in dry-run mode), False if build or push fails.
+    """
+    import shlex  # local import: only used to render the dry-run command
+
+    registry_url = registry_cfg.get("url", "")
+    project = registry_cfg.get("project", "infiniops")
+    commit_tag = build_image_tag(registry_url, project, platform, commit)
+    latest_tag = build_image_tag(registry_url, project, platform, "latest")
+    tags = (commit_tag, latest_tag)
+
+    build_cmd = ["docker", "build", "--network", "host"]
+    # `or {}` tolerates a key that is present but left empty in the YAML.
+    for key, value in (platform_cfg.get("build_args") or {}).items():
+        build_cmd.extend(["--build-arg", f"{key}={value}"])
+
+    # Forward the host proxy settings (lower- or upper-case) to the build.
+    for proxy_var in ("http_proxy", "https_proxy", "no_proxy"):
+        proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.upper())
+        if proxy_val:
+            build_cmd.extend(["--build-arg", f"{proxy_var}={proxy_val}"])
+
+    sdk_url = (platform_cfg.get("private_sdk") or {}).get("source") or ""
+    if sdk_url.startswith("${") and sdk_url.endswith("}"):
+        # `${NAME}` means: read the URL from environment variable NAME.
+        sdk_url = os.environ.get(sdk_url[2:-1], "")
+    if sdk_url:
+        build_cmd.extend(["--build-arg", f"PRIVATE_SDK_URL={sdk_url}"])
+
+    build_cmd.extend(["-t", commit_tag, "-t", latest_tag, platform_cfg["dockerfile"]])
+
+    if dry_run:
+        # shlex.join quotes arguments so the printed command can be pasted as-is.
+        print(f"[dry-run] {shlex.join(build_cmd)}")
+        if push:
+            for tag in tags:
+                print(f"[dry-run] docker push {tag}")
+        return True
+
+    print(f"==> building {platform}: {commit_tag}", file=sys.stderr)
+    steps = [("build", commit_tag, build_cmd)]
+    if push:
+        steps += [("push", tag, ["docker", "push", tag]) for tag in tags]
+    for stage, tag, cmd in steps:
+        if stage == "push":
+            print(f"==> pushing {tag}", file=sys.stderr)
+        exit_code = subprocess.run(cmd).returncode
+        if exit_code != 0:
+            # Report the failure as one JSON object on stderr.
+            error = {"stage": stage, "platform": platform, "tag": tag}
+            error["exit_code"] = exit_code
+            print(json.dumps(error), file=sys.stderr)
+            return False
+
+    return True
+
+
+def main():
+    """Command-line entry point: build (and optionally push) the CI images.
+
+    Platforms with a missing Dockerfile directory, or without changes unless
+    `--force` is given, are skipped. Exits with status 1 if the config has no
+    images, the requested platform is unknown, or any build or push fails.
+    """
+    parser = argparse.ArgumentParser(description="Build CI Docker images")
+    parser.add_argument(
+        "--platform",
+        type=str,
+        default="all",
+        help="Platform to build: nvidia, ascend, or all (default: all)",
+    )
+    parser.add_argument(
+        "--config",
+        type=Path,
+        default=Path(__file__).resolve().parent / "config.yaml",
+        help="Path to config.yaml",
+    )
+    parser.add_argument(
+        "--commit",
+        type=str,
+        default="HEAD",
+        help="Git ref for tagging the image (default: HEAD)",
+    )
+    parser.add_argument(
+        "--push",
+        action="store_true",
+        help="Push images to registry after building",
+    )
+    parser.add_argument(
+        "--force",
+        action="store_true",
+        help="Skip change detection and force build",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Print commands without executing",
+    )
+    args = parser.parse_args()
+
+    config = load_config(args.config) or {}  # an empty YAML file parses to None
+    registry_cfg = config.get("registry") or {}
+    images_cfg = config.get("images") or {}
+
+    if not images_cfg:
+        print("error: no `images` section in config", file=sys.stderr)
+        sys.exit(1)
+
+    if args.platform != "all" and args.platform not in images_cfg:
+        print(f"error: platform `{args.platform}` not found in config", file=sys.stderr)
+        sys.exit(1)
+    platforms = list(images_cfg) if args.platform == "all" else [args.platform]
+
+    commit = get_git_commit(args.commit)
+    failed = False
+
+    for platform in platforms:
+        platform_cfg = images_cfg[platform]
+        dockerfile_dir = platform_cfg["dockerfile"]
+
+        if not Path(dockerfile_dir).is_dir():
+            print(
+                f"warning: dockerfile directory `{dockerfile_dir}` does not exist, skipping {platform}",
+                file=sys.stderr,
+            )
+            continue
+
+        if not args.force and not has_dockerfile_changed(dockerfile_dir):
+            print(f"==> {platform}: no changes detected, skipping", file=sys.stderr)
+            continue
+
+        ok = build_image(
+            platform, platform_cfg, registry_cfg, commit, args.push, args.dry_run
+        )
+        if not ok:
+            failed = True
+
+    if failed:
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.ci/config.yaml b/.ci/config.yaml
new file mode 100644
index 0000000..fea3f7c
--- /dev/null
+++ b/.ci/config.yaml
@@ -0,0 +1,36 @@
+repo:
+  url: https://github.com/InfiniTensor/InfiniOps.git
+  branch: master
+
+registry:
+  url: ""                              # TODO: Harbor not ready yet
+  project: infiniops
+  credentials_env: REGISTRY_TOKEN
+
+images: 
+  nvidia:
+    dockerfile: .ci/images/nvidia/
+    build_args:
+      BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3
+  ascend:                              # TODO: Ascend image is not ready yet
+    dockerfile: .ci/images/ascend/
+    build_args:
+      BASE_IMAGE: ascendhub.huawei.com/public-ascendhub/ascend-pytorch:24.0.0
+    private_sdk:
+      source: "${PRIVATE_SDK_URL}"
+
+jobs:
+  nvidia_gpu:
+    image: stable
+    platform: nvidia
+    resources:
+      gpu_ids: "0"                       # 指定 GPU ID，如 "0" "0,2" "all"
+      gpu_type: A100
+      memory: 32GB
+      timeout: 3600
+
+    setup: pip install .[dev]
+
+    stages:
+      - name: test
+        run: pytest tests/ -v --tb=short --junitxml=/workspace/test-results.xml
diff --git a/.ci/images/ascend/Dockerfile b/.ci/images/ascend/Dockerfile
new file mode 100644
index 0000000..87f7c91
--- /dev/null
+++ b/.ci/images/ascend/Dockerfile
@@ -0,0 +1,31 @@
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE}
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        git \
+        cmake \
+        ninja-build \
+        curl \
+        libclang-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+ARG PRIVATE_SDK_URL
+RUN if [ -n "$PRIVATE_SDK_URL" ]; then \
+        curl -fSL "$PRIVATE_SDK_URL" -o /tmp/sdk.run && \
+        chmod +x /tmp/sdk.run && /tmp/sdk.run --quiet && \
+        rm /tmp/sdk.run; \
+    fi
+
+RUN pip install --no-cache-dir \
+    scikit-build-core \
+    pybind11 \
+    libclang \
+    pytest \
+    pytest-cov \
+    pytest-xdist \
+    pyyaml
+
+WORKDIR /workspace
diff --git a/.ci/images/nvidia/Dockerfile b/.ci/images/nvidia/Dockerfile
new file mode 100644
index 0000000..d89ea91
--- /dev/null
+++ b/.ci/images/nvidia/Dockerfile
@@ -0,0 +1,26 @@
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE}
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+ARG http_proxy
+ARG https_proxy
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        git \
+        cmake \
+        ninja-build \
+        libclang-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN pip install --no-cache-dir \
+    scikit-build-core \
+    pybind11 \
+    libclang \
+    pytest \
+    pytest-cov \
+    pytest-xdist \
+    pyyaml
+
+WORKDIR /workspace
diff --git a/.ci/run.py b/.ci/run.py
new file mode 100644
index 0000000..0421a56
--- /dev/null
+++ b/.ci/run.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+"""Standalone Docker CI runner: clone repo, setup, run stages. Output to stdout."""
+
+import argparse
+import subprocess
+import sys
+from pathlib import Path
+
+try:
+    import yaml
+except ImportError:
+    print(
+        "error: pyyaml is required. Install with: pip install pyyaml", file=sys.stderr
+    )
+    sys.exit(1)
+
+
+def load_config(path):
+    """Parse the YAML file at `path` (read as UTF-8) and return its content."""
+    return yaml.safe_load(Path(path).read_text(encoding="utf-8"))
+
+
+def resolve_image(config, platform, image_tag):
+    """Return the image reference for `platform` at `image_tag`.
+
+    Registry form if `registry.url` is set, else local `<project>-ci/<platform>`.
+    """
+    registry_cfg = config.get("registry", {})
+    project_name = registry_cfg.get("project", "infiniops")
+    host = registry_cfg.get("url", "")
+    repo_root = f"{host}/{project_name}" if host else f"{project_name}-ci"
+    return f"{repo_root}/{platform}:{image_tag}"
+
+
+def build_runner_script():
+    """Return the bash script that the container executes.
+
+    It clones `$REPO_URL`, checks out `$BRANCH`, runs `$SETUP_CMD`, then every
+    `STAGE_<i>_CMD`; the exit status is 1 if any stage failed.
+    """
+    return r"""
+set -e
+cd /workspace
+git clone "$REPO_URL" repo
+cd repo
+git checkout "$BRANCH"
+echo "========== Setup =========="
+eval "$SETUP_CMD"
+set +e
+failed=0
+for i in $(seq 1 "$NUM_STAGES"); do
+  name_var="STAGE_${i}_NAME"
+  cmd_var="STAGE_${i}_CMD"
+  name="${!name_var}"
+  cmd="${!cmd_var}"
+  echo "========== Stage: $name =========="
+  eval "$cmd" || failed=1
+done
+echo "========== Summary =========="
+exit $failed
+"""
+
+
+def _docker_memory(value):
+    """Normalize `32GB`, `512m` or a bare `32` (gigabytes) to docker's `32g` form."""
+    text = str(value).strip().lower()
+    if text[-2:] in ("kb", "mb", "gb"):
+        text = text[:-1]  # docker's unit suffixes are single letters
+    return text if text[-1:] in ("b", "k", "m", "g") else f"{text}g"
+
+
+def build_docker_args(
+    config, job_name, repo_url, branch, stages, workdir, image_tag_override,
+    gpu_id_override=None,
+):
+    """Assemble the `docker run` argument list for one job.
+
+    Repo settings and stages are passed as environment variables read by the
+    runner script. `gpu_id_override` wins over `resources.gpu_ids`, and
+    `resources.gpu_count` is used only when no GPU ids are given.
+    """
+    import os  # local import: this module has no top-level `import os`
+
+    job = config["jobs"][job_name]
+    resources = job.get("resources") or {}
+    image_tag = image_tag_override or job.get("image", "stable")
+    image = resolve_image(config, job.get("platform", "nvidia"), image_tag)
+    env = {"REPO_URL": repo_url, "BRANCH": branch}
+    env["SETUP_CMD"] = job.get("setup", "pip install .[dev]")
+    env["NUM_STAGES"] = len(stages)
+    for index, stage in enumerate(stages, start=1):
+        env[f"STAGE_{index}_NAME"] = stage["name"]
+        env[f"STAGE_{index}_CMD"] = stage["run"]
+    # Forward the host's proxy settings instead of hard-coding a proxy URL.
+    for proxy_var in ("http_proxy", "https_proxy", "no_proxy"):
+        proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.upper())
+        if proxy_val:
+            env[proxy_var] = proxy_val
+
+    args = ["docker", "run", "--rm", "--network", "host", "-i", "-w", workdir]
+    for key, value in env.items():
+        args.extend(["-e", f"{key}={value}"])
+
+    gpu_ids = resources.get("gpu_ids")
+    gpu_id = gpu_id_override or ("" if gpu_ids is None else str(gpu_ids))
+    gpu_count = resources.get("gpu_count", 0)
+    if gpu_id == "all":
+        args.extend(["--gpus", "all"])
+    elif gpu_id:
+        # The literal quotes keep an id list such as `0,2` a single value.
+        args.extend(["--gpus", f'"device={gpu_id}"'])
+    elif gpu_count and gpu_count > 0:
+        args.extend(["--gpus", f"count={gpu_count}"])
+
+    if resources.get("memory"):
+        args.extend(["--memory", _docker_memory(resources["memory"])])
+    if resources.get("timeout"):
+        # NOTE(review): `--stop-timeout` is the stop grace period, not a run limit.
+        args.extend(["--stop-timeout", str(resources["timeout"])])
+
+    return [*args, image, "bash", "-c", build_runner_script().strip()]
+
+
+def main():
+    """Command-line entry point: run one configured job inside docker.
+
+    Exits with the container's status, or with 1 when the config has no jobs
+    or the requested job or stage is unknown.
+    """
+    parser = argparse.ArgumentParser(description="Run Docker CI pipeline")
+    parser.add_argument(
+        "--config",
+        type=Path,
+        default=Path(__file__).resolve().parent / "config.yaml",
+        help="Path to config.yaml",
+    )
+    parser.add_argument("--branch", type=str, help="Override repo branch")
+    parser.add_argument("--job", type=str, help="Job name to run (default: first job)")
+    parser.add_argument(
+        "--stage",
+        type=str,
+        help="Run only this stage name (still runs setup first)",
+    )
+    parser.add_argument(
+        "--image-tag",
+        type=str,
+        help="Override image tag (stable, latest, or commit hash)",
+    )
+    parser.add_argument(
+        "--gpu-id",
+        type=str,
+        help='GPU device IDs to use, e.g. "0", "0,2", "all"',
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Print docker command and exit",
+    )
+    args = parser.parse_args()
+
+    config = load_config(args.config) or {}  # an empty YAML file parses to None
+    repo = config.get("repo") or {}
+    repo_url = repo.get("url", "https://github.com/InfiniTensor/InfiniOps.git")
+    branch = args.branch or repo.get("branch", "dev-infra")
+
+    jobs = config.get("jobs", {})
+    if not jobs:
+        print("error: no jobs in config", file=sys.stderr)
+        sys.exit(1)
+    job_name = args.job or next(iter(jobs))
+    if job_name not in jobs:
+        print(f"error: job {job_name!r} not in config", file=sys.stderr)
+        sys.exit(1)
+
+    stages = jobs[job_name].get("stages", [])
+    if args.stage:
+        stages = [s for s in stages if s["name"] == args.stage]
+        if not stages:
+            print(f"error: stage {args.stage!r} not found", file=sys.stderr)
+            sys.exit(1)
+
+    docker_args = build_docker_args(
+        config, job_name, repo_url, branch, stages, "/workspace", args.image_tag,
+        gpu_id_override=args.gpu_id,
+    )
+
+    if args.dry_run:
+        import shlex  # quote each argument so the printed command is valid shell
+        print(shlex.join(docker_args))
+        return
+
+    sys.exit(subprocess.run(docker_args).returncode)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index 765b90a..3dbc186 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,7 @@ name = "InfiniOps"
 version = "0.1.0"
 
 [project.optional-dependencies]
-dev = ["pytest", "pytest-cov", "pytest-xdist", "ruff", "torch"]
+dev = ["pytest", "pytest-cov", "pytest-xdist", "ruff", "torch", "pyyaml"]
 
 [tool.scikit-build.wheel]
 install-dir = "infini"
diff --git a/tests/test_add.py b/tests/test_add.py
index afbce0d..61d6715 100644
--- a/tests/test_add.py
+++ b/tests/test_add.py
@@ -4,15 +4,39 @@
 
 from tests.utils import Payload, empty_strided, randint_strided, randn_strided
 
-_INT_DTYPES = (
-    torch.int16,
-    torch.uint16,
-    torch.int32,
-    torch.uint32,
-    torch.int64,
-    torch.uint64,
+_INT_DTYPES = tuple(
+    d
+    for d in (
+        torch.int16,
+        torch.int32,
+        torch.int64,
+    )
+    if d is not None
 )
 
+_UINT_DTYPES = tuple(
+    d
+    for d in (
+        getattr(torch, "uint16", None),
+        getattr(torch, "uint32", None),
+        getattr(torch, "uint64", None),
+    )
+    if d is not None
+)
+
+def _dtype_parametrize():
+    """Return the `(dtype, rtol, atol)` cases available in this torch build."""
+    floats = (
+        (torch.float32, 1e-7, 1e-7),
+        (torch.float16, 1e-3, 1e-3),
+        (torch.bfloat16, 1e-2, 5e-3),
+    )
+    # Integer results must match exactly; dtypes are looked up by name because
+    # the unsigned ones may be missing from the installed torch (then skipped).
+    int_names = ("int16", "uint16", "int32", "uint32", "int64", "uint64")
+    ints = ((getattr(torch, name, None), 0, 0) for name in int_names)
+    return tuple(case for case in (*floats, *ints) if case[0] is not None)
+
 
 @pytest.mark.auto_act_and_assert
 @pytest.mark.parametrize(
@@ -32,22 +56,9 @@
         ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
     ),
 )
-@pytest.mark.parametrize(
-    ("dtype", "rtol", "atol"),
-    (
-        (torch.float32, 1e-7, 1e-7),
-        (torch.float16, 1e-3, 1e-3),
-        (torch.bfloat16, 1e-2, 5e-3),
-        (torch.int16, 0, 0),
-        (torch.uint16, 0, 0),
-        (torch.int32, 0, 0),
-        (torch.uint32, 0, 0),
-        (torch.int64, 0, 0),
-        (torch.uint64, 0, 0),
-    ),
-)
+@pytest.mark.parametrize(("dtype", "rtol", "atol"), _dtype_parametrize())
 def test_add(shape, input_strides, other_strides, out_strides, dtype, device, rtol, atol):
-    if dtype in _INT_DTYPES:
+    if dtype in _INT_DTYPES or dtype in _UINT_DTYPES:
         input = randint_strided(0, 100, shape, input_strides, dtype=dtype, device=device)
         other = randint_strided(0, 100, shape, other_strides, dtype=dtype, device=device)
     else:
@@ -66,10 +77,10 @@ def _add(input, other, out):
 
 
 def _torch_add(input, other, out):
-    if input.dtype in (torch.uint16, torch.uint32, torch.uint64):
+    if input.dtype in _UINT_DTYPES:
         input = input.to(torch.int64)
 
-    if other.dtype in (torch.uint16, torch.uint32, torch.uint64):
+    if other.dtype in _UINT_DTYPES:
         other = other.to(torch.int64)
 
     res = torch.add(input, other)
diff --git a/tests/test_rms_norm.py b/tests/test_rms_norm.py
index f447091..b0c9c5d 100644
--- a/tests/test_rms_norm.py
+++ b/tests/test_rms_norm.py
@@ -59,4 +59,13 @@ def _rms_norm(input, weight, *, eps=1e-6, out=None):
 
 
 def _torch_rms_norm(input, weight, *, eps=1e-6, out=None):
-    return torch.nn.functional.rms_norm(input, input.shape[-1:], weight=weight, eps=eps)
+    rms_norm_fn = getattr(torch.nn.functional, "rms_norm", None)
+    if rms_norm_fn is not None:
+        return rms_norm_fn(input, input.shape[-1:], weight=weight, eps=eps)
+    # Fallback for PyTorch < 2.4 (no F.rms_norm): x / sqrt(mean(x^2) + eps) * weight
+    rms = torch.sqrt(torch.mean(input * input, dim=-1, keepdim=True) + eps)
+    result = (input / rms) * weight
+    if out is not None:
+        out.copy_(result)
+        return out
+    return result