diff --git a/03_nf4_dequant/xfarawayx/README.md b/03_nf4_dequant/xfarawayx/README.md new file mode 100644 index 0000000..6122986 --- /dev/null +++ b/03_nf4_dequant/xfarawayx/README.md @@ -0,0 +1,67 @@ +# NF4 反量化 CUDA Kernel + +NF4(Normal Float 4)双重量化权重的 GPU 反量化实现,兼容 bitsandbytes 格式。技术细节与实验结果见 [`docs/report.md`](docs/report.md)。 + +## 目录结构 + +``` +03_nf4_dequant/ +├── run.sh # 统一入口脚本 +├── kernel/ +│ ├── CMakeLists.txt # CMake 构建 (自动检测 GPU 架构) +│ ├── main.cu # 主程序: 文件 IO、kernel 启动、性能计时 +│ └── nf4_dequant_kernel.cuh # 反量化 kernel 实现 +├── kernel_noncuda/ # 国产 GPU 适配 +│ ├── iluvatar/ # 天数智芯 (clang++ / CUDA 兼容) +│ ├── moore/ # 摩尔线程 (mcc / MUSA) +│ └── mutex/ # 沐曦 (mxcc / MACA) +├── scripts/ +│ ├── generate_data.py # 用 bitsandbytes 生成 NF4 量化数据 + 参考输出 +│ ├── verify.py # 正确性验证: CUDA 输出 vs bitsandbytes 参考 +│ └── bench_bnb.py # bitsandbytes 性能基准 +├── docs/ +│ └── report.md # 实现报告 +└── data/ # 生成的数据 (自动创建) +``` + +## 快速开始 + +```bash +# 全流程: 生成数据 → 编译 → 运行 kernel → 验证正确性 → bnb 性能对比 +./run.sh all +./run.sh # 等价于 ./run.sh test +``` + +## 子命令与选项 + +| 命令 | 说明 | +|------|------| +| `./run.sh generate` | 仅生成 NF4 量化测试数据 | +| `./run.sh build` | 仅编译 CUDA kernel | +| `./run.sh test` | 生成数据 → 编译 → 运行 → 验证正确性 (默认) | +| `./run.sh bench` | bitsandbytes 基准性能测试 | +| `./run.sh all` | 完整流程 | + +| 选项 | 默认值 | 说明 | +|------|--------|------| +| `--rows` | 4096 | 矩阵行数 | +| `--cols` | 4096 | 矩阵列数 | +| `--blocksize` | 64 | 量化块大小 (64/128/256/…/4096) | +| `--compute_type` | bf16 | 输出类型 (bf16/fp16) | +| `--seed` | 42 | 随机种子 | +| `--gpu_arch` | 自动检测 | GPU 架构, 如 80(A100)/89(4090)/90(H100) | +| `--warmup` | 10 | 预热次数 | +| `--repeats` | 100 | 计时重复次数 | +| `--sweep` | - | bench 时扫描多种矩阵大小 | + +```bash +# 示例 +./run.sh --rows 4096 --cols 11008 --blocksize 128 +./run.sh --compute_type fp16 +./run.sh bench --sweep +``` + +## 依赖 + +- CUDA Toolkit +- Python 3.8+, PyTorch (CUDA), bitsandbytes >= 0.43, NumPy diff --git a/03_nf4_dequant/xfarawayx/docs/report.md b/03_nf4_dequant/xfarawayx/docs/report.md new file mode 100644 index 0000000..4ee9946 --- /dev/null +++ b/03_nf4_dequant/xfarawayx/docs/report.md @@ -0,0 +1,292 @@ +# NF4 双重量化反量化 CUDA Kernel 实现报告 + +## 1. NF4 反量化原理 + +### 1.1 NF4 量化简介 + +NF4(Normal Float 4-bit)是由 bitsandbytes 库提出的一种 4-bit 量化格式。其核心思想是:预训练神经网络的权重近似服从正态分布 $N(0, \sigma^2)$,因此可以用标准正态分布 $N(0,1)$ 的 **16 等概率分位点** 作为量化码本(codebook),使每个区间内包含的权重数量大致相等,从而最大化信息利用率。这 16 个分位值构成固定的 NF4 查找表: + +| 索引 | 值 | 索引 | 值 | +|:----:|:-----:|:----:|:-----:| +| 0 | -1.0000 | 8 | +0.0796 | +| 1 | -0.6962 | 9 | +0.1609 | +| 2 | -0.5251 | 10 | +0.2461 | +| 3 | -0.3949 | 11 | +0.3379 | +| 4 | -0.2844 | 12 | +0.4407 | +| 5 | -0.1848 | 13 | +0.5626 | +| 6 | -0.0911 | 14 | +0.7230 | +| 7 | 0.0000 | 15 | +1.0000 | + +量化时,每个权重被除以其所在块的缩放因子(absmax)进行归一化,然后映射到最近的分位值索引(0-15),以 4 bit 存储。两个 4-bit 索引打包进一个 `uint8` 字节。 + +### 1.2 双重量化 + +为进一步压缩元数据开销,bitsandbytes 对一级缩放因子 `absmax` 本身再做一次量化,形成**双重量化(double quantization)**体系: + +- **一级量化**:每 `blocksize`(通常 64)个元素共享一个缩放因子 `absmax`。经二次量化后,原始的 FP32 `absmax` 被压缩为 `uint8` 索引 `absmax_q`。 +- **二级量化**:每 `s2_blocksize`(通常 256)个一级块组成一个组,共享一个二级缩放因子 `absmax2`(FP16)。同时还有一张 256 项的二级码表 `code2`(FP16[256])和一个全局偏移 `offset`(FP32)。 + +### 1.3 反量化公式 + +反量化需要逆向还原上述两级量化过程。对于第 $k$ 个元素: + +``` +block_idx = k / blocksize +group_idx = block_idx / s2_blocksize + +absmax_real = code2[ absmax_q[block_idx] ] × absmax2[group_idx] + offset + +output[k] = NF4_TABLE[ nf4_index(k) ] × absmax_real +``` + +其中 `nf4_index(k)` 是从 `packed_weights` 中解包得到的 4-bit 索引。 + +--- + +## 2. 基础版 Kernel 实现 + +基础版 kernel 的核心策略是:**每个线程负责一个 packed byte 的反量化**,产出 2 个输出元素。以下以 `nf4_dequant_kernel_pre.cuh` 中的实现为参考,分步介绍。 + +### 2.1 线程映射与 NF4 码表 + +每个线程处理 1 个 packed byte(含 2 个 4-bit 索引),生成 2 个输出元素。总线程数为 `ceil(n_elements / 2)`,通过全局线程 ID `tid` 一一映射到 `packed_weights[tid]`。 + +16 个 NF4 分位值预存在 `__constant__` memory 中,线程通过索引直接查表: + +```cpp +__constant__ float NF4_DEQUANT_TABLE[16] = { -1.0f, -0.6962f, ..., 1.0f }; +``` + +### 2.2 反量化计算流程 + +每个线程的完整执行逻辑如下: + +**Step 1 — 读取与解包**:从全局内存读取 1 字节 packed data,分离出高 4 位和低 4 位两个索引,分别对应输出位置 `elem0 = tid * 2` 和 `elem1 = tid * 2 + 1`: + +```cpp +uint8_t packed = packed_weights[tid]; +uint8_t idx_hi = (packed >> 4) & 0x0F; // 偶数位索引 +uint8_t idx_lo = packed & 0x0F; // 奇数位索引 +``` + +**Step 2 — NF4 查表**:用索引查 constant memory 中的码表,获得归一化的浮点值: + +```cpp +float val_hi = NF4_DEQUANT_TABLE[idx_hi]; +float val_lo = NF4_DEQUANT_TABLE[idx_lo]; +``` + +**Step 3 — 还原 absmax(双重量化反解)**:通过整数除法确定元素所在的一级块和二级组,再逆向还原真实的缩放因子: + +```cpp +int block_idx0 = elem0 / blocksize; +int group_idx0 = block_idx0 / s2_blocksize; +float absmax_real0 = __half2float(code2[absmax_q[block_idx0]]) + * __half2float(absmax2[group_idx0]) + offset; +``` + +**Step 4 — 块边界复用**:相邻的两个元素(偶数位与奇数位)大概率落在同一量化块内。kernel 先为 `elem0` 计算 `absmax_real`,处理 `elem1` 时比较 `block_idx`——若相同则直接复用,仅在跨块时重新计算,避免冗余的全局内存访问和浮点运算: + +```cpp +int block_idx1 = elem1 / blocksize; +if (block_idx1 == block_idx0) { + absmax_real1 = absmax_real0; // 复用 +} else { /* 重新计算 */ } +``` + +**Step 5 — Packed Store**:将两个 16-bit 输出值(BF16/FP16)的原始位表示打包为一个 `uint32_t`,以单次 32-bit 写入全局内存。这将 2 次 16-bit store 合并为 1 次 32-bit store,减少内存事务数并保证自然对齐: + +```cpp +uint16_t bits0 = *reinterpret_cast(&out0); +uint16_t bits1 = *reinterpret_cast(&out1); +uint32_t packed_out = (uint32_t)bits0 | ((uint32_t)bits1 << 16); +reinterpret_cast(output)[tid] = packed_out; +``` + +### 2.3 基础版的性能瓶颈 + +基础版实现功能正确,但存在三个主要瓶颈,将在第 3 节依次优化: + +1. **Constant memory 串行化**:NF4 查表时 warp 内线程访问不同索引,触发串行读取; +2. **细粒度全局内存访问**:每线程仅读 1 字节 packed data,远小于 GPU 内存事务粒度(32 字节/sector),带宽利用率低; +3. **整数除法开销**:`block_idx = elem / blocksize` 在 GPU 上延迟高(数十周期),且每线程多次执行。 + +--- + +## 3. Kernel 优化 + +在基础版之上,按顺序实施了三项优化。 + +### 3.1 优化一:NF4 码表从 `__constant__` 加载到 Shared Memory + +**问题分析**:`__constant__` memory 的特点是当 warp 内所有线程访问**同一地址**时才能实现广播读取,延迟很低。但 NF4 查表时,warp 内 32 个线程各自查询不同的 4-bit 索引(0-15),访问地址各异,导致 constant memory 的访问被**串行化**,最差情况下需要 16 次串行读取才能满足一个 warp 的请求。 + +**优化方案**:在 kernel 启动时,由每个 block 的前 16 个线程协作将 NF4 码表从 `__constant__` memory 加载到 `__shared__` memory: + +```cpp +__shared__ float s_nf4_table[16]; // 64 字节 +if (threadIdx.x < 16) { + s_nf4_table[threadIdx.x] = NF4_DEQUANT_TABLE[threadIdx.x]; +} +__syncthreads(); +``` + +`__shared__` memory 支持 bank 级别的并行访问:32 个 bank 可以同时响应不同地址的请求。16 个 `float` 映射到不同的 bank,warp 内线程即使访问不同索引也可以在**一个周期**内完成。 + +**加速效果**(实验环境:NVIDIA A100-SXM4-80GB,矩阵 4096×4096,blocksize=64):V0 中位数 0.1139 ms → V1 中位数 0.0791 ms,加速比 $\textbf{1.44x}$(提升 44%)。这是三项优化中独立效果最显著的一项。 + +### 3.2 优化二:向量化读取(每线程读 4 字节 packed_weights) + +**问题分析**:基础版中每个线程仅读取 1 字节 `packed_weights`,产生大量细粒度的全局内存事务。GPU 全局内存事务的最小粒度为 32 字节(一个 sector),1 字节的标量读取会浪费大量带宽。 + +**优化方案**:将线程映射从"1 thread = 1 byte = 2 元素"扩展为"**1 thread = 4 bytes = 8 元素**": + +```cpp +// 向量化读取: 一次读 4 字节 +uint32_t packed4 = reinterpret_cast(packed_weights)[tid_vec]; + +// 从 packed4 中提取各字节 +uint8_t packed_byte = (packed4 >> (b * 8)) & 0xFF; +``` + +同时写入也从 `uint32_t`(32-bit)升级为 `uint4`(128-bit = 8 个 FP16/BF16): + +```cpp +reinterpret_cast(out_u32)[tid_vec] = + make_uint4(out_packed[0], out_packed[1], out_packed[2], out_packed[3]); +``` + +对于尾部不足 4 字节的边界情况,回退到逐字节标量读取和逐 pack 写入,确保任意矩阵尺寸的正确性。内层循环使用 `#pragma unroll` 展开以减少循环开销。 + +**加速效果**:V1 中位数 0.0791 ms → V2 中位数 0.0652 ms,增量加速比 $\textbf{1.21x}$;相对基础版累计加速比达 $\textbf{1.75x}$。值得注意的是,向量化读写单独加到基础版上仅产生 1.09x 加速,但在 shared memory 优化之后叠加效果更强(1.21x),说明两者之间存在正向交互——shared memory 消除了查表瓶颈后,访存带宽成为新的主要瓶颈,向量化此时才能充分发挥作用。 + +### 3.3 优化三:用位移代替整数除法 + +**问题分析**:基础版中计算 block 索引和 group 索引使用了整数除法: + +```cuda +int block_idx = elem / blocksize; // 整数除法 +int group_idx = block_idx / s2_blocksize; // 整数除法 +``` + +GPU 上整数除法指令的吞吐量远低于位移指令。在 NVIDIA GPU 上,32-bit 整数除法的延迟约为数十个周期,而位移仅需 1 个周期。由于每个线程需对 8 个元素执行多次除法运算,累积的延迟开销不容忽视。 + +**优化方案**:由于 `blocksize` 和 `s2_blocksize` 总是 2 的幂(如 64 = 2⁶, 256 = 2⁸),在 host 端预计算 log₂ 值,kernel 中用右移替代除法: + +```cuda +// Host 端 +int log2_bs = log2_pow2(data.blocksize); // 64 → 6 +int log2_s2 = log2_pow2(data.s2_blocksize); // 256 → 8 + +// Kernel 内 +int block_idx = elem >> log2_blocksize; // 右移代替除法 +int group_idx = block_idx >> log2_s2_blocksize; +``` + +`log2_pow2()` 辅助函数通过循环右移计算 2 的幂的对数值。 + +**加速效果**:V2 中位数 0.0652 ms → V3 中位数 0.0431 ms,增量加速比 $\textbf{1.51x}$;相对基础版累计加速比达 $\textbf{2.64x}$。位移优化单独加到基础版上仅有 1.03x 提升,但在前两项优化消除了查表和带宽瓶颈后,整数除法的延迟成为关键路径,此时替换为位移带来了显著加速(1.51x),体现了强烈的正向交互效应。 + +--- + +## 4. 项目架构与实验流程 + +### 4.1 项目架构 + +``` +03_nf4_dequant/ +├── run.sh # 统一流程入口脚本 +├── kernel/ +│ ├── CMakeLists.txt # CMake 构建系统 (自动检测 GPU 架构) +│ ├── main.cu # 主程序: 文件 IO、kernel 启动、CUDA Events 计时 +│ ├── nf4_dequant_kernel.cuh # 优化后的反量化 kernel 实现 +│ └── run_test_ncu.sh # Nsight Compute 性能分析脚本 +├── scripts/ +│ ├── generate_data.py # 数据生成: 用 bitsandbytes 生成 NF4 量化数据 + 参考输出 +│ ├── verify.py # 正确性验证: CUDA 输出 vs bitsandbytes 参考输出 +│ └── bench_bnb.py # bitsandbytes 官方库性能基准测试 +└── data/ # 生成的测试数据与输出结果 +``` + +各组件职责: + +- **`run.sh`**:统一入口脚本,支持 `generate`、`build`、`test`、`bench`、`all` 五个子命令,通过命令行选项控制矩阵大小、量化块大小、输出精度等参数。 +- **`generate_data.py`**:使用 bitsandbytes 的 `quantize_4bit()` 接口生成 NF4 量化数据,导出为自定义二进制格式,并保存 bitsandbytes 的反量化结果作为参考标准。 +- **`main.cu`**:读取二进制文件、分配 GPU 内存、启动 kernel、使用 CUDA Events 精确计时(warmup + repeats 模式,取中位数抗干扰)、输出结果。 +- **`nf4_dequant_kernel.cuh`**:包含全部三项优化的最终 kernel 实现。 +- **`verify.py`**:加载 CUDA 输出与 bitsandbytes 参考输出,计算 MAE、MaxError、RMSE、相对 MAE,判定正确性(相对 MAE < 1e-2 为 PASS)。 +- **`bench_bnb.py`**:独立测量 bitsandbytes 官方库的反量化性能,支持扫描多种矩阵尺寸,用于对比加速比。 + +### 4.2 实验流程 + +完整的实验流程通过 `./run.sh all` 一键执行,依次完成以下五个步骤: + +1. **生成数据**(`generate_data.py`):调用 bitsandbytes 的 `quantize_4bit()` 对随机权重进行 NF4 量化,导出 packed weights、absmax_q、absmax2、code2、offset 等数据为二进制文件(`nf4_weights_*.bin`),同时保存 bitsandbytes 的反量化结果作为正确性参考(`nf4_ref_output_*.bin`)。 +2. **编译 CUDA kernel**(`cmake + make`):CMake 自动检测 GPU 架构,编译生成 `nf4_dequant` 可执行文件。 +3. **运行 CUDA kernel**(`nf4_dequant`):读取二进制数据文件,在 GPU 上执行反量化 kernel,输出结果(`cuda_output_*.bin`)和性能数据。 +4. **验证正确性**(`verify.py`):加载 CUDA 输出与 bitsandbytes 参考输出,计算 MAE、MaxError、RMSE、相对 MAE,判定正确性(相对 MAE < 1e-2 为 PASS)。 +5. **基准性能对比**(`bench_bnb.py`):独立测量 bitsandbytes 官方库的反量化耗时和带宽,供计算加速比。 + +**性能计时方案**:CUDA Events 精确计时,每次 kernel 启动前执行 `cudaDeviceSynchronize()` 确保 GPU 空闲,事件同步后采集单次耗时。收集 `repeats`(默认 100)次数据后排序,报告平均值、中位数、最小值、最大值和基于中位数的有效内存带宽。 + +--- + +## 5. 实验结果 + +### 5.1 正确性验证 + +| 矩阵大小 | 块大小 | 输出类型 | MAE | MaxError | 相对 MAE | 结果 | +|:---------:|:------:|:--------:|:---:|:--------:|:--------:|:----:| +| 4096×4096 | 64 | BF16 | ~2.8e-4 | ~0.03 | ~2.6e-5 | PASS | +| 4096×4096 | 64 | FP16 | ~0 | ~0 | ~0 | PASS | +| 2047×4096 | 64 | FP16 | ~0 | ~0 | ~0 | PASS | + +BF16 的误差来源于 BF16 本身的表示精度(尾数仅 7 bit),与 bitsandbytes 使用 FP32 中间计算再存储为 BF16 存在精度差异;FP16 输出可达到与 bitsandbytes 的 bit-exact 一致。 + +### 5.2 性能对比 + +> 实验环境:NVIDIA A100-SXM4-80GB,矩阵 4096×4096,blocksize=64,warmup=20,repeats=200。 + +| 矩阵大小 | 块大小 | 输出类型 | CUDA Kernel 中位数耗时 | CUDA 带宽 | bitsandbytes 中位数耗时 | 加速比 | +|:---------:|:------:|:--------:|:--------------------:|:---------:|:---------------------:|:------:| +| 4096×4096 | 64 | BF16 | 0.0426 ms | 990.98 GB/s | 0.0488 ms | 1.15x | +| 4096×4096 | 64 | FP16 | 0.0432 ms | 976.31 GB/s | 0.0488 ms | 1.13x | + +### 5.3 各优化阶段加速比 + +> 消融实验数据(FP16,矩阵 4096×4096,blocksize=64,repeats=200): + +| 优化阶段 | 中位数耗时 | 有效带宽 | 相对基础版加速比 | +|:--------:|:---------:|:--------:|:--------------:| +| 基础版 (constant memory + 标量读取 + 整数除法) | 0.1139 ms | 370.50 GB/s | 1.00x | +| +优化一: Shared Memory 码表 | 0.0791 ms | 533.36 GB/s | 1.44x | +| +优化二: 向量化读取 (4 bytes/thread) | 0.0652 ms | 647.84 GB/s | 1.75x | +| +优化三: 位移代替除法 | 0.0431 ms | 979.93 GB/s | 2.64x | + +各优化的增量分析与交互效应: + +| 优化 | 独立加速比 | 叠加增量加速比 | 交互效应 | +|:----:|:---------:|:------------:|:-------:| +| Shared Memory NF4 码表 | 1.44x | 1.44x | 1.00x | +| 向量化读写 | 1.09x | 1.21x | 1.12x | +| 位移代替除法 | 1.03x | 1.51x | 1.48x | + +三项优化叠加后,总加速比为 **2.64x**(0.1139 ms → 0.0431 ms),有效带宽从 370.50 GB/s 提升至 979.93 GB/s。值得注意的是,向量化和位移优化的独立效果较小(1.09x、1.03x),但在前序优化消除了其他瓶颈后,叠加效果显著增强(1.21x、1.51x),体现了优化之间的正向交互:shared memory 消除查表瓶颈后,带宽成为新瓶颈,向量化得以发挥;带宽优化后,计算延迟成为新瓶颈,位移替换得以发挥。 + +--- + +## 6. 国产 GPU 平台适配 + +`kernel_noncuda/` 目录将优化后的 kernel 移植到三个国产 GPU 平台。三个版本共享相同的算法逻辑与二进制数据格式,可复用 `scripts/verify.py` 进行正确性验证。 + +| 平台 | 目录 | 编译器 | 源码后缀 | 运行时 API 前缀 | +|:----:|:----:|:------:|:--------:|:--------------:| +| 天数智芯 (Iluvatar) | `iluvatar/` | `clang++` | `.cu` | `cuda*`(兼容模式) | +| 摩尔线程 (Moore) | `moore/` | `mcc` | `.mu` | `musa*` | +| 沐曦 (Mutex) | `mutex/` | `mxcc` | `.maca` | `mc*` | + +**主要适配差异**:各平台不直接支持 CUDA 的 `half` / `__nv_bfloat16` 内建类型,因此改用 `uint16_t` 位操作配合手写的浮点转换函数(`half_bits_to_float()`、`float_to_half_bits()`、`float_to_bf16_bits()`)实现等价语义,kernel 模板参数也相应从输出类型改为 `bool OUTPUT_BF16`。除此之外,kernel 核心逻辑(shared memory 码表、向量化读写、位移索引计算)与 CUDA 版本保持一致。 + +各平台目录均提供 `Makefile` 和一键脚本(`run_*.sh`),用法与主工程类似。测试数据需在 CUDA 环境预先生成后拷贝至目标机。 + +--- diff --git a/03_nf4_dequant/xfarawayx/docs/report.pdf b/03_nf4_dequant/xfarawayx/docs/report.pdf new file mode 100644 index 0000000..022279f Binary files /dev/null and b/03_nf4_dequant/xfarawayx/docs/report.pdf differ diff --git a/03_nf4_dequant/xfarawayx/kernel/CMakeLists.txt b/03_nf4_dequant/xfarawayx/kernel/CMakeLists.txt new file mode 100644 index 0000000..aa0ec99 --- /dev/null +++ b/03_nf4_dequant/xfarawayx/kernel/CMakeLists.txt @@ -0,0 +1,48 @@ +cmake_minimum_required(VERSION 3.18) +project(nf4_dequant LANGUAGES CXX CUDA) + +set(CMAKE_CUDA_STANDARD 17) +set(CMAKE_CXX_STANDARD 17) + +# ---------- GPU 架构 ---------- +# 用法: +# cmake .. -DGPU_ARCH=80 # A100 +# cmake .. -DGPU_ARCH=89 # RTX 4090 +# cmake .. -DGPU_ARCH=90 # H100 +# cmake .. -DGPU_ARCH="80;89;90" # 多架构 +# cmake .. # 自动检测 +if(DEFINED GPU_ARCH) + set(CMAKE_CUDA_ARCHITECTURES ${GPU_ARCH}) + message(STATUS "GPU architecture (user-specified): ${CMAKE_CUDA_ARCHITECTURES}") +else() + # 自动检测当前 GPU 的 compute capability + execute_process( + COMMAND nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits + OUTPUT_VARIABLE _detected_arch + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE _detect_result + ) + if(_detect_result EQUAL 0 AND _detected_arch) + # 取第一块 GPU,格式 "8.0" → "80" + string(REGEX MATCH "^[0-9]+\\.[0-9]+" _first_arch "${_detected_arch}") + string(REPLACE "." "" _arch_num "${_first_arch}") + set(CMAKE_CUDA_ARCHITECTURES ${_arch_num}) + message(STATUS "GPU architecture (auto-detected): sm_${_arch_num}") + else() + # 检测失败时回退到常见架构 + set(CMAKE_CUDA_ARCHITECTURES "80;89;90") + message(STATUS "GPU architecture (fallback): ${CMAKE_CUDA_ARCHITECTURES}") + endif() +endif() + +# ---------- 临时目录 ---------- +set(NF4_LOCAL_TMP_DIR "${CMAKE_BINARY_DIR}/.tmp") +file(MAKE_DIRECTORY "${NF4_LOCAL_TMP_DIR}") +set(NF4_TMP_ENV_PREFIX + "${CMAKE_COMMAND} -E env TMPDIR=${NF4_LOCAL_TMP_DIR} TMP=${NF4_LOCAL_TMP_DIR} TEMP=${NF4_LOCAL_TMP_DIR}") +set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${NF4_TMP_ENV_PREFIX}") +set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "${NF4_TMP_ENV_PREFIX}") + +# ---------- 构建目标 ---------- +add_executable(nf4_dequant main.cu) +target_include_directories(nf4_dequant PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/03_nf4_dequant/xfarawayx/kernel/main.cu b/03_nf4_dequant/xfarawayx/kernel/main.cu new file mode 100644 index 0000000..91ec693 --- /dev/null +++ b/03_nf4_dequant/xfarawayx/kernel/main.cu @@ -0,0 +1,269 @@ +// NF4 反量化 CUDA 程序 +// 用法: ./nf4_dequant [bf16|fp16] [warmup] [repeats] + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "nf4_dequant_kernel.cuh" + +// CUDA 错误检查 +#define CUDA_CHECK(call) \ + do { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) { \ + fprintf(stderr, "CUDA error at %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// 二进制权重文件布局: header (rows, cols, blocksize) + packed_weights + absmax_q + absmax2 + code2 + offset +struct NF4Data { + int64_t num_rows; + int64_t num_cols; + int32_t blocksize; + + std::vector packed_weights; + std::vector absmax_q; + std::vector absmax2; // fp16 raw bits + std::vector code2; // fp16[256] raw bits + float offset; + + int64_t n_elements; + int32_t num_blocks; + int32_t num_groups; + int32_t s2_blocksize; +}; + +bool read_nf4_data(const char* filepath, NF4Data& data) { + FILE* f = fopen(filepath, "rb"); + if (!f) { + fprintf(stderr, "[ERROR] Cannot open file: %s\n", filepath); + return false; + } + + // Header + fread(&data.num_rows, sizeof(int64_t), 1, f); + fread(&data.num_cols, sizeof(int64_t), 1, f); + fread(&data.blocksize, sizeof(int32_t), 1, f); + + data.n_elements = data.num_rows * data.num_cols; + data.num_blocks = (int32_t)((data.n_elements + data.blocksize - 1) / data.blocksize); + + int64_t packed_size = data.n_elements / 2; + data.packed_weights.resize(packed_size); + fread(data.packed_weights.data(), 1, packed_size, f); + + data.absmax_q.resize(data.num_blocks); + fread(data.absmax_q.data(), 1, data.num_blocks, f); + + // 从剩余字节反推 num_groups(文件中未显式存储) + long current_pos = ftell(f); + fseek(f, 0, SEEK_END); + long file_size = ftell(f); + fseek(f, current_pos, SEEK_SET); + + long remaining = file_size - current_pos; + long fixed_tail = 256 * 2 + 4; // code2 (512B) + offset (4B) + long absmax2_bytes = remaining - fixed_tail; + data.num_groups = (int32_t)(absmax2_bytes / 2); + data.s2_blocksize = (data.num_blocks + data.num_groups - 1) / data.num_groups; + + data.absmax2.resize(data.num_groups); + fread(data.absmax2.data(), 2, data.num_groups, f); + + data.code2.resize(256); + fread(data.code2.data(), 2, 256, f); + + fread(&data.offset, sizeof(float), 1, f); + + fclose(f); + return true; +} + +int main(int argc, char* argv[]) { + if (argc < 3) { + fprintf(stderr, "用法: %s [bf16|fp16] [warmup] [repeats]\n", argv[0]); + return 1; + } + + const char* weight_file = argv[1]; + const char* output_file = argv[2]; + std::string compute_type = (argc > 3) ? argv[3] : "bf16"; + int warmup = (argc > 4) ? atoi(argv[4]) : 10; + int repeats = (argc > 5) ? atoi(argv[5]) : 100; + + bool use_bf16 = (compute_type == "bf16"); + + // 读取数据 + printf("[INFO] 读取权重文件: %s\n", weight_file); + NF4Data data; + if (!read_nf4_data(weight_file, data)) return 1; + + printf(" num_rows = %ld\n", (long)data.num_rows); + printf(" num_cols = %ld\n", (long)data.num_cols); + printf(" blocksize = %d\n", data.blocksize); + printf(" n_elements = %ld\n", (long)data.n_elements); + printf(" num_blocks = %d\n", data.num_blocks); + printf(" num_groups = %d\n", data.num_groups); + printf(" s2_blocksize = %d\n", data.s2_blocksize); + printf(" offset = %f\n", data.offset); + printf(" compute_type = %s\n", compute_type.c_str()); + + // 分配 GPU 内存 + uint8_t* d_packed_weights; + uint8_t* d_absmax_q; + half* d_absmax2; + half* d_code2; + void* d_output; + + int64_t packed_size = data.n_elements / 2; + int64_t output_bytes = data.n_elements * 2; // bf16/fp16 = 2 bytes each + + CUDA_CHECK(cudaMalloc(&d_packed_weights, packed_size)); + CUDA_CHECK(cudaMalloc(&d_absmax_q, data.num_blocks)); + CUDA_CHECK(cudaMalloc(&d_absmax2, data.num_groups * sizeof(half))); + CUDA_CHECK(cudaMalloc(&d_code2, 256 * sizeof(half))); + CUDA_CHECK(cudaMalloc(&d_output, output_bytes)); + + // H2D 传输 + CUDA_CHECK(cudaMemcpy(d_packed_weights, data.packed_weights.data(), + packed_size, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_absmax_q, data.absmax_q.data(), + data.num_blocks, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_absmax2, data.absmax2.data(), + data.num_groups * sizeof(half), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_code2, data.code2.data(), + 256 * sizeof(half), cudaMemcpyHostToDevice)); + + // Kernel launch 配置 + int n_packed = (int)((data.n_elements + 1) / 2); + int n_packed_vec = (n_packed + 3) / 4; // 每线程 4 字节 + int threads_per_block = 256; + int num_blocks_kernel = (n_packed_vec + threads_per_block - 1) / threads_per_block; + + // 预计算 log2 用于位移优化 + int log2_bs = log2_pow2(data.blocksize); + int log2_s2 = log2_pow2(data.s2_blocksize); + + printf("\n[INFO] Kernel 配置:\n"); + printf(" n_packed = %d\n", n_packed); + printf(" n_packed_vec = %d (向量化后)\n", n_packed_vec); + printf(" threads_per_block = %d\n", threads_per_block); + printf(" grid_size = %d\n", num_blocks_kernel); + printf(" log2_blocksize = %d\n", log2_bs); + printf(" log2_s2_blocksize = %d\n", log2_s2); + + // 预热 + printf("\n[INFO] 预热 %d 次...\n", warmup); + for (int i = 0; i < warmup; i++) { + if (use_bf16) { + nf4_dequantize_kernel<__nv_bfloat16><<>>( + d_packed_weights, d_absmax_q, d_absmax2, d_code2, + data.offset, log2_bs, log2_s2, + data.n_elements, (__nv_bfloat16*)d_output + ); + } else { + nf4_dequantize_kernel<<>>( + d_packed_weights, d_absmax_q, d_absmax2, d_code2, + data.offset, log2_bs, log2_s2, + data.n_elements, (half*)d_output + ); + } + } + CUDA_CHECK(cudaDeviceSynchronize()); + + // 计时: CUDA Events,每次迭代间同步以隔离测量 + printf("[INFO] 计时 %d 次...\n", repeats); + + cudaEvent_t ev_start, ev_end; + CUDA_CHECK(cudaEventCreate(&ev_start)); + CUDA_CHECK(cudaEventCreate(&ev_end)); + + std::vector times(repeats); + + for (int i = 0; i < repeats; i++) { + CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaEventRecord(ev_start)); + if (use_bf16) { + nf4_dequantize_kernel<__nv_bfloat16><<>>( + d_packed_weights, d_absmax_q, d_absmax2, d_code2, + data.offset, log2_bs, log2_s2, + data.n_elements, (__nv_bfloat16*)d_output + ); + } else { + nf4_dequantize_kernel<<>>( + d_packed_weights, d_absmax_q, d_absmax2, d_code2, + data.offset, log2_bs, log2_s2, + data.n_elements, (half*)d_output + ); + } + CUDA_CHECK(cudaEventRecord(ev_end)); + CUDA_CHECK(cudaEventSynchronize(ev_end)); + CUDA_CHECK(cudaEventElapsedTime(×[i], ev_start, ev_end)); + } + + // 排序取中位数,抗干扰 + std::vector sorted_times = times; + std::sort(sorted_times.begin(), sorted_times.end()); + + float total_ms = 0.0f; + float min_ms = sorted_times.front(); + float max_ms = sorted_times.back(); + for (int i = 0; i < repeats; i++) total_ms += times[i]; + float avg_ms = total_ms / repeats; + float median_ms = sorted_times[repeats / 2]; + + // 有效内存带宽 (基于中位数) + double read_bytes = (double)packed_size + data.num_blocks + data.num_groups * 2 + 256 * 2; + double write_bytes = (double)output_bytes; + double total_bytes = read_bytes + write_bytes; + double bandwidth_gbps = total_bytes / (median_ms * 1e-3) / 1e9; + + printf("\n========================================\n"); + printf(" NF4 反量化 Kernel 性能\n"); + printf("========================================\n"); + printf(" 矩阵大小 : (%ld, %ld)\n", (long)data.num_rows, (long)data.num_cols); + printf(" 块大小 : %d\n", data.blocksize); + printf(" 输出类型 : %s\n", compute_type.c_str()); + printf(" 平均耗时 : %.4f ms\n", avg_ms); + printf(" 中位数耗时 : %.4f ms\n", median_ms); + printf(" 最小耗时 : %.4f ms\n", min_ms); + printf(" 最大耗时 : %.4f ms\n", max_ms); + printf(" 有效带宽 : %.2f GB/s (基于中位数)\n", bandwidth_gbps); + printf("========================================\n"); + + // 写出结果 + std::vector h_output(output_bytes); + CUDA_CHECK(cudaMemcpy(h_output.data(), d_output, output_bytes, cudaMemcpyDeviceToHost)); + + FILE* fout = fopen(output_file, "wb"); + if (!fout) { + fprintf(stderr, "[ERROR] Cannot open output file: %s\n", output_file); + return 1; + } + fwrite(h_output.data(), 1, output_bytes, fout); + fclose(fout); + printf("\n[INFO] 已写入解量化输出: %s (%ld bytes)\n", output_file, (long)output_bytes); + + // 清理 + cudaEventDestroy(ev_start); + cudaEventDestroy(ev_end); + CUDA_CHECK(cudaFree(d_packed_weights)); + CUDA_CHECK(cudaFree(d_absmax_q)); + CUDA_CHECK(cudaFree(d_absmax2)); + CUDA_CHECK(cudaFree(d_code2)); + CUDA_CHECK(cudaFree(d_output)); + + printf("[DONE] 完成\n"); + return 0; +} diff --git a/03_nf4_dequant/xfarawayx/kernel/nf4_dequant_kernel.cuh b/03_nf4_dequant/xfarawayx/kernel/nf4_dequant_kernel.cuh new file mode 100644 index 0000000..eafd686 --- /dev/null +++ b/03_nf4_dequant/xfarawayx/kernel/nf4_dequant_kernel.cuh @@ -0,0 +1,175 @@ +#pragma once + +#include +#include +#include +#include + +// NF4 码表 (bitsandbytes create_normal_map),kernel 启动时加载到 shared memory +__constant__ float NF4_DEQUANT_TABLE[16] = { + -1.0f, + -0.6961928009986877f, + -0.5250730514526367f, + -0.39491748809814453f, + -0.28444138169288635f, + -0.18477343022823334f, + -0.09105003625154495f, + 0.0f, + 0.07958029955625534f, + 0.16093020141124725f, + 0.24611230194568634f, + 0.33791524171829224f, + 0.44070982933044434f, + 0.5626170039176941f, + 0.7229568362236023f, + 1.0f +}; + +// float → half / __nv_bfloat16 模板转换 +template +__device__ __forceinline__ OutT nf4_cast_from_float(float x); + +template <> +__device__ __forceinline__ half nf4_cast_from_float(float x) { + return __float2half(x); +} + +template <> +__device__ __forceinline__ __nv_bfloat16 nf4_cast_from_float<__nv_bfloat16>(float x) { + return __float2bfloat16(x); +} + +// 取 half/__nv_bfloat16 的原始 16-bit 位表示 +template +__device__ __forceinline__ uint16_t nf4_raw_bits(OutT v) { + return *reinterpret_cast(&v); +} + +// log2(x),要求 x 为 2 的幂 +inline int log2_pow2(int x) { + int r = 0; + while (x > 1) { x >>= 1; r++; } + return r; +} + +// NF4 双重量化反量化 kernel +// 线程映射: 1 thread → 4 packed bytes → 8 output elements +// absmax_real = code2[absmax_q[block_idx]] * absmax2[group_idx] + offset +// output[i] = NF4_TABLE[index] * absmax_real + +template +__global__ void nf4_dequantize_kernel( + const uint8_t* __restrict__ packed_weights, // [n/2] 每字节 2 个 4-bit 索引 + const uint8_t* __restrict__ absmax_q, // [num_blocks] 一级缩放(二次量化后) + const half* __restrict__ absmax2, // [num_groups] 二级缩放因子 + const half* __restrict__ code2, // [256] 二级码表: uint8 → float16 + float offset, // 二级量化偏移 + int log2_blocksize, // log2(blocksize),用位移代替除法 + int log2_s2_blocksize, // log2(s2_blocksize) + int64_t n_elements, // 总元素数 M*N + OutT* __restrict__ output // [n] 反量化输出 +) +{ + // NF4 码表加载到 shared memory,避免 constant memory 的 warp 串行化 + __shared__ float s_nf4_table[16]; + if (threadIdx.x < 16) { + s_nf4_table[threadIdx.x] = NF4_DEQUANT_TABLE[threadIdx.x]; + } + __syncthreads(); + + // 每线程处理 4 packed bytes = 8 输出元素 + int tid_vec = blockIdx.x * blockDim.x + threadIdx.x; + int n_packed = (int)((n_elements + 1) / 2); + + if (tid_vec >= (n_packed + 3) / 4) return; + + // 向量化读 4 字节,尾部不足时逐字节回退 + int byte_offset = tid_vec * 4; + uint32_t packed4; + if (byte_offset + 4 <= n_packed) { + packed4 = reinterpret_cast(packed_weights)[tid_vec]; + } else { + packed4 = 0; + for (int b = 0; b < 4 && byte_offset + b < n_packed; b++) { + packed4 |= ((uint32_t)packed_weights[byte_offset + b]) << (b << 3); + } + } + + int elem_base = tid_vec * 8; + + uint32_t out_packed[4]; + + #pragma unroll + for (int b = 0; b < 4; b++) { + int elem0 = elem_base + b * 2; + int elem1 = elem0 + 1; + + // 解包高 4 位 / 低 4 位索引,查 NF4 码表 + uint8_t packed_byte = (packed4 >> (b * 8)) & 0xFF; + uint8_t idx_hi = (packed_byte >> 4) & 0x0F; + uint8_t idx_lo = packed_byte & 0x0F; + + float val_hi = s_nf4_table[idx_hi]; + float val_lo = s_nf4_table[idx_lo]; + + // 双重量化反解: absmax_real = code2[absmax_q[block_idx]] * absmax2[group_idx] + offset + int block_idx0 = elem0 >> log2_blocksize; + int group_idx0 = block_idx0 >> log2_s2_blocksize; + + uint8_t aq0 = absmax_q[block_idx0]; + float absmax_real0 = __half2float(code2[aq0]) + * __half2float(absmax2[group_idx0]) + + offset; + + OutT out0, out1; + + if (elem0 < n_elements) { + float dq0 = val_hi * absmax_real0; + out0 = nf4_cast_from_float(dq0); + } else { + out0 = nf4_cast_from_float(0.0f); + } + + if (elem1 < n_elements) { + // 相邻元素大概率同块,跨块时才重新计算 absmax + int block_idx1 = elem1 >> log2_blocksize; + float absmax_real1; + if (block_idx1 == block_idx0) { + absmax_real1 = absmax_real0; + } else { + uint8_t aq1 = absmax_q[block_idx1]; + int group_idx1 = block_idx1 >> log2_s2_blocksize; + absmax_real1 = __half2float(code2[aq1]) + * __half2float(absmax2[group_idx1]) + + offset; + } + float dq1 = val_lo * absmax_real1; + out1 = nf4_cast_from_float(dq1); + } else { + out1 = nf4_cast_from_float(0.0f); + } + + // 两个 fp16/bf16 打包为一个 uint32_t + uint16_t bits0 = nf4_raw_bits(out0); + uint16_t bits1 = nf4_raw_bits(out1); + out_packed[b] = (uint32_t)bits0 | ((uint32_t)bits1 << 16); + } + + // 向量化写入: 完整 4-pack 用 uint4 (128-bit) 一次写出,尾部逐个写 + int out_base = tid_vec * 4; + uint32_t* out_u32 = reinterpret_cast(output); + + int valid_packs = 0; + for (int b = 0; b < 4; b++) { + if (byte_offset + b < n_packed) valid_packs++; + } + + if (valid_packs == 4) { + reinterpret_cast(out_u32)[tid_vec] = + make_uint4(out_packed[0], out_packed[1], out_packed[2], out_packed[3]); + } else { + for (int b = 0; b < valid_packs; b++) { + out_u32[out_base + b] = out_packed[b]; + } + } +} diff --git a/03_nf4_dequant/xfarawayx/kernel/run_test_ncu.sh b/03_nf4_dequant/xfarawayx/kernel/run_test_ncu.sh new file mode 100755 index 0000000..05219e1 --- /dev/null +++ b/03_nf4_dequant/xfarawayx/kernel/run_test_ncu.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# Nsight Compute profiling +# 用法: sudo bash kernel/run_test_ncu.sh [run.sh 选项] +# +# 示例: +# sudo bash kernel/run_test_ncu.sh +# sudo bash kernel/run_test_ncu.sh --rows 2048 --cols 2048 + +PROJ_DIR="$(cd "$(dirname "$0")/.." && pwd)" + +sudo ncu \ + --target-processes all \ + -k "nf4_dequantize_kernel" \ + -s 10 -c 1 \ + --set full \ + -o profile_result -f \ + "${PROJ_DIR}/run.sh" "$@" diff --git a/03_nf4_dequant/xfarawayx/kernel_noncuda/iluvatar/Makefile b/03_nf4_dequant/xfarawayx/kernel_noncuda/iluvatar/Makefile new file mode 100644 index 0000000..46c349b --- /dev/null +++ b/03_nf4_dequant/xfarawayx/kernel_noncuda/iluvatar/Makefile @@ -0,0 +1,21 @@ +ILCC ?= clang++ +TARGET ?= nf4_dequant_iluvatar +SRC ?= main.cu + +CXXFLAGS ?= -O3 -std=c++17 -fPIC +INCLUDES ?= -I. -I/usr/local/corex/include +LDFLAGS ?= -L/usr/local/corex/lib64 +LDLIBS ?= -lcudart + +.PHONY: all clean run + +all: $(TARGET) + +$(TARGET): $(SRC) nf4_dequant_kernel.cuh + $(ILCC) $(CXXFLAGS) $(INCLUDES) $(LDFLAGS) -o $@ $(SRC) $(LDLIBS) + +run: $(TARGET) + @echo "Usage: ./$(TARGET) [bf16|fp16] [warmup] [repeats]" + +clean: + rm -f $(TARGET) diff --git a/03_nf4_dequant/xfarawayx/kernel_noncuda/iluvatar/README.md b/03_nf4_dequant/xfarawayx/kernel_noncuda/iluvatar/README.md new file mode 100644 index 0000000..9ebcf74 --- /dev/null +++ b/03_nf4_dequant/xfarawayx/kernel_noncuda/iluvatar/README.md @@ -0,0 +1,46 @@ +# 天数智芯 (Iluvatar) 适配版 NF4 反量化 + +该目录是对 `kernel/` 中 CUDA 版本的平移适配,目标是在天数智芯环境优先使用 CUDA 兼容编译链跑通。 + +当前策略: + +- 默认编译器使用 `clang++`(可通过 `ILCC` 覆盖) +- 源码保持 `.cu` 形式,便于复用 CUDA 风格 kernel 与运行时 API +- 运行时 API 维持 `cuda*` 命名,依赖目标机提供 CUDA 兼容 SDK +- 保留与原工程一致的二进制输入输出格式,可直接复用 `scripts/verify.py` + +## 目录文件 + +- `main.cu`: 主入口,负责文件 IO / kernel 启动 / 性能统计 +- `nf4_dequant_kernel.cuh`: 适配后的 NF4 反量化 kernel +- `Makefile`: 使用 `ILCC` 构建 `nf4_dequant_iluvatar` +- `run_iluvatar.sh`: 一键 build/run/verify + +## 构建 + +```bash +cd kernel_noncuda/iluvatar +make ILCC=clang++ -j +``` + +## 运行 + +```bash +# 需要已有 data/nf4_weights_*.bin +./nf4_dequant_iluvatar ../../data/nf4_weights_4096x4096_bs64.bin \ + ../../data/iluvatar_output_4096x4096_bs64_fp16.bin \ + fp16 10 100 +``` + +## 一键流程 + +```bash +cd kernel_noncuda/iluvatar +bash run_iluvatar.sh test --rows 4096 --cols 4096 --blocksize 64 --compute_type fp16 +``` + +说明: + +- `run_iluvatar.sh` 默认只消费已有测试数据,不会调用 `generate_data.py`。 + - 可先在 CUDA 机器执行 `./run.sh generate` 生成 `data/` 再拷贝到目标机。 +- `compute_type` 支持 `fp16` 和 `bf16`,输出文件格式与原验证脚本兼容。 \ No newline at end of file diff --git a/03_nf4_dequant/xfarawayx/kernel_noncuda/iluvatar/main.cu b/03_nf4_dequant/xfarawayx/kernel_noncuda/iluvatar/main.cu new file mode 100644 index 0000000..67b5470 --- /dev/null +++ b/03_nf4_dequant/xfarawayx/kernel_noncuda/iluvatar/main.cu @@ -0,0 +1,312 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#if __has_include() +#include +#else +#error "CUDA-compatible runtime header not found. Please install Iluvatar SDK and set include paths." +#endif + +#include "nf4_dequant_kernel.cuh" + +#define ILUVATAR_BACKEND_NAME "ILUVATAR (CUDA-Compatible)" + +#define ILU_CHECK(call) \ + do { \ + cudaError_t err__ = (call); \ + if (err__ != cudaSuccess) { \ + std::fprintf(stderr, "Runtime error at %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(err__)); \ + std::exit(EXIT_FAILURE); \ + } \ + } while (0) + +struct NF4Data { + int64_t num_rows = 0; + int64_t num_cols = 0; + int32_t blocksize = 0; + + std::vector packed_weights; + std::vector absmax_q; + std::vector absmax2; + std::vector code2; + float offset = 0.0f; + + int64_t n_elements = 0; + int32_t num_blocks = 0; + int32_t num_groups = 0; + int32_t s2_blocksize = 0; +}; + +static bool is_power_of_two(int x) { + return x > 0 && ((x & (x - 1)) == 0); +} + +static bool read_nf4_data(const char* filepath, NF4Data& data) { + FILE* f = std::fopen(filepath, "rb"); + if (!f) { + std::fprintf(stderr, "[ERROR] Cannot open file: %s\n", filepath); + return false; + } + + if (std::fread(&data.num_rows, sizeof(int64_t), 1, f) != 1 || + std::fread(&data.num_cols, sizeof(int64_t), 1, f) != 1 || + std::fread(&data.blocksize, sizeof(int32_t), 1, f) != 1) { + std::fclose(f); + std::fprintf(stderr, "[ERROR] Bad header in file: %s\n", filepath); + return false; + } + + data.n_elements = data.num_rows * data.num_cols; + data.num_blocks = (int32_t)((data.n_elements + data.blocksize - 1) / data.blocksize); + + int64_t packed_size = data.n_elements / 2; + data.packed_weights.resize(packed_size); + if (std::fread(data.packed_weights.data(), 1, packed_size, f) != (size_t)packed_size) { + std::fclose(f); + std::fprintf(stderr, "[ERROR] Bad packed data in file: %s\n", filepath); + return false; + } + + data.absmax_q.resize(data.num_blocks); + if (std::fread(data.absmax_q.data(), 1, data.num_blocks, f) != (size_t)data.num_blocks) { + std::fclose(f); + std::fprintf(stderr, "[ERROR] Bad absmax_q in file: %s\n", filepath); + return false; + } + + long current_pos = std::ftell(f); + std::fseek(f, 0, SEEK_END); + long file_size = std::ftell(f); + std::fseek(f, current_pos, SEEK_SET); + + long remaining = file_size - current_pos; + long fixed_tail = 256 * 2 + 4; + long absmax2_bytes = remaining - fixed_tail; + + if (absmax2_bytes <= 0 || (absmax2_bytes % 2) != 0) { + std::fclose(f); + std::fprintf(stderr, "[ERROR] Invalid absmax2 segment in file: %s\n", filepath); + return false; + } + + data.num_groups = (int32_t)(absmax2_bytes / 2); + data.s2_blocksize = (data.num_blocks + data.num_groups - 1) / data.num_groups; + + data.absmax2.resize(data.num_groups); + if (std::fread(data.absmax2.data(), 2, data.num_groups, f) != (size_t)data.num_groups) { + std::fclose(f); + std::fprintf(stderr, "[ERROR] Bad absmax2 in file: %s\n", filepath); + return false; + } + + data.code2.resize(256); + if (std::fread(data.code2.data(), 2, 256, f) != 256) { + std::fclose(f); + std::fprintf(stderr, "[ERROR] Bad code2 in file: %s\n", filepath); + return false; + } + + if (std::fread(&data.offset, sizeof(float), 1, f) != 1) { + std::fclose(f); + std::fprintf(stderr, "[ERROR] Missing offset in file: %s\n", filepath); + return false; + } + + std::fclose(f); + return true; +} + +int main(int argc, char* argv[]) { + if (argc < 3) { + std::fprintf(stderr, "Usage: %s [bf16|fp16] [warmup] [repeats]\n", argv[0]); + return 1; + } + + const char* weight_file = argv[1]; + const char* output_file = argv[2]; + std::string compute_type = (argc > 3) ? argv[3] : "bf16"; + int warmup = (argc > 4) ? std::atoi(argv[4]) : 10; + int repeats = (argc > 5) ? std::atoi(argv[5]) : 100; + + bool use_bf16 = (compute_type == "bf16"); + if (!use_bf16 && compute_type != "fp16") { + std::fprintf(stderr, "[ERROR] compute_type must be bf16 or fp16\n"); + return 1; + } + + std::printf("[INFO] Backend: %s\n", ILUVATAR_BACKEND_NAME); + std::printf("[INFO] Loading weight file: %s\n", weight_file); + + NF4Data data; + if (!read_nf4_data(weight_file, data)) { + return 1; + } + + std::printf(" num_rows = %ld\n", (long)data.num_rows); + std::printf(" num_cols = %ld\n", (long)data.num_cols); + std::printf(" blocksize = %d\n", data.blocksize); + std::printf(" n_elements = %ld\n", (long)data.n_elements); + std::printf(" num_blocks = %d\n", data.num_blocks); + std::printf(" num_groups = %d\n", data.num_groups); + std::printf(" s2_blocksize = %d\n", data.s2_blocksize); + std::printf(" offset = %f\n", data.offset); + std::printf(" compute_type = %s\n", compute_type.c_str()); + + if (!is_power_of_two(data.blocksize) || !is_power_of_two(data.s2_blocksize)) { + std::fprintf(stderr, + "[ERROR] blocksize and s2_blocksize must be powers of two. got blocksize=%d s2_blocksize=%d\n", + data.blocksize, data.s2_blocksize); + return 1; + } + + uint8_t* d_packed_weights = nullptr; + uint8_t* d_absmax_q = nullptr; + uint16_t* d_absmax2 = nullptr; + uint16_t* d_code2 = nullptr; + uint16_t* d_output_bits = nullptr; + + int64_t packed_size = data.n_elements / 2; + int64_t output_bytes = data.n_elements * 2; + + ILU_CHECK(cudaMalloc((void**)&d_packed_weights, packed_size)); + ILU_CHECK(cudaMalloc((void**)&d_absmax_q, data.num_blocks)); + ILU_CHECK(cudaMalloc((void**)&d_absmax2, data.num_groups * sizeof(uint16_t))); + ILU_CHECK(cudaMalloc((void**)&d_code2, 256 * sizeof(uint16_t))); + ILU_CHECK(cudaMalloc((void**)&d_output_bits, output_bytes)); + + ILU_CHECK(cudaMemcpy(d_packed_weights, data.packed_weights.data(), + packed_size, cudaMemcpyHostToDevice)); + ILU_CHECK(cudaMemcpy(d_absmax_q, data.absmax_q.data(), + data.num_blocks, cudaMemcpyHostToDevice)); + ILU_CHECK(cudaMemcpy(d_absmax2, data.absmax2.data(), + data.num_groups * sizeof(uint16_t), cudaMemcpyHostToDevice)); + ILU_CHECK(cudaMemcpy(d_code2, data.code2.data(), + 256 * sizeof(uint16_t), cudaMemcpyHostToDevice)); + + int n_packed = (int)((data.n_elements + 1) / 2); + int n_packed_vec = (n_packed + 3) / 4; + int threads_per_block = 256; + int num_blocks_kernel = (n_packed_vec + threads_per_block - 1) / threads_per_block; + int log2_bs = log2_pow2(data.blocksize); + int log2_s2 = log2_pow2(data.s2_blocksize); + + std::printf("\n[INFO] Kernel config:\n"); + std::printf(" n_packed = %d\n", n_packed); + std::printf(" n_packed_vec = %d\n", n_packed_vec); + std::printf(" threads_per_block = %d\n", threads_per_block); + std::printf(" grid_size = %d\n", num_blocks_kernel); + std::printf(" log2_blocksize = %d\n", log2_bs); + std::printf(" log2_s2_blocksize = %d\n", log2_s2); + + std::printf("\n[INFO] Warmup %d iterations...\n", warmup); + for (int i = 0; i < warmup; ++i) { + if (use_bf16) { + nf4_dequantize_kernel<<>>( + d_packed_weights, d_absmax_q, d_absmax2, d_code2, + data.offset, log2_bs, log2_s2, + data.n_elements, d_output_bits); + } else { + nf4_dequantize_kernel<<>>( + d_packed_weights, d_absmax_q, d_absmax2, d_code2, + data.offset, log2_bs, log2_s2, + data.n_elements, d_output_bits); + } + ILU_CHECK(cudaGetLastError()); + } + ILU_CHECK(cudaDeviceSynchronize()); + + std::printf("[INFO] Timing %d iterations...\n", repeats); + + cudaEvent_t ev_start; + cudaEvent_t ev_end; + ILU_CHECK(cudaEventCreate(&ev_start)); + ILU_CHECK(cudaEventCreate(&ev_end)); + + std::vector times(repeats); + + for (int i = 0; i < repeats; ++i) { + ILU_CHECK(cudaDeviceSynchronize()); + ILU_CHECK(cudaEventRecord(ev_start)); + + if (use_bf16) { + nf4_dequantize_kernel<<>>( + d_packed_weights, d_absmax_q, d_absmax2, d_code2, + data.offset, log2_bs, log2_s2, + data.n_elements, d_output_bits); + } else { + nf4_dequantize_kernel<<>>( + d_packed_weights, d_absmax_q, d_absmax2, d_code2, + data.offset, log2_bs, log2_s2, + data.n_elements, d_output_bits); + } + + ILU_CHECK(cudaGetLastError()); + ILU_CHECK(cudaEventRecord(ev_end)); + ILU_CHECK(cudaEventSynchronize(ev_end)); + ILU_CHECK(cudaEventElapsedTime(×[i], ev_start, ev_end)); + } + + std::vector sorted_times = times; + std::sort(sorted_times.begin(), sorted_times.end()); + + float total_ms = 0.0f; + for (float t : times) { + total_ms += t; + } + + float min_ms = sorted_times.front(); + float max_ms = sorted_times.back(); + float avg_ms = total_ms / repeats; + float median_ms = sorted_times[repeats / 2]; + + double read_bytes = (double)packed_size + data.num_blocks + + data.num_groups * 2.0 + 256.0 * 2.0; + double write_bytes = (double)output_bytes; + double total_bytes = read_bytes + write_bytes; + double bandwidth_gbps = total_bytes / (median_ms * 1e-3) / 1e9; + + std::printf("\n========================================\n"); + std::printf(" NF4 Dequant Kernel Performance (ILUVATAR)\n"); + std::printf("========================================\n"); + std::printf(" matrix shape : (%ld, %ld)\n", (long)data.num_rows, (long)data.num_cols); + std::printf(" blocksize : %d\n", data.blocksize); + std::printf(" output type : %s\n", compute_type.c_str()); + std::printf(" avg latency : %.4f ms\n", avg_ms); + std::printf(" median latency : %.4f ms\n", median_ms); + std::printf(" min latency : %.4f ms\n", min_ms); + std::printf(" max latency : %.4f ms\n", max_ms); + std::printf(" bandwidth : %.2f GB/s (median)\n", bandwidth_gbps); + std::printf("========================================\n"); + + std::vector h_output_bits(data.n_elements); + ILU_CHECK(cudaMemcpy(h_output_bits.data(), d_output_bits, output_bytes, cudaMemcpyDeviceToHost)); + + FILE* fout = std::fopen(output_file, "wb"); + if (!fout) { + std::fprintf(stderr, "[ERROR] Cannot open output file: %s\n", output_file); + return 1; + } + + std::fwrite(h_output_bits.data(), sizeof(uint16_t), h_output_bits.size(), fout); + std::fclose(fout); + + std::printf("\n[INFO] Wrote output: %s (%ld bytes)\n", output_file, (long)output_bytes); + + ILU_CHECK(cudaEventDestroy(ev_start)); + ILU_CHECK(cudaEventDestroy(ev_end)); + ILU_CHECK(cudaFree(d_packed_weights)); + ILU_CHECK(cudaFree(d_absmax_q)); + ILU_CHECK(cudaFree(d_absmax2)); + ILU_CHECK(cudaFree(d_code2)); + ILU_CHECK(cudaFree(d_output_bits)); + + std::printf("[DONE] Finished\n"); + return 0; +} diff --git a/03_nf4_dequant/xfarawayx/kernel_noncuda/iluvatar/nf4_dequant_kernel.cuh b/03_nf4_dequant/xfarawayx/kernel_noncuda/iluvatar/nf4_dequant_kernel.cuh new file mode 100644 index 0000000..c4449c8 --- /dev/null +++ b/03_nf4_dequant/xfarawayx/kernel_noncuda/iluvatar/nf4_dequant_kernel.cuh @@ -0,0 +1,216 @@ +#pragma once + +#include + +__constant__ float NF4_DEQUANT_TABLE[16] = { + -1.0f, + -0.6961928009986877f, + -0.5250730514526367f, + -0.39491748809814453f, + -0.28444138169288635f, + -0.18477343022823334f, + -0.09105003625154495f, + 0.0f, + 0.07958029955625534f, + 0.16093020141124725f, + 0.24611230194568634f, + 0.33791524171829224f, + 0.44070982933044434f, + 0.5626170039176941f, + 0.7229568362236023f, + 1.0f +}; + +inline int log2_pow2(int x) { + int r = 0; + while (x > 1) { + x >>= 1; + r++; + } + return r; +} + +__device__ __forceinline__ uint32_t float_to_bits(float v) { + union { + float f; + uint32_t u; + } x; + x.f = v; + return x.u; +} + +__device__ __forceinline__ float bits_to_float(uint32_t v) { + union { + float f; + uint32_t u; + } x; + x.u = v; + return x.f; +} + +__device__ __forceinline__ float half_bits_to_float(uint16_t h) { + uint32_t sign = (uint32_t)(h & 0x8000u) << 16; + uint32_t exp = (h >> 10) & 0x1Fu; + uint32_t mant = h & 0x03FFu; + + uint32_t out; + if (exp == 0) { + if (mant == 0) { + out = sign; + } else { + exp = 127 - 15 + 1; + while ((mant & 0x0400u) == 0) { + mant <<= 1; + exp--; + } + mant &= 0x03FFu; + out = sign | (exp << 23) | (mant << 13); + } + } else if (exp == 0x1Fu) { + out = sign | 0x7F800000u | (mant << 13); + } else { + out = sign | ((exp + (127 - 15)) << 23) | (mant << 13); + } + + return bits_to_float(out); +} + +__device__ __forceinline__ uint16_t float_to_half_bits(float v) { + uint32_t x = float_to_bits(v); + uint32_t sign = (x >> 16) & 0x8000u; + int32_t exp = (int32_t)((x >> 23) & 0xFFu) - 127 + 15; + uint32_t mant = x & 0x7FFFFFu; + + if (exp <= 0) { + if (exp < -10) { + return (uint16_t)sign; + } + mant = (mant | 0x800000u) >> (1 - exp); + if ((mant & 0x00001000u) != 0) { + mant += 0x00002000u; + } + return (uint16_t)(sign | (mant >> 13)); + } + + if (exp >= 31) { + return (uint16_t)(sign | 0x7C00u); + } + + uint32_t out = sign | ((uint32_t)exp << 10) | (mant >> 13); + if ((mant & 0x00001000u) != 0) { + out += 1; + } + return (uint16_t)out; +} + +__device__ __forceinline__ uint16_t float_to_bf16_bits(float v) { + uint32_t x = float_to_bits(v); + uint32_t lsb = (x >> 16) & 1u; + x += 0x7FFFu + lsb; + return (uint16_t)(x >> 16); +} + +template +__global__ void nf4_dequantize_kernel( + const uint8_t* __restrict__ packed_weights, + const uint8_t* __restrict__ absmax_q, + const uint16_t* __restrict__ absmax2, + const uint16_t* __restrict__ code2, + float offset, + int log2_blocksize, + int log2_s2_blocksize, + int64_t n_elements, + uint16_t* __restrict__ output_bits) { + __shared__ float s_nf4_table[16]; + if (threadIdx.x < 16) { + s_nf4_table[threadIdx.x] = NF4_DEQUANT_TABLE[threadIdx.x]; + } + __syncthreads(); + + int tid_vec = blockIdx.x * blockDim.x + threadIdx.x; + int n_packed = (int)((n_elements + 1) / 2); + int n_packed_vec = (n_packed + 3) / 4; + if (tid_vec >= n_packed_vec) { + return; + } + + int byte_offset = tid_vec * 4; + uint32_t packed4 = 0; + if (byte_offset + 4 <= n_packed) { + packed4 = reinterpret_cast(packed_weights)[tid_vec]; + } else { + for (int b = 0; b < 4 && byte_offset + b < n_packed; ++b) { + packed4 |= ((uint32_t)packed_weights[byte_offset + b]) << (b << 3); + } + } + + int elem_base = tid_vec * 8; + uint32_t out_packed[4]; + + #pragma unroll + for (int b = 0; b < 4; ++b) { + int elem0 = elem_base + b * 2; + int elem1 = elem0 + 1; + + uint8_t packed_byte = (packed4 >> (b * 8)) & 0xFF; + uint8_t idx_hi = (packed_byte >> 4) & 0x0F; + uint8_t idx_lo = packed_byte & 0x0F; + + float val_hi = s_nf4_table[idx_hi]; + float val_lo = s_nf4_table[idx_lo]; + + int block_idx0 = elem0 >> log2_blocksize; + int group_idx0 = block_idx0 >> log2_s2_blocksize; + uint8_t aq0 = absmax_q[block_idx0]; + + float absmax_real0 = half_bits_to_float(code2[aq0]) + * half_bits_to_float(absmax2[group_idx0]) + + offset; + + uint16_t out0; + if (elem0 < n_elements) { + float dq0 = val_hi * absmax_real0; + out0 = OUTPUT_BF16 ? float_to_bf16_bits(dq0) : float_to_half_bits(dq0); + } else { + out0 = OUTPUT_BF16 ? float_to_bf16_bits(0.0f) : float_to_half_bits(0.0f); + } + + uint16_t out1; + if (elem1 < n_elements) { + int block_idx1 = elem1 >> log2_blocksize; + float absmax_real1; + if (block_idx1 == block_idx0) { + absmax_real1 = absmax_real0; + } else { + uint8_t aq1 = absmax_q[block_idx1]; + int group_idx1 = block_idx1 >> log2_s2_blocksize; + absmax_real1 = half_bits_to_float(code2[aq1]) + * half_bits_to_float(absmax2[group_idx1]) + + offset; + } + float dq1 = val_lo * absmax_real1; + out1 = OUTPUT_BF16 ? float_to_bf16_bits(dq1) : float_to_half_bits(dq1); + } else { + out1 = OUTPUT_BF16 ? float_to_bf16_bits(0.0f) : float_to_half_bits(0.0f); + } + + out_packed[b] = (uint32_t)out0 | ((uint32_t)out1 << 16); + } + + int out_base = tid_vec * 4; + uint32_t* out_u32 = reinterpret_cast(output_bits); + + int valid_packs = 0; + for (int b = 0; b < 4; ++b) { + if (byte_offset + b < n_packed) { + valid_packs++; + } + } + + #pragma unroll + for (int b = 0; b < 4; ++b) { + if (b < valid_packs) { + out_u32[out_base + b] = out_packed[b]; + } + } +} diff --git a/03_nf4_dequant/xfarawayx/kernel_noncuda/iluvatar/run_iluvatar.sh b/03_nf4_dequant/xfarawayx/kernel_noncuda/iluvatar/run_iluvatar.sh new file mode 100644 index 0000000..4433707 --- /dev/null +++ b/03_nf4_dequant/xfarawayx/kernel_noncuda/iluvatar/run_iluvatar.sh @@ -0,0 +1,105 @@ +#!/bin/bash +set -e +set -o pipefail + +PROJ_DIR="$(cd "$(dirname "$0")/../.." && pwd)" +KERNEL_DIR="${PROJ_DIR}/kernel_noncuda/iluvatar" +SCRIPTS_DIR="${PROJ_DIR}/scripts" +DATA_DIR="${PROJ_DIR}/data" + +if [ -x "${PROJ_DIR}/.venv/bin/python" ]; then + PYTHON="${PROJ_DIR}/.venv/bin/python" +elif command -v python3 >/dev/null 2>&1; then + PYTHON="$(command -v python3)" +else + echo "[ERROR] 找不到 python3" + exit 1 +fi + +ROWS=4096 +COLS=4096 +BLOCKSIZE=64 +COMPUTE_TYPE="bf16" +WARMUP=10 +REPEATS=100 +ILCC_BIN="${ILCC:-clang++}" +COMMAND="test" + +if [[ "$#" -gt 0 && ! "$1" == --* ]]; then + COMMAND="$1" + shift +fi + +while [[ "$#" -gt 0 ]]; do + case "$1" in + --rows) ROWS="$2"; shift ;; + --cols) COLS="$2"; shift ;; + --blocksize) BLOCKSIZE="$2"; shift ;; + --compute_type) COMPUTE_TYPE="$2"; shift ;; + --warmup) WARMUP="$2"; shift ;; + --repeats) REPEATS="$2"; shift ;; + --ilcc) ILCC_BIN="$2"; shift ;; + *) echo "[ERROR] 未知参数: $1"; exit 1 ;; + esac + shift +done + +TAG="${ROWS}x${COLS}_bs${BLOCKSIZE}" +WEIGHT_FILE="${DATA_DIR}/nf4_weights_${TAG}.bin" +REF_FILE="${DATA_DIR}/nf4_ref_output_${TAG}_${COMPUTE_TYPE}.bin" +ILUVATAR_OUTPUT="${DATA_DIR}/iluvatar_output_${TAG}_${COMPUTE_TYPE}.bin" + +build_kernel() { + echo "[build] 使用编译器: ${ILCC_BIN}" + make -C "${KERNEL_DIR}" clean >/dev/null + make -C "${KERNEL_DIR}" ILCC="${ILCC_BIN}" -j"$(nproc)" +} + +run_kernel() { + if [ ! -f "${WEIGHT_FILE}" ]; then + echo "[ERROR] 缺少权重文件: ${WEIGHT_FILE}" + echo " 请先在支持 CUDA 的环境执行 ./run.sh generate 生成数据" + exit 1 + fi + + "${KERNEL_DIR}/nf4_dequant_iluvatar" \ + "${WEIGHT_FILE}" "${ILUVATAR_OUTPUT}" "${COMPUTE_TYPE}" "${WARMUP}" "${REPEATS}" +} + +verify_output() { + if [ ! -f "${REF_FILE}" ]; then + echo "[WARN] 缺少参考文件: ${REF_FILE}" + echo " 跳过 verify。可先在 CUDA 环境运行 ./run.sh generate --compute_type ${COMPUTE_TYPE}" + return 0 + fi + + "${PYTHON}" "${SCRIPTS_DIR}/verify.py" \ + --weight_file "${WEIGHT_FILE}" \ + --ref_file "${REF_FILE}" \ + --cuda_file "${ILUVATAR_OUTPUT}" \ + --compute_type "${COMPUTE_TYPE}" +} + +case "${COMMAND}" in + build) + build_kernel + ;; + run) + run_kernel + ;; + verify) + verify_output + ;; + test) + build_kernel + run_kernel + verify_output + ;; + *) + echo "[ERROR] 未知命令: ${COMMAND}" + echo "可用命令: build | run | verify | test" + exit 1 + ;; +esac + +echo "[DONE] ${COMMAND} 完成" diff --git a/03_nf4_dequant/xfarawayx/kernel_noncuda/moore/Makefile b/03_nf4_dequant/xfarawayx/kernel_noncuda/moore/Makefile new file mode 100644 index 0000000..dc34f83 --- /dev/null +++ b/03_nf4_dequant/xfarawayx/kernel_noncuda/moore/Makefile @@ -0,0 +1,21 @@ +MCC ?= mcc +TARGET ?= nf4_dequant_musa +SRC ?= main.mu + +CXXFLAGS ?= -O3 -std=c++17 +INCLUDES ?= -I. +LDFLAGS ?= +LDLIBS ?= -lmusart + +.PHONY: all clean run + +all: $(TARGET) + +$(TARGET): $(SRC) nf4_dequant_kernel.mu + $(MCC) $(CXXFLAGS) $(INCLUDES) $(LDFLAGS) -o $@ $(SRC) $(LDLIBS) + +run: $(TARGET) + @echo "Usage: ./$(TARGET) [bf16|fp16] [warmup] [repeats]" + +clean: + rm -f $(TARGET) diff --git a/03_nf4_dequant/xfarawayx/kernel_noncuda/moore/README.md b/03_nf4_dequant/xfarawayx/kernel_noncuda/moore/README.md new file mode 100644 index 0000000..088e9fe --- /dev/null +++ b/03_nf4_dequant/xfarawayx/kernel_noncuda/moore/README.md @@ -0,0 +1,46 @@ +# 摩尔线程 (MUSA) 适配版 NF4 反量化 + +该目录是对 `kernel/` 中 CUDA 版本的平移适配,目标是运行在摩尔线程 GPU 环境。 + +## 适配要点 + +- 编译器从 `nvcc` 切换为 `mcc` +- 主源码使用 `.mu` 后缀 +- 运行时 API 使用 `musa*`(例如 `musaMalloc` / `musaMemcpy`) +- 保留与原工程一致的二进制输入输出格式,直接复用 `scripts/verify.py` + +## 目录文件 + +- `main.mu`: 主入口,负责文件 IO / kernel 启动 / 性能统计 +- `nf4_dequant_kernel.mu`: NF4 反量化 kernel +- `Makefile`: 使用 `mcc` 构建 `nf4_dequant_musa` +- `run_moore.sh`: 一键 build/run/verify + +## 构建 + +```bash +cd kernel_noncuda/moore +make MCC=mcc -j +``` + +## 运行 + +```bash +# 需要已有 data/nf4_weights_*.bin +./nf4_dequant_musa ../../data/nf4_weights_4096x4096_bs64.bin \ + ../../data/moore_output_4096x4096_bs64_fp16.bin \ + fp16 10 100 +``` + +## 一键流程 + +```bash +cd kernel_noncuda/moore +bash run_moore.sh test --rows 4096 --cols 4096 --blocksize 64 --compute_type fp16 +``` + +说明: + +- `run_moore.sh` 默认只消费已有测试数据,不会调用 `generate_data.py`。 + - 可先在另一台 CUDA 机器执行 `./run.sh generate` 生成 `data/` 再拷贝过来。 +- `compute_type` 支持 `fp16` 和 `bf16`,输出文件格式与原验证脚本兼容。 diff --git a/03_nf4_dequant/xfarawayx/kernel_noncuda/moore/main.mu b/03_nf4_dequant/xfarawayx/kernel_noncuda/moore/main.mu new file mode 100644 index 0000000..930b978 --- /dev/null +++ b/03_nf4_dequant/xfarawayx/kernel_noncuda/moore/main.mu @@ -0,0 +1,317 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if __has_include() +#include +#else +#error "MUSA runtime header not found. Please install MUSA SDK and set include paths." +#endif + +#if __has_include() +#include +#endif + +#include "nf4_dequant_kernel.mu" + +#define MOORE_BACKEND_NAME "MUSA" + +#define MUSA_CHECK(call) \ + do { \ + musaError_t err__ = (call); \ + if (err__ != musaSuccess) { \ + std::fprintf(stderr, "MUSA error at %s:%d: %s\n", \ + __FILE__, __LINE__, musaGetErrorString(err__)); \ + std::exit(EXIT_FAILURE); \ + } \ + } while (0) + +struct NF4Data { + int64_t num_rows = 0; + int64_t num_cols = 0; + int32_t blocksize = 0; + + std::vector packed_weights; + std::vector absmax_q; + std::vector absmax2; + std::vector code2; + float offset = 0.0f; + + int64_t n_elements = 0; + int32_t num_blocks = 0; + int32_t num_groups = 0; + int32_t s2_blocksize = 0; +}; + +static bool is_power_of_two(int x) { + return x > 0 && ((x & (x - 1)) == 0); +} + +static bool read_nf4_data(const char* filepath, NF4Data& data) { + FILE* f = std::fopen(filepath, "rb"); + if (!f) { + std::fprintf(stderr, "[ERROR] Cannot open file: %s\n", filepath); + return false; + } + + if (std::fread(&data.num_rows, sizeof(int64_t), 1, f) != 1 || + std::fread(&data.num_cols, sizeof(int64_t), 1, f) != 1 || + std::fread(&data.blocksize, sizeof(int32_t), 1, f) != 1) { + std::fclose(f); + std::fprintf(stderr, "[ERROR] Bad header in file: %s\n", filepath); + return false; + } + + data.n_elements = data.num_rows * data.num_cols; + data.num_blocks = (int32_t)((data.n_elements + data.blocksize - 1) / data.blocksize); + + int64_t packed_size = data.n_elements / 2; + data.packed_weights.resize(packed_size); + if (std::fread(data.packed_weights.data(), 1, packed_size, f) != (size_t)packed_size) { + std::fclose(f); + std::fprintf(stderr, "[ERROR] Bad packed data in file: %s\n", filepath); + return false; + } + + data.absmax_q.resize(data.num_blocks); + if (std::fread(data.absmax_q.data(), 1, data.num_blocks, f) != (size_t)data.num_blocks) { + std::fclose(f); + std::fprintf(stderr, "[ERROR] Bad absmax_q in file: %s\n", filepath); + return false; + } + + long current_pos = std::ftell(f); + std::fseek(f, 0, SEEK_END); + long file_size = std::ftell(f); + std::fseek(f, current_pos, SEEK_SET); + + long remaining = file_size - current_pos; + long fixed_tail = 256 * 2 + 4; + long absmax2_bytes = remaining - fixed_tail; + + if (absmax2_bytes <= 0 || (absmax2_bytes % 2) != 0) { + std::fclose(f); + std::fprintf(stderr, "[ERROR] Invalid absmax2 segment in file: %s\n", filepath); + return false; + } + + data.num_groups = (int32_t)(absmax2_bytes / 2); + data.s2_blocksize = (data.num_blocks + data.num_groups - 1) / data.num_groups; + + data.absmax2.resize(data.num_groups); + if (std::fread(data.absmax2.data(), 2, data.num_groups, f) != (size_t)data.num_groups) { + std::fclose(f); + std::fprintf(stderr, "[ERROR] Bad absmax2 in file: %s\n", filepath); + return false; + } + + data.code2.resize(256); + if (std::fread(data.code2.data(), 2, 256, f) != 256) { + std::fclose(f); + std::fprintf(stderr, "[ERROR] Bad code2 in file: %s\n", filepath); + return false; + } + + if (std::fread(&data.offset, sizeof(float), 1, f) != 1) { + std::fclose(f); + std::fprintf(stderr, "[ERROR] Missing offset in file: %s\n", filepath); + return false; + } + + std::fclose(f); + return true; +} + +int main(int argc, char* argv[]) { + if (argc < 3) { + std::fprintf(stderr, "用法: %s [bf16|fp16] [warmup] [repeats]\n", argv[0]); + return 1; + } + + const char* weight_file = argv[1]; + const char* output_file = argv[2]; + std::string compute_type = (argc > 3) ? argv[3] : "bf16"; + int warmup = (argc > 4) ? std::atoi(argv[4]) : 10; + int repeats = (argc > 5) ? std::atoi(argv[5]) : 100; + + bool use_bf16 = (compute_type == "bf16"); + if (!use_bf16 && compute_type != "fp16") { + std::fprintf(stderr, "[ERROR] compute_type must be bf16 or fp16\n"); + return 1; + } + + std::printf("[INFO] Backend: %s\n", MOORE_BACKEND_NAME); + std::printf("[INFO] 读取权重文件: %s\n", weight_file); + + NF4Data data; + if (!read_nf4_data(weight_file, data)) { + return 1; + } + + std::printf(" num_rows = %ld\n", (long)data.num_rows); + std::printf(" num_cols = %ld\n", (long)data.num_cols); + std::printf(" blocksize = %d\n", data.blocksize); + std::printf(" n_elements = %ld\n", (long)data.n_elements); + std::printf(" num_blocks = %d\n", data.num_blocks); + std::printf(" num_groups = %d\n", data.num_groups); + std::printf(" s2_blocksize = %d\n", data.s2_blocksize); + std::printf(" offset = %f\n", data.offset); + std::printf(" compute_type = %s\n", compute_type.c_str()); + + if (!is_power_of_two(data.blocksize) || !is_power_of_two(data.s2_blocksize)) { + std::fprintf(stderr, + "[ERROR] blocksize and s2_blocksize must be powers of two for log2-shift kernel. got blocksize=%d s2_blocksize=%d\n", + data.blocksize, data.s2_blocksize); + return 1; + } + + uint8_t* d_packed_weights = nullptr; + uint8_t* d_absmax_q = nullptr; + uint16_t* d_absmax2 = nullptr; + uint16_t* d_code2 = nullptr; + uint16_t* d_output_bits = nullptr; + + int64_t packed_size = data.n_elements / 2; + int64_t output_bytes = data.n_elements * 2; + + MUSA_CHECK(musaMalloc((void**)&d_packed_weights, packed_size)); + MUSA_CHECK(musaMalloc((void**)&d_absmax_q, data.num_blocks)); + MUSA_CHECK(musaMalloc((void**)&d_absmax2, data.num_groups * sizeof(uint16_t))); + MUSA_CHECK(musaMalloc((void**)&d_code2, 256 * sizeof(uint16_t))); + MUSA_CHECK(musaMalloc((void**)&d_output_bits, output_bytes)); + + MUSA_CHECK(musaMemcpy(d_packed_weights, data.packed_weights.data(), + packed_size, musaMemcpyHostToDevice)); + MUSA_CHECK(musaMemcpy(d_absmax_q, data.absmax_q.data(), + data.num_blocks, musaMemcpyHostToDevice)); + MUSA_CHECK(musaMemcpy(d_absmax2, data.absmax2.data(), + data.num_groups * sizeof(uint16_t), musaMemcpyHostToDevice)); + MUSA_CHECK(musaMemcpy(d_code2, data.code2.data(), + 256 * sizeof(uint16_t), musaMemcpyHostToDevice)); + + int n_packed = (int)((data.n_elements + 1) / 2); + int n_packed_vec = (n_packed + 3) / 4; + int threads_per_block = 256; + int num_blocks_kernel = (n_packed_vec + threads_per_block - 1) / threads_per_block; + int log2_bs = log2_pow2(data.blocksize); + int log2_s2 = log2_pow2(data.s2_blocksize); + + std::printf("\n[INFO] Kernel 配置:\n"); + std::printf(" n_packed = %d\n", n_packed); + std::printf(" n_packed_vec = %d (向量化后)\n", n_packed_vec); + std::printf(" threads_per_block = %d\n", threads_per_block); + std::printf(" grid_size = %d\n", num_blocks_kernel); + std::printf(" log2_blocksize = %d\n", log2_bs); + std::printf(" log2_s2_blocksize = %d\n", log2_s2); + + std::printf("\n[INFO] 预热 %d 次...\n", warmup); + for (int i = 0; i < warmup; ++i) { + if (use_bf16) { + nf4_dequantize_kernel<<>>( + d_packed_weights, d_absmax_q, d_absmax2, d_code2, + data.offset, log2_bs, log2_s2, + data.n_elements, d_output_bits); + } else { + nf4_dequantize_kernel<<>>( + d_packed_weights, d_absmax_q, d_absmax2, d_code2, + data.offset, log2_bs, log2_s2, + data.n_elements, d_output_bits); + } + MUSA_CHECK(musaGetLastError()); + } + MUSA_CHECK(musaDeviceSynchronize()); + + std::printf("[INFO] 计时 %d 次...\n", repeats); + + musaEvent_t ev_start; + musaEvent_t ev_end; + MUSA_CHECK(musaEventCreate(&ev_start)); + MUSA_CHECK(musaEventCreate(&ev_end)); + + std::vector times(repeats); + + for (int i = 0; i < repeats; ++i) { + MUSA_CHECK(musaDeviceSynchronize()); + MUSA_CHECK(musaEventRecord(ev_start)); + + if (use_bf16) { + nf4_dequantize_kernel<<>>( + d_packed_weights, d_absmax_q, d_absmax2, d_code2, + data.offset, log2_bs, log2_s2, + data.n_elements, d_output_bits); + } else { + nf4_dequantize_kernel<<>>( + d_packed_weights, d_absmax_q, d_absmax2, d_code2, + data.offset, log2_bs, log2_s2, + data.n_elements, d_output_bits); + } + + MUSA_CHECK(musaGetLastError()); + MUSA_CHECK(musaEventRecord(ev_end)); + MUSA_CHECK(musaEventSynchronize(ev_end)); + MUSA_CHECK(musaEventElapsedTime(×[i], ev_start, ev_end)); + } + + std::vector sorted_times = times; + std::sort(sorted_times.begin(), sorted_times.end()); + + float total_ms = 0.0f; + for (float t : times) { + total_ms += t; + } + + float min_ms = sorted_times.front(); + float max_ms = sorted_times.back(); + float avg_ms = total_ms / repeats; + float median_ms = sorted_times[repeats / 2]; + + double read_bytes = (double)packed_size + data.num_blocks + + data.num_groups * 2.0 + 256.0 * 2.0; + double write_bytes = (double)output_bytes; + double total_bytes = read_bytes + write_bytes; + double bandwidth_gbps = total_bytes / (median_ms * 1e-3) / 1e9; + + std::printf("\n========================================\n"); + std::printf(" NF4 反量化 Kernel 性能 (MUSA)\n"); + std::printf("========================================\n"); + std::printf(" 矩阵大小 : (%ld, %ld)\n", (long)data.num_rows, (long)data.num_cols); + std::printf(" 块大小 : %d\n", data.blocksize); + std::printf(" 输出类型 : %s\n", compute_type.c_str()); + std::printf(" 平均耗时 : %.4f ms\n", avg_ms); + std::printf(" 中位数耗时 : %.4f ms\n", median_ms); + std::printf(" 最小耗时 : %.4f ms\n", min_ms); + std::printf(" 最大耗时 : %.4f ms\n", max_ms); + std::printf(" 有效带宽 : %.2f GB/s (基于中位数)\n", bandwidth_gbps); + std::printf("========================================\n"); + + std::vector h_output_bits(data.n_elements); + MUSA_CHECK(musaMemcpy(h_output_bits.data(), d_output_bits, output_bytes, musaMemcpyDeviceToHost)); + + FILE* fout = std::fopen(output_file, "wb"); + if (!fout) { + std::fprintf(stderr, "[ERROR] Cannot open output file: %s\n", output_file); + return 1; + } + + std::fwrite(h_output_bits.data(), sizeof(uint16_t), h_output_bits.size(), fout); + std::fclose(fout); + + std::printf("\n[INFO] 已写入解量化输出: %s (%ld bytes)\n", output_file, (long)output_bytes); + + MUSA_CHECK(musaEventDestroy(ev_start)); + MUSA_CHECK(musaEventDestroy(ev_end)); + MUSA_CHECK(musaFree(d_packed_weights)); + MUSA_CHECK(musaFree(d_absmax_q)); + MUSA_CHECK(musaFree(d_absmax2)); + MUSA_CHECK(musaFree(d_code2)); + MUSA_CHECK(musaFree(d_output_bits)); + + std::printf("[DONE] 完成\n"); + return 0; +} diff --git a/03_nf4_dequant/xfarawayx/kernel_noncuda/moore/nf4_dequant_kernel.mu b/03_nf4_dequant/xfarawayx/kernel_noncuda/moore/nf4_dequant_kernel.mu new file mode 100644 index 0000000..f4b1e4e --- /dev/null +++ b/03_nf4_dequant/xfarawayx/kernel_noncuda/moore/nf4_dequant_kernel.mu @@ -0,0 +1,216 @@ +#pragma once + +#include + +__constant__ float NF4_DEQUANT_TABLE[16] = { + -1.0f, + -0.6961928009986877f, + -0.5250730514526367f, + -0.39491748809814453f, + -0.28444138169288635f, + -0.18477343022823334f, + -0.09105003625154495f, + 0.0f, + 0.07958029955625534f, + 0.16093020141124725f, + 0.24611230194568634f, + 0.33791524171829224f, + 0.44070982933044434f, + 0.5626170039176941f, + 0.7229568362236023f, + 1.0f +}; + +inline int log2_pow2(int x) { + int r = 0; + while (x > 1) { + x >>= 1; + r++; + } + return r; +} + +__device__ __forceinline__ uint32_t float_to_bits(float v) { + union { + float f; + uint32_t u; + } x; + x.f = v; + return x.u; +} + +__device__ __forceinline__ float bits_to_float(uint32_t v) { + union { + float f; + uint32_t u; + } x; + x.u = v; + return x.f; +} + +__device__ __forceinline__ float half_bits_to_float(uint16_t h) { + uint32_t sign = (uint32_t)(h & 0x8000u) << 16; + uint32_t exp = (h >> 10) & 0x1Fu; + uint32_t mant = h & 0x03FFu; + + uint32_t out; + if (exp == 0) { + if (mant == 0) { + out = sign; + } else { + exp = 127 - 15 + 1; + while ((mant & 0x0400u) == 0) { + mant <<= 1; + exp--; + } + mant &= 0x03FFu; + out = sign | (exp << 23) | (mant << 13); + } + } else if (exp == 0x1Fu) { + out = sign | 0x7F800000u | (mant << 13); + } else { + out = sign | ((exp + (127 - 15)) << 23) | (mant << 13); + } + + return bits_to_float(out); +} + +__device__ __forceinline__ uint16_t float_to_half_bits(float v) { + uint32_t x = float_to_bits(v); + uint32_t sign = (x >> 16) & 0x8000u; + int32_t exp = (int32_t)((x >> 23) & 0xFFu) - 127 + 15; + uint32_t mant = x & 0x7FFFFFu; + + if (exp <= 0) { + if (exp < -10) { + return (uint16_t)sign; + } + mant = (mant | 0x800000u) >> (1 - exp); + if ((mant & 0x00001000u) != 0) { + mant += 0x00002000u; + } + return (uint16_t)(sign | (mant >> 13)); + } + + if (exp >= 31) { + return (uint16_t)(sign | 0x7C00u); + } + + uint32_t out = sign | ((uint32_t)exp << 10) | (mant >> 13); + if ((mant & 0x00001000u) != 0) { + out += 1; + } + return (uint16_t)out; +} + +__device__ __forceinline__ uint16_t float_to_bf16_bits(float v) { + uint32_t x = float_to_bits(v); + uint32_t lsb = (x >> 16) & 1u; + x += 0x7FFFu + lsb; + return (uint16_t)(x >> 16); +} + +template +__global__ void nf4_dequantize_kernel( + const uint8_t* __restrict__ packed_weights, + const uint8_t* __restrict__ absmax_q, + const uint16_t* __restrict__ absmax2, + const uint16_t* __restrict__ code2, + float offset, + int log2_blocksize, + int log2_s2_blocksize, + int64_t n_elements, + uint16_t* __restrict__ output_bits) { + __shared__ float s_nf4_table[16]; + if (threadIdx.x < 16) { + s_nf4_table[threadIdx.x] = NF4_DEQUANT_TABLE[threadIdx.x]; + } + __syncthreads(); + + int tid_vec = blockIdx.x * blockDim.x + threadIdx.x; + int n_packed = (int)((n_elements + 1) / 2); + int n_packed_vec = (n_packed + 3) / 4; + if (tid_vec >= n_packed_vec) { + return; + } + + int byte_offset = tid_vec * 4; + uint32_t packed4 = 0; + if (byte_offset + 4 <= n_packed) { + packed4 = reinterpret_cast(packed_weights)[tid_vec]; + } else { + for (int b = 0; b < 4 && byte_offset + b < n_packed; ++b) { + packed4 |= ((uint32_t)packed_weights[byte_offset + b]) << (b << 3); + } + } + + int elem_base = tid_vec * 8; + uint32_t out_packed[4]; + + #pragma unroll + for (int b = 0; b < 4; ++b) { + int elem0 = elem_base + b * 2; + int elem1 = elem0 + 1; + + uint8_t packed_byte = (packed4 >> (b * 8)) & 0xFF; + uint8_t idx_hi = (packed_byte >> 4) & 0x0F; + uint8_t idx_lo = packed_byte & 0x0F; + + float val_hi = s_nf4_table[idx_hi]; + float val_lo = s_nf4_table[idx_lo]; + + int block_idx0 = elem0 >> log2_blocksize; + int group_idx0 = block_idx0 >> log2_s2_blocksize; + uint8_t aq0 = absmax_q[block_idx0]; + + float absmax_real0 = half_bits_to_float(code2[aq0]) + * half_bits_to_float(absmax2[group_idx0]) + + offset; + + uint16_t out0; + if (elem0 < n_elements) { + float dq0 = val_hi * absmax_real0; + out0 = OUTPUT_BF16 ? float_to_bf16_bits(dq0) : float_to_half_bits(dq0); + } else { + out0 = OUTPUT_BF16 ? float_to_bf16_bits(0.0f) : float_to_half_bits(0.0f); + } + + uint16_t out1; + if (elem1 < n_elements) { + int block_idx1 = elem1 >> log2_blocksize; + float absmax_real1; + if (block_idx1 == block_idx0) { + absmax_real1 = absmax_real0; + } else { + uint8_t aq1 = absmax_q[block_idx1]; + int group_idx1 = block_idx1 >> log2_s2_blocksize; + absmax_real1 = half_bits_to_float(code2[aq1]) + * half_bits_to_float(absmax2[group_idx1]) + + offset; + } + float dq1 = val_lo * absmax_real1; + out1 = OUTPUT_BF16 ? float_to_bf16_bits(dq1) : float_to_half_bits(dq1); + } else { + out1 = OUTPUT_BF16 ? float_to_bf16_bits(0.0f) : float_to_half_bits(0.0f); + } + + out_packed[b] = (uint32_t)out0 | ((uint32_t)out1 << 16); + } + + int out_base = tid_vec * 4; + uint32_t* out_u32 = reinterpret_cast(output_bits); + + int valid_packs = 0; + for (int b = 0; b < 4; ++b) { + if (byte_offset + b < n_packed) { + valid_packs++; + } + } + + #pragma unroll + for (int b = 0; b < 4; ++b) { + if (b < valid_packs) { + out_u32[out_base + b] = out_packed[b]; + } + } +} diff --git a/03_nf4_dequant/xfarawayx/kernel_noncuda/moore/nf4_dequant_musa b/03_nf4_dequant/xfarawayx/kernel_noncuda/moore/nf4_dequant_musa new file mode 100644 index 0000000..5096c54 Binary files /dev/null and b/03_nf4_dequant/xfarawayx/kernel_noncuda/moore/nf4_dequant_musa differ diff --git a/03_nf4_dequant/xfarawayx/kernel_noncuda/moore/run_moore.sh b/03_nf4_dequant/xfarawayx/kernel_noncuda/moore/run_moore.sh new file mode 100644 index 0000000..4f033a9 --- /dev/null +++ b/03_nf4_dequant/xfarawayx/kernel_noncuda/moore/run_moore.sh @@ -0,0 +1,105 @@ +#!/bin/bash +set -e +set -o pipefail + +PROJ_DIR="$(cd "$(dirname "$0")/../.." && pwd)" +KERNEL_DIR="${PROJ_DIR}/kernel_noncuda/moore" +SCRIPTS_DIR="${PROJ_DIR}/scripts" +DATA_DIR="${PROJ_DIR}/data" + +if [ -x "${PROJ_DIR}/.venv/bin/python" ]; then + PYTHON="${PROJ_DIR}/.venv/bin/python" +elif command -v python3 >/dev/null 2>&1; then + PYTHON="$(command -v python3)" +else + echo "[ERROR] 找不到 python3" + exit 1 +fi + +ROWS=4096 +COLS=4096 +BLOCKSIZE=64 +COMPUTE_TYPE="bf16" +WARMUP=10 +REPEATS=100 +MCC_BIN="${MCC:-mcc}" +COMMAND="test" + +if [[ "$#" -gt 0 && ! "$1" == --* ]]; then + COMMAND="$1" + shift +fi + +while [[ "$#" -gt 0 ]]; do + case "$1" in + --rows) ROWS="$2"; shift ;; + --cols) COLS="$2"; shift ;; + --blocksize) BLOCKSIZE="$2"; shift ;; + --compute_type) COMPUTE_TYPE="$2"; shift ;; + --warmup) WARMUP="$2"; shift ;; + --repeats) REPEATS="$2"; shift ;; + --mcc) MCC_BIN="$2"; shift ;; + *) echo "[ERROR] 未知参数: $1"; exit 1 ;; + esac + shift +done + +TAG="${ROWS}x${COLS}_bs${BLOCKSIZE}" +WEIGHT_FILE="${DATA_DIR}/nf4_weights_${TAG}.bin" +REF_FILE="${DATA_DIR}/nf4_ref_output_${TAG}_${COMPUTE_TYPE}.bin" +MOORE_OUTPUT="${DATA_DIR}/moore_output_${TAG}_${COMPUTE_TYPE}.bin" + +build_kernel() { + echo "[build] 使用编译器: ${MCC_BIN}" + make -C "${KERNEL_DIR}" clean >/dev/null + make -C "${KERNEL_DIR}" MCC="${MCC_BIN}" -j"$(nproc)" +} + +run_kernel() { + if [ ! -f "${WEIGHT_FILE}" ]; then + echo "[ERROR] 缺少权重文件: ${WEIGHT_FILE}" + echo " 请先在支持 CUDA 的环境执行 ./run.sh generate 生成数据" + exit 1 + fi + + "${KERNEL_DIR}/nf4_dequant_musa" \ + "${WEIGHT_FILE}" "${MOORE_OUTPUT}" "${COMPUTE_TYPE}" "${WARMUP}" "${REPEATS}" +} + +verify_output() { + if [ ! -f "${REF_FILE}" ]; then + echo "[WARN] 缺少参考文件: ${REF_FILE}" + echo " 跳过 verify。可先在 CUDA 环境运行 ./run.sh generate --compute_type ${COMPUTE_TYPE}" + return 0 + fi + + "${PYTHON}" "${SCRIPTS_DIR}/verify.py" \ + --weight_file "${WEIGHT_FILE}" \ + --ref_file "${REF_FILE}" \ + --cuda_file "${MOORE_OUTPUT}" \ + --compute_type "${COMPUTE_TYPE}" +} + +case "${COMMAND}" in + build) + build_kernel + ;; + run) + run_kernel + ;; + verify) + verify_output + ;; + test) + build_kernel + run_kernel + verify_output + ;; + *) + echo "[ERROR] 未知命令: ${COMMAND}" + echo "可用命令: build | run | verify | test" + exit 1 + ;; +esac + +echo "[DONE] ${COMMAND} 完成" diff --git a/03_nf4_dequant/xfarawayx/kernel_noncuda/mutex/Makefile b/03_nf4_dequant/xfarawayx/kernel_noncuda/mutex/Makefile new file mode 100644 index 0000000..38ee0eb --- /dev/null +++ b/03_nf4_dequant/xfarawayx/kernel_noncuda/mutex/Makefile @@ -0,0 +1,19 @@ +MXCC ?= mxcc +TARGET ?= nf4_dequant_maca +SRC ?= main.maca + +CXXFLAGS ?= -O3 -std=c++17 +INCLUDES ?= -I. + +.PHONY: all clean run + +all: $(TARGET) + +$(TARGET): $(SRC) nf4_dequant_kernel.maca + $(MXCC) $(CXXFLAGS) $(INCLUDES) -o $@ $(SRC) + +run: $(TARGET) + @echo "Usage: ./$(TARGET) [bf16|fp16] [warmup] [repeats]" + +clean: + rm -f $(TARGET) diff --git a/03_nf4_dequant/xfarawayx/kernel_noncuda/mutex/README.md b/03_nf4_dequant/xfarawayx/kernel_noncuda/mutex/README.md new file mode 100644 index 0000000..5964061 --- /dev/null +++ b/03_nf4_dequant/xfarawayx/kernel_noncuda/mutex/README.md @@ -0,0 +1,46 @@ +# 沐曦 (MACA) 适配版 NF4 反量化 + +该目录是对 `kernel/` 中 CUDA 版本的平移适配,目标是运行在沐曦 GPU 环境。 + +## 适配要点 + +- 编译器从 `nvcc` 切换为 `mxcc` +- 主源码使用 `.maca` 后缀 +- 运行时 API 使用 `mc*`(例如 `mcMalloc` / `mcMemcpy`) +- 保留与原工程一致的二进制输入输出格式,直接复用 `scripts/verify.py` + +## 目录文件 + +- `main.maca`: 主入口,负责文件 IO / kernel 启动 / 性能统计 +- `nf4_dequant_kernel.maca`: NF4 反量化 kernel +- `Makefile`: 使用 `mxcc` 构建 `nf4_dequant_maca` +- `run_mutex.sh`: 一键 build/run/verify + +## 构建 + +```bash +cd kernel_noncuda/mutex +make MXCC=mxcc -j +``` + +## 运行 + +```bash +# 需要已有 data/nf4_weights_*.bin +./nf4_dequant_maca ../../data/nf4_weights_4096x4096_bs64.bin \ + ../../data/mutex_output_4096x4096_bs64_fp16.bin \ + fp16 10 100 +``` + +## 一键流程 + +```bash +cd kernel_noncuda/mutex +bash run_mutex.sh test --rows 4096 --cols 4096 --blocksize 64 --compute_type fp16 +``` + +说明: + +- `run_mutex.sh` 默认只消费已有测试数据,不会调用 `generate_data.py`。 + - 可先在另一台 CUDA 机器执行 `./run.sh generate` 生成 `data/` 再拷贝过来。 +- `compute_type` 支持 `fp16` 和 `bf16`,输出文件格式与原验证脚本兼容。 diff --git a/03_nf4_dequant/xfarawayx/kernel_noncuda/mutex/main.maca b/03_nf4_dequant/xfarawayx/kernel_noncuda/mutex/main.maca new file mode 100644 index 0000000..10c52eb --- /dev/null +++ b/03_nf4_dequant/xfarawayx/kernel_noncuda/mutex/main.maca @@ -0,0 +1,319 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if __has_include() +#include +#elif __has_include() +#include +#else +#error "MACA runtime header not found. Please install MACA SDK and set include paths." +#endif + +#if __has_include() +#include +#endif + +#include "nf4_dequant_kernel.maca" + +#define MUTEX_BACKEND_NAME "MACA" + +#define MC_CHECK(call) \ + do { \ + mcError_t err__ = (call); \ + if (err__ != mcSuccess) { \ + std::fprintf(stderr, "MC error at %s:%d: %s\n", \ + __FILE__, __LINE__, mcGetErrorString(err__)); \ + std::exit(EXIT_FAILURE); \ + } \ + } while (0) + +struct NF4Data { + int64_t num_rows = 0; + int64_t num_cols = 0; + int32_t blocksize = 0; + + std::vector packed_weights; + std::vector absmax_q; + std::vector absmax2; + std::vector code2; + float offset = 0.0f; + + int64_t n_elements = 0; + int32_t num_blocks = 0; + int32_t num_groups = 0; + int32_t s2_blocksize = 0; +}; + +static bool is_power_of_two(int x) { + return x > 0 && ((x & (x - 1)) == 0); +} + +static bool read_nf4_data(const char* filepath, NF4Data& data) { + FILE* f = std::fopen(filepath, "rb"); + if (!f) { + std::fprintf(stderr, "[ERROR] Cannot open file: %s\n", filepath); + return false; + } + + if (std::fread(&data.num_rows, sizeof(int64_t), 1, f) != 1 || + std::fread(&data.num_cols, sizeof(int64_t), 1, f) != 1 || + std::fread(&data.blocksize, sizeof(int32_t), 1, f) != 1) { + std::fclose(f); + std::fprintf(stderr, "[ERROR] Bad header in file: %s\n", filepath); + return false; + } + + data.n_elements = data.num_rows * data.num_cols; + data.num_blocks = (int32_t)((data.n_elements + data.blocksize - 1) / data.blocksize); + + int64_t packed_size = data.n_elements / 2; + data.packed_weights.resize(packed_size); + if (std::fread(data.packed_weights.data(), 1, packed_size, f) != (size_t)packed_size) { + std::fclose(f); + std::fprintf(stderr, "[ERROR] Bad packed data in file: %s\n", filepath); + return false; + } + + data.absmax_q.resize(data.num_blocks); + if (std::fread(data.absmax_q.data(), 1, data.num_blocks, f) != (size_t)data.num_blocks) { + std::fclose(f); + std::fprintf(stderr, "[ERROR] Bad absmax_q in file: %s\n", filepath); + return false; + } + + long current_pos = std::ftell(f); + std::fseek(f, 0, SEEK_END); + long file_size = std::ftell(f); + std::fseek(f, current_pos, SEEK_SET); + + long remaining = file_size - current_pos; + long fixed_tail = 256 * 2 + 4; + long absmax2_bytes = remaining - fixed_tail; + + if (absmax2_bytes <= 0 || (absmax2_bytes % 2) != 0) { + std::fclose(f); + std::fprintf(stderr, "[ERROR] Invalid absmax2 segment in file: %s\n", filepath); + return false; + } + + data.num_groups = (int32_t)(absmax2_bytes / 2); + data.s2_blocksize = (data.num_blocks + data.num_groups - 1) / data.num_groups; + + data.absmax2.resize(data.num_groups); + if (std::fread(data.absmax2.data(), 2, data.num_groups, f) != (size_t)data.num_groups) { + std::fclose(f); + std::fprintf(stderr, "[ERROR] Bad absmax2 in file: %s\n", filepath); + return false; + } + + data.code2.resize(256); + if (std::fread(data.code2.data(), 2, 256, f) != 256) { + std::fclose(f); + std::fprintf(stderr, "[ERROR] Bad code2 in file: %s\n", filepath); + return false; + } + + if (std::fread(&data.offset, sizeof(float), 1, f) != 1) { + std::fclose(f); + std::fprintf(stderr, "[ERROR] Missing offset in file: %s\n", filepath); + return false; + } + + std::fclose(f); + return true; +} + +int main(int argc, char* argv[]) { + if (argc < 3) { + std::fprintf(stderr, "用法: %s [bf16|fp16] [warmup] [repeats]\n", argv[0]); + return 1; + } + + const char* weight_file = argv[1]; + const char* output_file = argv[2]; + std::string compute_type = (argc > 3) ? argv[3] : "bf16"; + int warmup = (argc > 4) ? std::atoi(argv[4]) : 10; + int repeats = (argc > 5) ? std::atoi(argv[5]) : 100; + + bool use_bf16 = (compute_type == "bf16"); + if (!use_bf16 && compute_type != "fp16") { + std::fprintf(stderr, "[ERROR] compute_type must be bf16 or fp16\n"); + return 1; + } + + std::printf("[INFO] Backend: %s\n", MUTEX_BACKEND_NAME); + std::printf("[INFO] 读取权重文件: %s\n", weight_file); + + NF4Data data; + if (!read_nf4_data(weight_file, data)) { + return 1; + } + + std::printf(" num_rows = %ld\n", (long)data.num_rows); + std::printf(" num_cols = %ld\n", (long)data.num_cols); + std::printf(" blocksize = %d\n", data.blocksize); + std::printf(" n_elements = %ld\n", (long)data.n_elements); + std::printf(" num_blocks = %d\n", data.num_blocks); + std::printf(" num_groups = %d\n", data.num_groups); + std::printf(" s2_blocksize = %d\n", data.s2_blocksize); + std::printf(" offset = %f\n", data.offset); + std::printf(" compute_type = %s\n", compute_type.c_str()); + + if (!is_power_of_two(data.blocksize) || !is_power_of_two(data.s2_blocksize)) { + std::fprintf(stderr, + "[ERROR] blocksize and s2_blocksize must be powers of two for log2-shift kernel. got blocksize=%d s2_blocksize=%d\n", + data.blocksize, data.s2_blocksize); + return 1; + } + + uint8_t* d_packed_weights = nullptr; + uint8_t* d_absmax_q = nullptr; + uint16_t* d_absmax2 = nullptr; + uint16_t* d_code2 = nullptr; + uint16_t* d_output_bits = nullptr; + + int64_t packed_size = data.n_elements / 2; + int64_t output_bytes = data.n_elements * 2; + + MC_CHECK(mcMalloc((void**)&d_packed_weights, packed_size)); + MC_CHECK(mcMalloc((void**)&d_absmax_q, data.num_blocks)); + MC_CHECK(mcMalloc((void**)&d_absmax2, data.num_groups * sizeof(uint16_t))); + MC_CHECK(mcMalloc((void**)&d_code2, 256 * sizeof(uint16_t))); + MC_CHECK(mcMalloc((void**)&d_output_bits, output_bytes)); + + MC_CHECK(mcMemcpy(d_packed_weights, data.packed_weights.data(), + packed_size, mcMemcpyHostToDevice)); + MC_CHECK(mcMemcpy(d_absmax_q, data.absmax_q.data(), + data.num_blocks, mcMemcpyHostToDevice)); + MC_CHECK(mcMemcpy(d_absmax2, data.absmax2.data(), + data.num_groups * sizeof(uint16_t), mcMemcpyHostToDevice)); + MC_CHECK(mcMemcpy(d_code2, data.code2.data(), + 256 * sizeof(uint16_t), mcMemcpyHostToDevice)); + + int n_packed = (int)((data.n_elements + 1) / 2); + int n_packed_vec = (n_packed + 3) / 4; + int threads_per_block = 256; + int num_blocks_kernel = (n_packed_vec + threads_per_block - 1) / threads_per_block; + int log2_bs = log2_pow2(data.blocksize); + int log2_s2 = log2_pow2(data.s2_blocksize); + + std::printf("\n[INFO] Kernel 配置:\n"); + std::printf(" n_packed = %d\n", n_packed); + std::printf(" n_packed_vec = %d (向量化后)\n", n_packed_vec); + std::printf(" threads_per_block = %d\n", threads_per_block); + std::printf(" grid_size = %d\n", num_blocks_kernel); + std::printf(" log2_blocksize = %d\n", log2_bs); + std::printf(" log2_s2_blocksize = %d\n", log2_s2); + + std::printf("\n[INFO] 预热 %d 次...\n", warmup); + for (int i = 0; i < warmup; ++i) { + if (use_bf16) { + nf4_dequantize_kernel<<>>( + d_packed_weights, d_absmax_q, d_absmax2, d_code2, + data.offset, log2_bs, log2_s2, + data.n_elements, d_output_bits); + } else { + nf4_dequantize_kernel<<>>( + d_packed_weights, d_absmax_q, d_absmax2, d_code2, + data.offset, log2_bs, log2_s2, + data.n_elements, d_output_bits); + } + MC_CHECK(mcGetLastError()); + } + MC_CHECK(mcDeviceSynchronize()); + + std::printf("[INFO] 计时 %d 次...\n", repeats); + + mcEvent_t ev_start; + mcEvent_t ev_end; + MC_CHECK(mcEventCreate(&ev_start)); + MC_CHECK(mcEventCreate(&ev_end)); + + std::vector times(repeats); + + for (int i = 0; i < repeats; ++i) { + MC_CHECK(mcDeviceSynchronize()); + MC_CHECK(mcEventRecord(ev_start)); + + if (use_bf16) { + nf4_dequantize_kernel<<>>( + d_packed_weights, d_absmax_q, d_absmax2, d_code2, + data.offset, log2_bs, log2_s2, + data.n_elements, d_output_bits); + } else { + nf4_dequantize_kernel<<>>( + d_packed_weights, d_absmax_q, d_absmax2, d_code2, + data.offset, log2_bs, log2_s2, + data.n_elements, d_output_bits); + } + + MC_CHECK(mcGetLastError()); + MC_CHECK(mcEventRecord(ev_end)); + MC_CHECK(mcEventSynchronize(ev_end)); + MC_CHECK(mcEventElapsedTime(×[i], ev_start, ev_end)); + } + + std::vector sorted_times = times; + std::sort(sorted_times.begin(), sorted_times.end()); + + float total_ms = 0.0f; + for (float t : times) { + total_ms += t; + } + + float min_ms = sorted_times.front(); + float max_ms = sorted_times.back(); + float avg_ms = total_ms / repeats; + float median_ms = sorted_times[repeats / 2]; + + double read_bytes = (double)packed_size + data.num_blocks + + data.num_groups * 2.0 + 256.0 * 2.0; + double write_bytes = (double)output_bytes; + double total_bytes = read_bytes + write_bytes; + double bandwidth_gbps = total_bytes / (median_ms * 1e-3) / 1e9; + + std::printf("\n========================================\n"); + std::printf(" NF4 反量化 Kernel 性能 (MACA)\n"); + std::printf("========================================\n"); + std::printf(" 矩阵大小 : (%ld, %ld)\n", (long)data.num_rows, (long)data.num_cols); + std::printf(" 块大小 : %d\n", data.blocksize); + std::printf(" 输出类型 : %s\n", compute_type.c_str()); + std::printf(" 平均耗时 : %.4f ms\n", avg_ms); + std::printf(" 中位数耗时 : %.4f ms\n", median_ms); + std::printf(" 最小耗时 : %.4f ms\n", min_ms); + std::printf(" 最大耗时 : %.4f ms\n", max_ms); + std::printf(" 有效带宽 : %.2f GB/s (基于中位数)\n", bandwidth_gbps); + std::printf("========================================\n"); + + std::vector h_output_bits(data.n_elements); + MC_CHECK(mcMemcpy(h_output_bits.data(), d_output_bits, output_bytes, mcMemcpyDeviceToHost)); + + FILE* fout = std::fopen(output_file, "wb"); + if (!fout) { + std::fprintf(stderr, "[ERROR] Cannot open output file: %s\n", output_file); + return 1; + } + + std::fwrite(h_output_bits.data(), sizeof(uint16_t), h_output_bits.size(), fout); + std::fclose(fout); + + std::printf("\n[INFO] 已写入解量化输出: %s (%ld bytes)\n", output_file, (long)output_bytes); + + MC_CHECK(mcEventDestroy(ev_start)); + MC_CHECK(mcEventDestroy(ev_end)); + MC_CHECK(mcFree(d_packed_weights)); + MC_CHECK(mcFree(d_absmax_q)); + MC_CHECK(mcFree(d_absmax2)); + MC_CHECK(mcFree(d_code2)); + MC_CHECK(mcFree(d_output_bits)); + + std::printf("[DONE] 完成\n"); + return 0; +} diff --git a/03_nf4_dequant/xfarawayx/kernel_noncuda/mutex/nf4_dequant_kernel.maca b/03_nf4_dequant/xfarawayx/kernel_noncuda/mutex/nf4_dequant_kernel.maca new file mode 100644 index 0000000..f4b1e4e --- /dev/null +++ b/03_nf4_dequant/xfarawayx/kernel_noncuda/mutex/nf4_dequant_kernel.maca @@ -0,0 +1,216 @@ +#pragma once + +#include + +__constant__ float NF4_DEQUANT_TABLE[16] = { + -1.0f, + -0.6961928009986877f, + -0.5250730514526367f, + -0.39491748809814453f, + -0.28444138169288635f, + -0.18477343022823334f, + -0.09105003625154495f, + 0.0f, + 0.07958029955625534f, + 0.16093020141124725f, + 0.24611230194568634f, + 0.33791524171829224f, + 0.44070982933044434f, + 0.5626170039176941f, + 0.7229568362236023f, + 1.0f +}; + +inline int log2_pow2(int x) { + int r = 0; + while (x > 1) { + x >>= 1; + r++; + } + return r; +} + +__device__ __forceinline__ uint32_t float_to_bits(float v) { + union { + float f; + uint32_t u; + } x; + x.f = v; + return x.u; +} + +__device__ __forceinline__ float bits_to_float(uint32_t v) { + union { + float f; + uint32_t u; + } x; + x.u = v; + return x.f; +} + +__device__ __forceinline__ float half_bits_to_float(uint16_t h) { + uint32_t sign = (uint32_t)(h & 0x8000u) << 16; + uint32_t exp = (h >> 10) & 0x1Fu; + uint32_t mant = h & 0x03FFu; + + uint32_t out; + if (exp == 0) { + if (mant == 0) { + out = sign; + } else { + exp = 127 - 15 + 1; + while ((mant & 0x0400u) == 0) { + mant <<= 1; + exp--; + } + mant &= 0x03FFu; + out = sign | (exp << 23) | (mant << 13); + } + } else if (exp == 0x1Fu) { + out = sign | 0x7F800000u | (mant << 13); + } else { + out = sign | ((exp + (127 - 15)) << 23) | (mant << 13); + } + + return bits_to_float(out); +} + +__device__ __forceinline__ uint16_t float_to_half_bits(float v) { + uint32_t x = float_to_bits(v); + uint32_t sign = (x >> 16) & 0x8000u; + int32_t exp = (int32_t)((x >> 23) & 0xFFu) - 127 + 15; + uint32_t mant = x & 0x7FFFFFu; + + if (exp <= 0) { + if (exp < -10) { + return (uint16_t)sign; + } + mant = (mant | 0x800000u) >> (1 - exp); + if ((mant & 0x00001000u) != 0) { + mant += 0x00002000u; + } + return (uint16_t)(sign | (mant >> 13)); + } + + if (exp >= 31) { + return (uint16_t)(sign | 0x7C00u); + } + + uint32_t out = sign | ((uint32_t)exp << 10) | (mant >> 13); + if ((mant & 0x00001000u) != 0) { + out += 1; + } + return (uint16_t)out; +} + +__device__ __forceinline__ uint16_t float_to_bf16_bits(float v) { + uint32_t x = float_to_bits(v); + uint32_t lsb = (x >> 16) & 1u; + x += 0x7FFFu + lsb; + return (uint16_t)(x >> 16); +} + +template +__global__ void nf4_dequantize_kernel( + const uint8_t* __restrict__ packed_weights, + const uint8_t* __restrict__ absmax_q, + const uint16_t* __restrict__ absmax2, + const uint16_t* __restrict__ code2, + float offset, + int log2_blocksize, + int log2_s2_blocksize, + int64_t n_elements, + uint16_t* __restrict__ output_bits) { + __shared__ float s_nf4_table[16]; + if (threadIdx.x < 16) { + s_nf4_table[threadIdx.x] = NF4_DEQUANT_TABLE[threadIdx.x]; + } + __syncthreads(); + + int tid_vec = blockIdx.x * blockDim.x + threadIdx.x; + int n_packed = (int)((n_elements + 1) / 2); + int n_packed_vec = (n_packed + 3) / 4; + if (tid_vec >= n_packed_vec) { + return; + } + + int byte_offset = tid_vec * 4; + uint32_t packed4 = 0; + if (byte_offset + 4 <= n_packed) { + packed4 = reinterpret_cast(packed_weights)[tid_vec]; + } else { + for (int b = 0; b < 4 && byte_offset + b < n_packed; ++b) { + packed4 |= ((uint32_t)packed_weights[byte_offset + b]) << (b << 3); + } + } + + int elem_base = tid_vec * 8; + uint32_t out_packed[4]; + + #pragma unroll + for (int b = 0; b < 4; ++b) { + int elem0 = elem_base + b * 2; + int elem1 = elem0 + 1; + + uint8_t packed_byte = (packed4 >> (b * 8)) & 0xFF; + uint8_t idx_hi = (packed_byte >> 4) & 0x0F; + uint8_t idx_lo = packed_byte & 0x0F; + + float val_hi = s_nf4_table[idx_hi]; + float val_lo = s_nf4_table[idx_lo]; + + int block_idx0 = elem0 >> log2_blocksize; + int group_idx0 = block_idx0 >> log2_s2_blocksize; + uint8_t aq0 = absmax_q[block_idx0]; + + float absmax_real0 = half_bits_to_float(code2[aq0]) + * half_bits_to_float(absmax2[group_idx0]) + + offset; + + uint16_t out0; + if (elem0 < n_elements) { + float dq0 = val_hi * absmax_real0; + out0 = OUTPUT_BF16 ? float_to_bf16_bits(dq0) : float_to_half_bits(dq0); + } else { + out0 = OUTPUT_BF16 ? float_to_bf16_bits(0.0f) : float_to_half_bits(0.0f); + } + + uint16_t out1; + if (elem1 < n_elements) { + int block_idx1 = elem1 >> log2_blocksize; + float absmax_real1; + if (block_idx1 == block_idx0) { + absmax_real1 = absmax_real0; + } else { + uint8_t aq1 = absmax_q[block_idx1]; + int group_idx1 = block_idx1 >> log2_s2_blocksize; + absmax_real1 = half_bits_to_float(code2[aq1]) + * half_bits_to_float(absmax2[group_idx1]) + + offset; + } + float dq1 = val_lo * absmax_real1; + out1 = OUTPUT_BF16 ? float_to_bf16_bits(dq1) : float_to_half_bits(dq1); + } else { + out1 = OUTPUT_BF16 ? float_to_bf16_bits(0.0f) : float_to_half_bits(0.0f); + } + + out_packed[b] = (uint32_t)out0 | ((uint32_t)out1 << 16); + } + + int out_base = tid_vec * 4; + uint32_t* out_u32 = reinterpret_cast(output_bits); + + int valid_packs = 0; + for (int b = 0; b < 4; ++b) { + if (byte_offset + b < n_packed) { + valid_packs++; + } + } + + #pragma unroll + for (int b = 0; b < 4; ++b) { + if (b < valid_packs) { + out_u32[out_base + b] = out_packed[b]; + } + } +} diff --git a/03_nf4_dequant/xfarawayx/kernel_noncuda/mutex/nf4_dequant_maca b/03_nf4_dequant/xfarawayx/kernel_noncuda/mutex/nf4_dequant_maca new file mode 100644 index 0000000..2054cff Binary files /dev/null and b/03_nf4_dequant/xfarawayx/kernel_noncuda/mutex/nf4_dequant_maca differ diff --git a/03_nf4_dequant/xfarawayx/kernel_noncuda/mutex/run_mutex.sh b/03_nf4_dequant/xfarawayx/kernel_noncuda/mutex/run_mutex.sh new file mode 100644 index 0000000..6e71022 --- /dev/null +++ b/03_nf4_dequant/xfarawayx/kernel_noncuda/mutex/run_mutex.sh @@ -0,0 +1,105 @@ +#!/bin/bash +set -e +set -o pipefail + +PROJ_DIR="$(cd "$(dirname "$0")/../.." && pwd)" +KERNEL_DIR="${PROJ_DIR}/kernel_noncuda/mutex" +SCRIPTS_DIR="${PROJ_DIR}/scripts" +DATA_DIR="${PROJ_DIR}/data" + +if [ -x "${PROJ_DIR}/.venv/bin/python" ]; then + PYTHON="${PROJ_DIR}/.venv/bin/python" +elif command -v python3 >/dev/null 2>&1; then + PYTHON="$(command -v python3)" +else + echo "[ERROR] 找不到 python3" + exit 1 +fi + +ROWS=4096 +COLS=4096 +BLOCKSIZE=64 +COMPUTE_TYPE="bf16" +WARMUP=10 +REPEATS=100 +MXCC_BIN="${MXCC:-mxcc}" +COMMAND="test" + +if [[ "$#" -gt 0 && ! "$1" == --* ]]; then + COMMAND="$1" + shift +fi + +while [[ "$#" -gt 0 ]]; do + case "$1" in + --rows) ROWS="$2"; shift ;; + --cols) COLS="$2"; shift ;; + --blocksize) BLOCKSIZE="$2"; shift ;; + --compute_type) COMPUTE_TYPE="$2"; shift ;; + --warmup) WARMUP="$2"; shift ;; + --repeats) REPEATS="$2"; shift ;; + --mxcc) MXCC_BIN="$2"; shift ;; + *) echo "[ERROR] 未知参数: $1"; exit 1 ;; + esac + shift +done + +TAG="${ROWS}x${COLS}_bs${BLOCKSIZE}" +WEIGHT_FILE="${DATA_DIR}/nf4_weights_${TAG}.bin" +REF_FILE="${DATA_DIR}/nf4_ref_output_${TAG}_${COMPUTE_TYPE}.bin" +MUTEX_OUTPUT="${DATA_DIR}/mutex_output_${TAG}_${COMPUTE_TYPE}.bin" + +build_kernel() { + echo "[build] 使用编译器: ${MXCC_BIN}" + make -C "${KERNEL_DIR}" clean >/dev/null + make -C "${KERNEL_DIR}" MXCC="${MXCC_BIN}" -j"$(nproc)" +} + +run_kernel() { + if [ ! -f "${WEIGHT_FILE}" ]; then + echo "[ERROR] 缺少权重文件: ${WEIGHT_FILE}" + echo " 请先在支持 CUDA 的环境执行 ./run.sh generate 生成数据" + exit 1 + fi + + "${KERNEL_DIR}/nf4_dequant_maca" \ + "${WEIGHT_FILE}" "${MUTEX_OUTPUT}" "${COMPUTE_TYPE}" "${WARMUP}" "${REPEATS}" +} + +verify_output() { + if [ ! -f "${REF_FILE}" ]; then + echo "[WARN] 缺少参考文件: ${REF_FILE}" + echo " 跳过 verify。可先在 CUDA 环境运行 ./run.sh generate --compute_type ${COMPUTE_TYPE}" + return 0 + fi + + "${PYTHON}" "${SCRIPTS_DIR}/verify.py" \ + --weight_file "${WEIGHT_FILE}" \ + --ref_file "${REF_FILE}" \ + --cuda_file "${MUTEX_OUTPUT}" \ + --compute_type "${COMPUTE_TYPE}" +} + +case "${COMMAND}" in + build) + build_kernel + ;; + run) + run_kernel + ;; + verify) + verify_output + ;; + test) + build_kernel + run_kernel + verify_output + ;; + *) + echo "[ERROR] 未知命令: ${COMMAND}" + echo "可用命令: build | run | verify | test" + exit 1 + ;; +esac + +echo "[DONE] ${COMMAND} 完成" diff --git a/03_nf4_dequant/xfarawayx/run.sh b/03_nf4_dequant/xfarawayx/run.sh new file mode 100755 index 0000000..6c2edf5 --- /dev/null +++ b/03_nf4_dequant/xfarawayx/run.sh @@ -0,0 +1,191 @@ +#!/bin/bash +# ============================================================ +# NF4 反量化 —— 统一流程脚本 +# +# 用法: +# ./run.sh [子命令] [选项] +# +# 子命令: +# generate 仅生成测试数据 +# build 仅编译 CUDA kernel +# test 编译 → 运行 kernel → 验证正确性 (默认) +# bench bitsandbytes 基准性能测试 +# all 数据生成 → 编译 → 运行 → 验证 → bnb 对比 +# +# 选项: +# --rows R 矩阵行数 (默认: 4096) +# --cols C 矩阵列数 (默认: 4096) +# --blocksize B 量化块大小 (默认: 64) +# --compute_type T bf16|fp16 (默认: bf16) +# --seed S 随机种子 (默认: 42) +# --gpu_arch A GPU 架构, 如 80/89/90 (默认: 自动检测) +# --warmup W 预热次数 (默认: 10) +# --repeats N 计时重复次数 (默认: 100) +# --sweep bench 时扫描多种矩阵大小 +# ============================================================ +set -e +set -o pipefail + +PROJ_DIR="$(cd "$(dirname "$0")" && pwd)" +KERNEL_DIR="${PROJ_DIR}/kernel" +SCRIPTS_DIR="${PROJ_DIR}/scripts" +BUILD_DIR="${KERNEL_DIR}/build" +DATA_DIR="${PROJ_DIR}/data" + +# ---------- 自动查找 Python ---------- +# 优先使用环境变量 PYTHON,其次使用 venv (含所需依赖),再回退到系统 python3 +if [ -n "${PYTHON:-}" ] && [ -x "${PYTHON}" ]; then + : # 使用用户提供的 PYTHON +elif [ -x "${PROJ_DIR}/.venv/bin/python" ]; then + PYTHON="${PROJ_DIR}/.venv/bin/python" +elif [ -n "${HOME:-}" ] && [ -x "${HOME}/.venv/bin/python" ]; then + PYTHON="${HOME}/.venv/bin/python" +elif command -v python3 &>/dev/null; then + PYTHON="$(command -v python3)" +else + echo "[ERROR] 找不到 Python, 请设置 PYTHON 环境变量" + exit 1 +fi + +# ---------- 默认参数 ---------- +ROWS=4096 +COLS=4096 +BLOCKSIZE=64 +COMPUTE_TYPE="bf16" +SEED=42 +GPU_ARCH="" +WARMUP=10 +REPEATS=100 +SWEEP="" + +# ---------- 解析子命令 ---------- +COMMAND="test" +if [[ "$#" -gt 0 && ! "$1" == --* ]]; then + COMMAND="$1" + shift +fi + +# ---------- 解析选项 ---------- +while [[ "$#" -gt 0 ]]; do + case $1 in + --rows) ROWS="$2"; shift ;; + --cols) COLS="$2"; shift ;; + --blocksize) BLOCKSIZE="$2"; shift ;; + --compute_type) COMPUTE_TYPE="$2"; shift ;; + --seed) SEED="$2"; shift ;; + --gpu_arch) GPU_ARCH="$2"; shift ;; + --warmup) WARMUP="$2"; shift ;; + --repeats) REPEATS="$2"; shift ;; + --sweep) SWEEP="--sweep" ;; + *) echo "[ERROR] 未知参数: $1"; exit 1 ;; + esac + shift +done + +TAG="${ROWS}x${COLS}_bs${BLOCKSIZE}" +WEIGHT_FILE="${DATA_DIR}/nf4_weights_${TAG}.bin" +REF_FILE="${DATA_DIR}/nf4_ref_output_${TAG}_${COMPUTE_TYPE}.bin" +CUDA_OUTPUT="${DATA_DIR}/cuda_output_${TAG}_${COMPUTE_TYPE}.bin" + +# ============================================================ +# 阶段函数 +# ============================================================ + +do_generate() { + echo "" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + echo " [Step 1] 生成测试数据" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + if [ -f "${WEIGHT_FILE}" ] && [ -f "${REF_FILE}" ]; then + echo " 数据已存在: ${TAG}, 跳过 (删除 data/ 可强制重新生成)" + else + ${PYTHON} "${SCRIPTS_DIR}/generate_data.py" \ + --rows ${ROWS} --cols ${COLS} --blocksize ${BLOCKSIZE} \ + --seed ${SEED} --compute_type ${COMPUTE_TYPE} --outdir "${DATA_DIR}" + fi +} + +do_build() { + echo "" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + echo " [Step 2] 编译 CUDA kernel" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + mkdir -p "${BUILD_DIR}" + + local cmake_args="-DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc" + if [ -n "${GPU_ARCH}" ]; then + cmake_args="${cmake_args} -DGPU_ARCH=${GPU_ARCH}" + fi + + cd "${BUILD_DIR}" + cmake .. ${cmake_args} 2>&1 | tail -5 + make -j$(nproc) 2>&1 | tail -5 + cd "${PROJ_DIR}" + + echo " 可执行文件: ${BUILD_DIR}/nf4_dequant" +} + +do_test() { + do_generate + do_build + + echo "" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + echo " [Step 3] 运行 CUDA kernel" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + "${BUILD_DIR}/nf4_dequant" \ + "${WEIGHT_FILE}" "${CUDA_OUTPUT}" "${COMPUTE_TYPE}" ${WARMUP} ${REPEATS} + + echo "" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + echo " [Step 4] 验证正确性" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + ${PYTHON} "${SCRIPTS_DIR}/verify.py" \ + --weight_file "${WEIGHT_FILE}" \ + --ref_file "${REF_FILE}" \ + --cuda_file "${CUDA_OUTPUT}" \ + --compute_type ${COMPUTE_TYPE} +} + +do_bench() { + echo "" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + echo " bitsandbytes 基准性能测试" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + ${PYTHON} "${SCRIPTS_DIR}/bench_bnb.py" \ + --rows ${ROWS} --cols ${COLS} --blocksize ${BLOCKSIZE} \ + --seed ${SEED} --warmup ${WARMUP} --repeats ${REPEATS} ${SWEEP} +} + +do_all() { + do_test + do_bench +} + +# ============================================================ +# 入口 +# ============================================================ + +echo "============================================" +echo " NF4 反量化测试" +echo " 矩阵: ${ROWS} x ${COLS}, 块大小: ${BLOCKSIZE}" +echo " 输出类型: ${COMPUTE_TYPE}, 命令: ${COMMAND}" +echo "============================================" + +case ${COMMAND} in + generate) do_generate ;; + build) do_build ;; + test) do_test ;; + bench) do_bench ;; + all) do_all ;; + *) + echo "[ERROR] 未知子命令: ${COMMAND}" + echo "可用子命令: generate | build | test | bench | all" + exit 1 + ;; +esac + +echo "" +echo "============================================" +echo " 完成: ${COMMAND}" +echo "============================================" diff --git a/03_nf4_dequant/xfarawayx/scripts/bench_bnb.py b/03_nf4_dequant/xfarawayx/scripts/bench_bnb.py new file mode 100644 index 0000000..026b57b --- /dev/null +++ b/03_nf4_dequant/xfarawayx/scripts/bench_bnb.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 +""" +NF4 反量化 —— bitsandbytes 性能基准 + +测量 bitsandbytes dequantize_4bit 的执行时间和带宽, +供 CUDA kernel 实现计算加速比。 +""" + +import argparse +import statistics + +import torch +import bitsandbytes.functional as F + + +def parse_args(): + parser = argparse.ArgumentParser(description="bitsandbytes NF4 解量化性能基准") + parser.add_argument("--rows", type=int, default=4096) + parser.add_argument("--cols", type=int, default=4096) + parser.add_argument("--blocksize", type=int, default=64, choices=[64, 128]) + parser.add_argument("--warmup", type=int, default=10) + parser.add_argument("--repeats", type=int, default=100) + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--sweep", action="store_true", + help="测试多种矩阵大小") + return parser.parse_args() + + +def bench_one(rows, cols, blocksize, seed, warmup, repeats): + """对单个配置运行基准测试,返回 (avg_ms, median_ms, min_ms, max_ms, bw_gbps).""" + torch.manual_seed(seed) + weight = torch.randn(rows, cols, dtype=torch.float16, device="cuda") + quant, state = F.quantize_4bit( + weight, quant_type="nf4", blocksize=blocksize, + compress_statistics=True, + ) + + n_elements = rows * cols + total_bytes = n_elements // 2 + n_elements * 2 # packed_in + fp16_out + + # 预热 + for _ in range(warmup): + _ = F.dequantize_4bit(quant, state) + torch.cuda.synchronize() + + # 计时 + starts = [torch.cuda.Event(enable_timing=True) for _ in range(repeats)] + ends = [torch.cuda.Event(enable_timing=True) for _ in range(repeats)] + + for i in range(repeats): + starts[i].record() + _ = F.dequantize_4bit(quant, state) + ends[i].record() + + torch.cuda.synchronize() + times = sorted(s.elapsed_time(e) for s, e in zip(starts, ends)) + + avg_ms = statistics.mean(times) + median_ms = statistics.median(times) + min_ms = times[0] + max_ms = times[-1] + bw_gbps = total_bytes / (median_ms * 1e-3) / 1e9 + + return avg_ms, median_ms, min_ms, max_ms, bw_gbps + + +def main(): + args = parse_args() + + # 主测试 + avg, med, mn, mx, bw = bench_one( + args.rows, args.cols, args.blocksize, + args.seed, args.warmup, args.repeats, + ) + + print(f"\n bitsandbytes dequantize_4bit 性能") + print(f" {'─' * 44}") + print(f" 矩阵 : ({args.rows}, {args.cols})") + print(f" 块大小 : {args.blocksize}") + print(f" 平均耗时 : {avg:.4f} ms") + print(f" 中位数耗时 : {med:.4f} ms") + print(f" 最小耗时 : {mn:.4f} ms") + print(f" 最大耗时 : {mx:.4f} ms") + print(f" 有效带宽 : {bw:.2f} GB/s (基于中位数)") + + # 可选:扫描不同矩阵大小 + if args.sweep: + shapes = [ + (1024, 1024), (2048, 2048), (4096, 4096), + (4096, 11008), (4096, 14336), + (1536, 1536), (1536, 8960), + ] + blocksizes = [64, 128] + + print(f"\n {'Shape':>18s} {'BS':>4s} {'Avg(ms)':>9s} {'Med(ms)':>9s} {'BW(GB/s)':>10s}") + print(f" {'─' * 54}") + + for r, c in shapes: + for bs in blocksizes: + a, m, _, _, b = bench_one(r, c, bs, args.seed, 5, 50) + print(f" ({r:>5d}, {c:>5d}) {bs:>4d} {a:>9.4f} {m:>9.4f} {b:>10.2f}") + + print(f"\n[bench_bnb] 完成") + + +if __name__ == "__main__": + main() diff --git a/03_nf4_dequant/xfarawayx/scripts/generate_data.py b/03_nf4_dequant/xfarawayx/scripts/generate_data.py new file mode 100644 index 0000000..b4d2ff6 --- /dev/null +++ b/03_nf4_dequant/xfarawayx/scripts/generate_data.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 +""" +NF4 反量化 —— 输入数据生成 + +使用 bitsandbytes 对随机正态分布权重矩阵进行 NF4 量化(双重量化), +将量化结果按二进制格式写入文件,同时保存 bitsandbytes 解量化的参考输出。 + +二进制文件格式: + [header] + num_rows : int64 + num_cols : int64 + blocksize : int32 + [data] + packed_weights : uint8[N/2] 每字节存两个 4-bit 索引 + absmax_q : uint8[num_blocks] 一级量化缩放因子 + absmax2 : float16[num_groups] 二级缩放因子 + code2 : float16[256] 二级码表 + offset : float32 量化偏移 +""" + +import argparse +import math +import os +import struct +import sys + +import numpy as np +import torch +import bitsandbytes.functional as F + + +def parse_args(): + parser = argparse.ArgumentParser(description="NF4 量化数据生成") + parser.add_argument("--rows", type=int, default=4096) + parser.add_argument("--cols", type=int, default=4096) + parser.add_argument("--blocksize", type=int, default=64, + choices=[64, 128, 256, 512, 1024, 2048, 4096]) + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--compute_type", type=str, default="bf16", + choices=["bf16", "fp16"]) + parser.add_argument("--outdir", type=str, default=None, + help="输出目录 (默认: <项目根>/data)") + return parser.parse_args() + + +def generate_and_save(rows, cols, blocksize, seed, compute_type, outdir): + os.makedirs(outdir, exist_ok=True) + + # 1. 生成随机权重 + torch.manual_seed(seed) + weight = torch.randn(rows, cols, dtype=torch.float16, device="cuda") + print(f"[generate] shape=({rows}, {cols}), dtype=float16, seed={seed}") + + # 2. NF4 量化 (双重量化) + quant, state = F.quantize_4bit( + weight, quant_type="nf4", blocksize=blocksize, + compress_statistics=True, + ) + + n_elements = rows * cols + num_blocks = math.ceil(n_elements / blocksize) + + assert state.state2 is not None, "compress_statistics=True 应启用双重量化" + s2 = state.state2 + num_groups = s2.absmax.shape[0] + + print(f"[generate] blocksize={blocksize}, num_blocks={num_blocks}, " + f"num_groups={num_groups}, s2_blocksize={s2.blocksize}") + + # 3. 提取数据 + packed_weights = quant.cpu().numpy().astype(np.uint8).flatten() + assert packed_weights.shape[0] == n_elements // 2 + + absmax_q = state.absmax.cpu().numpy().astype(np.uint8) + assert absmax_q.shape[0] == num_blocks + + absmax2 = s2.absmax.cpu().numpy().astype(np.float16) + code2 = s2.code.cpu().numpy().astype(np.float16) + assert code2.shape[0] == 256 + + offset_val = state.offset.cpu().item() if state.offset is not None else 0.0 + + # 4. 写二进制权重文件 + tag = f"{rows}x{cols}_bs{blocksize}" + weight_file = os.path.join(outdir, f"nf4_weights_{tag}.bin") + + with open(weight_file, "wb") as f: + f.write(struct.pack("