Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
fcdc822
feat.: add 2025_winter assignment
Ziminli Jan 19, 2026
b52d284
docs: add more info in README.md
Ziminli Jan 19, 2026
6ab326b
docs: enrich README with detailed domestic GPU platform support info.
Ziminli Jan 20, 2026
9e7376e
fix: rebuild tester_nv.o with CUDA 11.8 for compatibility
Ziminli Jan 20, 2026
72ff7d2
refactor: relax the tolerance settings for flashAttention
Ziminli Jan 27, 2026
2e3eb4f
Implement trace and flashAttention CUDA kernels
trudging Feb 4, 2026
d4808b6
Optimize kernels: warp shuffle reduction, tiled attention, improved c…
trudging Feb 4, 2026
b2ce4d6
Performance tuning: grid-stride trace, larger attention tiles, __ldg …
trudging Feb 4, 2026
e30445f
Fix attention kernel: correct warp reduction, stable online softmax
trudging Feb 4, 2026
dd16a3a
Debug: force fallback kernel
trudging Feb 4, 2026
4715318
Restrict optimized kernel to known-good dimensions
trudging Feb 4, 2026
0f85a1c
Disable optimized kernel for correctness
trudging Feb 4, 2026
b047f61
Add MetaX (沐曦) platform adaptation
trudging Feb 4, 2026
a19d735
Add Moore Threads (摩尔线程) platform adaptation
trudging Feb 4, 2026
6179d29
Add Iluvatar (天数) platform compatibility with LDG macro
trudging Feb 4, 2026
4481ea3
Fix min() to std::min() for Iluvatar compatibility
trudging Feb 5, 2026
300069b
Fix numerical stability for MetaX and Iluvatar platforms
trudging Feb 5, 2026
72f2937
Restore float version for cross-platform compatibility (NVIDIA 90/90,…
trudging Feb 5, 2026
813c697
fix(moore): improve numerical stability in softmax calculation
trudging Feb 5, 2026
64c4470
fix(iluvatar): use double precision accumulator for stability
trudging Feb 5, 2026
309f99a
chore: update test standards (tester object files) from commit 65b2898
trudging Feb 6, 2026
2a468e9
fix: remove extraneous closing brace in kernels.cu
trudging Feb 6, 2026
b3d44b4
revert: restore float softmax for attention
trudging Feb 6, 2026
8ad6610
feat(nf4): add NF4 double-dequantization CUDA kernel and test scripts
trudging Mar 14, 2026
e43ef1e
Merge remote README
trudging Mar 14, 2026
5cb34c0
refactor(nf4): move trudging user files into trudging subdirectory
trudging Mar 14, 2026
09d90c5
fix(nf4): correctly apply templates, params reading, and output gener…
trudging Mar 14, 2026
7c2364a
Refactor nf4_dequant to support templates and speedup calculation
trudging Mar 14, 2026
9c1f288
Support MUSA, MACA, and ILUVATAR platforms seamlessly
trudging Mar 14, 2026
a3717a2
Fix CUDA runtime macro abstraction for all platforms
trudging Mar 15, 2026
907e3dd
fix(metax): update maca headers, bfloat16 types and device sync API f…
trudging Mar 15, 2026
2aeb438
fix(moore): final adaptations for Moore Threads (mt_bfloat16 type cas…
trudging Mar 16, 2026
391cd7d
docs: add multi-platform README instructions for CUDA & domestics
trudging Mar 16, 2026
4e15cd0
docs: update README and add nf4_report files
trudging Mar 16, 2026
37d3de3
docs: fix README garbled text and remove emojis
trudging Mar 16, 2026
2a93c35
docs: relocate nf4_report directory and update README
trudging Mar 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions 03_nf4_dequant/trudging/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
cmake_minimum_required(VERSION 3.18)

# 1. Project and languages.
project(nf4_dequantizer LANGUAGES CXX CUDA)

# 2. Language standards.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)

# Target GPU architectures: T4 (75), A100 (80), RTX 4090 (89).
set(CMAKE_CUDA_ARCHITECTURES 75 80 89)

# 3. Default build type — only for single-config generators; multi-config
# generators (VS, Xcode, Ninja Multi-Config) ignore CMAKE_BUILD_TYPE.
get_property(_nf4_is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
if(NOT _nf4_is_multi_config AND NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
endif()

# 4. Executable.
add_executable(nf4_dequantizer main.cu src/dequantize.cu)

# 5. Header search path (project-private).
target_include_directories(nf4_dequantizer PRIVATE src)

# 6. Host-side optimization for Release builds.
# (Release already implies -O3; kept explicit per project preference.)
target_compile_options(nf4_dequantizer PRIVATE
  "$<$<CONFIG:Release>:-O3>"
)

# 7. CUDA-only profiling / optimization flags:
#   -lineinfo           — line info for Nsight Compute source correlation
#   --ptxas-options=-v  — PTX assembler verbose report (register usage etc.)
#   -use_fast_math      — fast math intrinsics
#   -O3                 — forwarded to the host compiler by nvcc
# NOTE: the generator expression must be a single quoted argument with
# ';'-separated options — splitting it across unquoted lines produces an
# unterminated genex and a generate-time error.
target_compile_options(nf4_dequantizer PRIVATE
  "$<$<COMPILE_LANGUAGE:CUDA>:-lineinfo;--ptxas-options=-v;-use_fast_math;-O3>"
)

# 8. Link options would go here via target_link_libraries(... PRIVATE ...).

message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")
48 changes: 48 additions & 0 deletions 03_nf4_dequant/trudging/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Learning-CUDA nf4_dequant Makefile
# Target platforms supported: nvidia (default), iluvatar, moore, metax
#
# Usage:
#   make PLATFORM=<nvidia|iluvatar|moore|metax> [build|run|clean]
# Each platform selects its vendor compiler, preprocessor define, and —
# for moore/metax — an alternative source-file suffix (.mu / .maca).

PLATFORM ?= nvidia
PLATFORM_DEFINE ?= -DPLATFORM_NVIDIA
# Source extension used for main/kernel files; overridden per platform below.
STUDENT_SUFFIX := cu
CFLAGS := -std=c++17 -O3
EXTRA_LIBS :=

ifeq ($(PLATFORM),nvidia)
# Native CUDA: nvcc with profiling flags and a fixed sm_80 target.
CC := nvcc
PLATFORM_DEFINE := -DPLATFORM_NVIDIA
CFLAGS += -lineinfo --ptxas-options=-v -use_fast_math -arch=sm_80
else ifeq ($(PLATFORM),iluvatar)
# Iluvatar (CoreX): LLVM-based clang++ against the corex SDK.
CC := clang++
PLATFORM_DEFINE := -DPLATFORM_ILUVATAR
EXTRA_LIBS := -lcudart -I/usr/local/corex/include -L/usr/local/corex/lib64 -fPIC
else ifeq ($(PLATFORM),moore)
# Moore Threads (MUSA): mcc compiler, sources use the .mu suffix.
CC := mcc
STUDENT_SUFFIX := mu
PLATFORM_DEFINE := -DPLATFORM_MOORE
EXTRA_LIBS := -I/usr/local/musa/include -L/usr/lib/gcc/x86_64-linux-gnu/11/ -L/usr/local/musa/lib -lmusart
else ifeq ($(PLATFORM),metax)
# MetaX (MACA): mxcc compiler, sources use the .maca suffix.
CC := mxcc
STUDENT_SUFFIX := maca
PLATFORM_DEFINE := -DPLATFORM_METAX
else
$(error Unsupported PLATFORM '$(PLATFORM)' (expected: nvidia, iluvatar, moore, metax))
endif

TARGET := nf4_dequantizer
MAIN_SRC := main.$(STUDENT_SUFFIX)
KERNEL_SRC := src/dequantize.$(STUDENT_SUFFIX)

.PHONY: all build run clean

# Default target: compile then execute.
all: build run

build: $(TARGET)

run: $(TARGET)
	./$(TARGET)

clean:
	rm -f $(TARGET) *.o

$(TARGET): $(MAIN_SRC) $(KERNEL_SRC)
	$(CC) $(CFLAGS) $(PLATFORM_DEFINE) -o $@ $^ $(EXTRA_LIBS)
78 changes: 78 additions & 0 deletions 03_nf4_dequant/trudging/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# NF4 Dequantization - Multi-Platform Support (NVIDIA & 国产芯片)

具体报告于nf4_report中
这是一个实现了 QLoRA 4-bit NormalFloat (NF4) 动态反量化算子的项目。
当前工程不仅支持原生 NVIDIA GPU,还成功适配了国内主流的三大算力平台:
- **NVIDIA (NVIDIA GPU)**
- **Iluvatar (天数智芯)**
- **Moore Threads (摩尔线程)**
- **MetaX (沐曦)**

---

## 1. 环境准备 (Prerequisites)

在进行编译和测试之前,需要在各自平台/容器中安装必要的 Python 依赖以生成测试用例。测试数据生成脚本依赖于 `torch`、`numpy` (和可选的 `bitsandbytes`)。

```bash
# 推荐使用国内镜像源下载依赖 (必须确保 numpy 版本为 1.x 代以防止 PyTorch 不兼容)
pip3 install "numpy<2.0.0" torch bitsandbytes -i https://pypi.tuna.tsinghua.edu.cn/simple --force-reinstall
```

## 2. 生成测试数据

在正式编译与运行算子之前,首先需要利用 PyTorch 和 Bitsandbytes 在本地生成模拟的 `test_weights.bin` 和真实基准参考文件 `ground_truth.bin` 以及配置 `params.txt` :

```bash
python3 generate_test_data.py
```
> **注意**:如果在只搭载国产芯片且无正常 CUDA 执行库的镜像上,此脚本也可以无缝生成二进制文件用于后续的 C++ 端纯前向推理测试。

## 3. 多平台编译与测试指令

项目采用了一套统一的 `Makefile` 并通过 `PLATFORM` 变量实现平台路由。只需在 `make` 时通过 `PLATFORM=` 指定目标芯片厂商环境。

### 3.1 NVIDIA (默认平台)
```bash
make clean
# 编译
make PLATFORM=nvidia build
# 运行
./nf4_dequantizer
```

### 3.2 Iluvatar (天数智芯)
天数智芯平台使用 `clang++` (基于 LLVM) 和 `corex` 构建库。使用前请确保你已经通过 K8s 进入了包含天数 SDK `corex` 的容器中。
```bash
make clean
# 编译
make PLATFORM=iluvatar build
# 运行
./nf4_dequantizer
```

### 3.3 Moore Threads (摩尔线程)
摩尔线程平台基于 MUSA 核心架构,使用 `mcc` 编译并将自动使用 `.mu` 为扩展名的特化源码。
```bash
make clean
# 编译
make PLATFORM=moore build
# 运行
./nf4_dequantizer
```

### 3.4 MetaX (沐曦)
沐曦平台基于 MACA 核心架构,使用 `mxcc` 编译并将自动使用 `.maca` 为扩展名的特化源码。
```bash
make clean
# 编译
make PLATFORM=metax build
# 运行
./nf4_dequantizer
```

## 4. 特性与修改点 (Changelog)

- 移除了裸写 `cudaMallocHost` 的硬编码,取而代之为宏包装,兼容各个平台的 Pinned Memory 分配(如 `mcMallocHost`)。
- 针对沐曦使用内置的 `maca_bfloat16.h` 进行完整支持。
- 针对于摩尔线程 `__halves2musa_bfloat162` 缺失情况,使用了寄存器级位运算拼接(`bitwise packing`)完成平替保护。
120 changes: 120 additions & 0 deletions 03_nf4_dequant/trudging/fix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#pragma once

#include <string>
#include <vector>
#include <fstream>
#include <iostream>
#include <memory>
#include <cmath>
#include <cstdint>
#include <stdexcept>
#include <cuda_runtime.h>

// Custom deleter so std::unique_ptr can release memory that was
// allocated with cudaMallocHost (pinned / page-locked host memory).
struct CudaHostDeleter {
    void operator()(void* ptr) const {
        if (ptr) {
            cudaFreeHost(ptr);
        }
    }
};

// Convenience alias: owning smart pointer to a pinned host array of T.
template <typename T>
using start_pinned_ptr = std::unique_ptr<T[], CudaHostDeleter>;

// Allocate `count` elements of pinned (page-locked) host memory with
// cudaMallocHost and hand ownership to a smart pointer.
// Throws std::runtime_error if the CUDA allocation fails.
template <typename T>
start_pinned_ptr<T> allocate_pinned(size_t count) {
    void* raw = nullptr;
    const cudaError_t status = cudaMallocHost(&raw, count * sizeof(T));
    if (status != cudaSuccess) {
        throw std::runtime_error(std::string("cudaMallocHost failed: ") +
                                 cudaGetErrorString(status));
    }
    return start_pinned_ptr<T>(static_cast<T*>(raw));
}

// In-memory representation of an NF4-quantized weight file, with all
// buffers held in pinned host memory (see load_weights for the layout).
struct QuantizedWeights {
    int64_t num_rows;    // matrix rows, read from the file header
    int64_t num_cols;    // matrix cols, read from the file header
    int32_t block_size;  // elements per quantization block, from the header

    size_t num_blocks;   // derived: ceil(num_rows*num_cols / block_size)
    size_t num_groups;   // derived: ceil(num_blocks / 256)
    size_t packed_size;  // derived: (num_rows*num_cols)/2 bytes of 4-bit data

    // Pinned host buffers managed through unique_ptr + CudaHostDeleter.
    start_pinned_ptr<uint8_t> packed_weights;  // packed 4-bit weights
    start_pinned_ptr<uint8_t> absmax_q;        // one byte per block
    start_pinned_ptr<uint16_t> absmax2;        // one value per group
    start_pinned_ptr<uint16_t> code2;          // fixed 256-entry table

    float offset; // single float read from the tail of the file
};

// Load NF4-quantized weights from a binary file.
//
// Expected layout (little-endian, as read sequentially below):
//   int64 num_rows, int64 num_cols, int32 block_size,
//   uint8  packed_weights[(rows*cols)/2],
//   uint8  absmax_q[num_blocks],
//   uint16 absmax2[num_groups],
//   uint16 code2[256],
//   float  offset
// Throws std::runtime_error on open failure or any short read.
inline QuantizedWeights load_weights(const std::string& filename) {
    std::ifstream file(filename, std::ios::binary);
    if (!file.is_open()) {
        throw std::runtime_error("Failed to open file: " + filename);
    }

    QuantizedWeights w;

    // 1. Read the fixed-size header fields.
    if (!file.read(reinterpret_cast<char*>(&w.num_rows), sizeof(w.num_rows))) throw std::runtime_error("Failed to read num_rows");
    if (!file.read(reinterpret_cast<char*>(&w.num_cols), sizeof(w.num_cols))) throw std::runtime_error("Failed to read num_cols");
    if (!file.read(reinterpret_cast<char*>(&w.block_size), sizeof(w.block_size))) throw std::runtime_error("Failed to read block_size");

    // 2. Derive buffer sizes from the header.
    // NOTE(review): this assumes num_rows * num_cols is even (two 4-bit
    // values per byte); an odd element count would truncate here — confirm
    // the generator always pads to an even count.
    w.packed_size = (w.num_rows * w.num_cols) / 2;

    // num_blocks = ceil(num_rows * num_cols / block_size)
    w.num_blocks = (w.num_rows * w.num_cols + w.block_size - 1) / w.block_size;

    // num_groups = ceil(num_blocks / 256).
    // NOTE(review): the group size is hard-coded to 256 to match the fixed
    // 256-entry code2 table below — confirm against the data generator if
    // the format ever makes this configurable.
    size_t group_size = 256;
    w.num_groups = (w.num_blocks + group_size - 1) / group_size;

    // 3. Allocate pinned host buffers for every array section.
    try {
        w.packed_weights = allocate_pinned<uint8_t>(w.packed_size);
        w.absmax_q = allocate_pinned<uint8_t>(w.num_blocks);
        w.absmax2 = allocate_pinned<uint16_t>(w.num_groups);
        w.code2 = allocate_pinned<uint16_t>(256); // fixed 256-entry table
    } catch (const std::exception& e) {
        file.close();
        throw;
    }

    // 4. Read each array, failing loudly on a short read.
    auto read_array = [&](char* dst, size_t size, const char* name) {
        file.read(dst, size);
        if (file.gcount() != static_cast<std::streamsize>(size)) {
            throw std::runtime_error(std::string("Failed to read ") + name + ". Expected " + std::to_string(size) + " bytes, got " + std::to_string(file.gcount()));
        }
    };

    read_array(reinterpret_cast<char*>(w.packed_weights.get()), w.packed_size * sizeof(uint8_t), "packed_weights");
    read_array(reinterpret_cast<char*>(w.absmax_q.get()), w.num_blocks * sizeof(uint8_t), "absmax_q");
    read_array(reinterpret_cast<char*>(w.absmax2.get()), w.num_groups * sizeof(uint16_t), "absmax2");
    read_array(reinterpret_cast<char*>(w.code2.get()), 256 * sizeof(uint16_t), "code2");

    // 5. Read the trailing scalar offset.
    if (!file.read(reinterpret_cast<char*>(&w.offset), sizeof(w.offset))) {
        throw std::runtime_error("Failed to read offset");
    }

    // 6. Warn (non-fatal) if the file contains unexpected trailing bytes.
    if (file.peek() != EOF) {
        std::cerr << "Warning: Extra data found at the end of the file " << filename << std::endl;
    }

    file.close();
    return w;
}
Loading