Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
fcdc822
feat.: add 2025_winter assignment
Ziminli Jan 19, 2026
b52d284
docs: add more info in README.md
Ziminli Jan 19, 2026
6ab326b
docs: enrich README with detailed domestic GPU platform support info.
Ziminli Jan 20, 2026
9e7376e
fix: rebuild tester_nv.o with CUDA 11.8 for compatibility
Ziminli Jan 20, 2026
72ff7d2
refactor: relax the tolerance settings for flashAttention
Ziminli Jan 27, 2026
2e3eb4f
Implement trace and flashAttention CUDA kernels
trudging Feb 4, 2026
d4808b6
Optimize kernels: warp shuffle reduction, tiled attention, improved c…
trudging Feb 4, 2026
b2ce4d6
Performance tuning: grid-stride trace, larger attention tiles, __ldg …
trudging Feb 4, 2026
e30445f
Fix attention kernel: correct warp reduction, stable online softmax
trudging Feb 4, 2026
dd16a3a
Debug: force fallback kernel
trudging Feb 4, 2026
4715318
Restrict optimized kernel to known-good dimensions
trudging Feb 4, 2026
0f85a1c
Disable optimized kernel for correctness
trudging Feb 4, 2026
b047f61
Add MetaX (沐曦) platform adaptation
trudging Feb 4, 2026
a19d735
Add Moore Threads (摩尔线程) platform adaptation
trudging Feb 4, 2026
6179d29
Add Iluvatar (天数) platform compatibility with LDG macro
trudging Feb 4, 2026
4481ea3
Fix min() to std::min() for Iluvatar compatibility
trudging Feb 5, 2026
300069b
Fix numerical stability for MetaX and Iluvatar platforms
trudging Feb 5, 2026
72f2937
Restore float version for cross-platform compatibility (NVIDIA 90/90,…
trudging Feb 5, 2026
813c697
fix(moore): improve numerical stability in softmax calculation
trudging Feb 5, 2026
64c4470
fix(iluvatar): use double precision accumulator for stability
trudging Feb 5, 2026
309f99a
chore: update test standards (tester object files) from commit 65b2898
trudging Feb 6, 2026
2a468e9
fix: remove extraneous closing brace in kernels.cu
trudging Feb 6, 2026
b3d44b4
revert: restore float softmax for attention
trudging Feb 6, 2026
8ad6610
feat(nf4): add NF4 double-dequantization CUDA kernel and test scripts
trudging Mar 14, 2026
e43ef1e
Merge remote README
trudging Mar 14, 2026
5cb34c0
refactor(nf4): move trudging user files into trudging subdirectory
trudging Mar 14, 2026
09d90c5
fix(nf4): correctly apply templates, params reading, and output gener…
trudging Mar 14, 2026
7c2364a
Refactor nf4_dequant to support templates and speedup calculation
trudging Mar 14, 2026
9c1f288
Support MUSA, MACA, and ILUVATAR platforms seamlessly
trudging Mar 14, 2026
a3717a2
Fix CUDA runtime macro abstraction for all platforms
trudging Mar 15, 2026
907e3dd
fix(metax): update maca headers, bfloat16 types and device sync API f…
trudging Mar 15, 2026
2aeb438
fix(moore): final adaptations for Moore Threads (mt_bfloat16 type cas…
trudging Mar 16, 2026
391cd7d
docs: add multi-platform README instructions for CUDA & domestics
trudging Mar 16, 2026
4e15cd0
docs: update README and add nf4_report files
trudging Mar 16, 2026
37d3de3
docs: fix README garbled text and remove emojis
trudging Mar 16, 2026
2a93c35
docs: relocate nf4_report directory and update README
trudging Mar 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions 03_nf4_dequant/trudging/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
cmake_minimum_required(VERSION 3.18)

# 1. Project and languages.
project(nf4_dequantizer LANGUAGES CXX CUDA)

# 2. Language standards.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)

# Target GPU architectures: T4 (75), A100 (80), RTX 4090 (89).
set(CMAKE_CUDA_ARCHITECTURES 75 80 89)

# 3. Default build type — only for single-config generators; multi-config
# generators (VS, Xcode, Ninja Multi-Config) ignore CMAKE_BUILD_TYPE.
get_property(_nf4_is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
if(NOT _nf4_is_multi_config AND NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
endif()

# 4. Executable.
add_executable(nf4_dequantizer main.cu src/dequantize.cu)

# 5. Header search path (project-private).
target_include_directories(nf4_dequantizer PRIVATE src)

# 6. Host-side optimization for Release builds.
# (Release already implies -O3; kept explicit per project preference.)
target_compile_options(nf4_dequantizer PRIVATE
  "$<$<CONFIG:Release>:-O3>"
)

# 7. CUDA-only profiling / optimization flags:
#   -lineinfo           — line info for Nsight Compute source correlation
#   --ptxas-options=-v  — PTX assembler verbose report (register usage etc.)
#   -use_fast_math      — fast math intrinsics
#   -O3                 — forwarded to the host compiler by nvcc
# NOTE: the generator expression must be a single quoted argument with
# ';'-separated options — splitting it across unquoted lines produces an
# unterminated genex and a generate-time error.
target_compile_options(nf4_dequantizer PRIVATE
  "$<$<COMPILE_LANGUAGE:CUDA>:-lineinfo;--ptxas-options=-v;-use_fast_math;-O3>"
)

# 8. Link options would go here via target_link_libraries(... PRIVATE ...).

message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")
48 changes: 48 additions & 0 deletions 03_nf4_dequant/trudging/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Learning-CUDA nf4_dequant Makefile
# Target platforms supported: nvidia (default), iluvatar, moore, metax
#
# Usage:
#   make PLATFORM=<nvidia|iluvatar|moore|metax> [build|run|clean]
# Each platform selects its vendor compiler, preprocessor define, and —
# for moore/metax — an alternative source-file suffix (.mu / .maca).

PLATFORM ?= nvidia
PLATFORM_DEFINE ?= -DPLATFORM_NVIDIA
# Source extension used for main/kernel files; overridden per platform below.
STUDENT_SUFFIX := cu
CFLAGS := -std=c++17 -O3
EXTRA_LIBS :=

ifeq ($(PLATFORM),nvidia)
# Native CUDA: nvcc with profiling flags and a fixed sm_80 target.
CC := nvcc
PLATFORM_DEFINE := -DPLATFORM_NVIDIA
CFLAGS += -lineinfo --ptxas-options=-v -use_fast_math -arch=sm_80
else ifeq ($(PLATFORM),iluvatar)
# Iluvatar (CoreX): LLVM-based clang++ against the corex SDK.
CC := clang++
PLATFORM_DEFINE := -DPLATFORM_ILUVATAR
EXTRA_LIBS := -lcudart -I/usr/local/corex/include -L/usr/local/corex/lib64 -fPIC
else ifeq ($(PLATFORM),moore)
# Moore Threads (MUSA): mcc compiler, sources use the .mu suffix.
CC := mcc
STUDENT_SUFFIX := mu
PLATFORM_DEFINE := -DPLATFORM_MOORE
EXTRA_LIBS := -I/usr/local/musa/include -L/usr/lib/gcc/x86_64-linux-gnu/11/ -L/usr/local/musa/lib -lmusart
else ifeq ($(PLATFORM),metax)
# MetaX (MACA): mxcc compiler, sources use the .maca suffix.
CC := mxcc
STUDENT_SUFFIX := maca
PLATFORM_DEFINE := -DPLATFORM_METAX
else
$(error Unsupported PLATFORM '$(PLATFORM)' (expected: nvidia, iluvatar, moore, metax))
endif

TARGET := nf4_dequantizer
MAIN_SRC := main.$(STUDENT_SUFFIX)
KERNEL_SRC := src/dequantize.$(STUDENT_SUFFIX)

.PHONY: all build run clean

# Default target: compile then execute.
all: build run

build: $(TARGET)

run: $(TARGET)
	./$(TARGET)

clean:
	rm -f $(TARGET) *.o

$(TARGET): $(MAIN_SRC) $(KERNEL_SRC)
	$(CC) $(CFLAGS) $(PLATFORM_DEFINE) -o $@ $^ $(EXTRA_LIBS)
78 changes: 78 additions & 0 deletions 03_nf4_dequant/trudging/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# NF4 Dequantization - Multi-Platform Support (NVIDIA & 国产芯片)

具体报告于nf4_report中
这是一个实现了 QLoRA 4-bit NormalFloat (NF4) 动态反量化算子的项目。
当前工程不仅支持原生 NVIDIA GPU,还成功适配了国内主流的三大算力平台:
- **NVIDIA (NVIDIA GPU)**
- **Iluvatar (天数智芯)**
- **Moore Threads (摩尔线程)**
- **MetaX (沐曦)**

---

## 1. 环境准备 (Prerequisites)

在进行编译和测试之前,需要在各自平台/容器中安装必要的 Python 依赖以生成测试用例。测试数据生成脚本依赖于 `torch`、`numpy` (和可选的 `bitsandbytes`)。

```bash
# 推荐使用国内镜像源下载依赖 (必须确保 numpy 版本为 1.x 代以防止 PyTorch 不兼容)
pip3 install "numpy<2.0.0" torch bitsandbytes -i https://pypi.tuna.tsinghua.edu.cn/simple --force-reinstall
```

## 2. 生成测试数据

在正式编译与运行算子之前,首先需要利用 PyTorch 和 Bitsandbytes 在本地生成模拟的 `test_weights.bin` 和真实基准参考文件 `ground_truth.bin` 以及配置 `params.txt` :

```bash
python3 generate_test_data.py
```
> **注意**:如果在只搭载国产芯片且无正常 CUDA 执行库的镜像上,此脚本也可以无缝生成二进制文件用于后续的 C++ 端纯前向推理测试。

## 3. 多平台编译与测试指令

项目采用了一套统一的 `Makefile` 并通过 `PLATFORM` 变量实现平台路由。只需在 `make` 时通过 `PLATFORM=` 指定目标芯片厂商环境。

### 3.1 NVIDIA (默认平台)
```bash
make clean
# 编译
make PLATFORM=nvidia build
# 运行
./nf4_dequantizer
```

### 3.2 Iluvatar (天数智芯)
天数智芯平台使用 `clang++` (基于 LLVM) 和 `corex` 构建库。使用前请确保你已经通过 K8s 进入了包含天数 SDK `corex` 的容器中。
```bash
make clean
# 编译
make PLATFORM=iluvatar build
# 运行
./nf4_dequantizer
```

### 3.3 Moore Threads (摩尔线程)
摩尔线程平台基于 MUSA 核心架构,使用 `mcc` 编译并将自动使用 `.mu` 为扩展名的特化源码。
```bash
make clean
# 编译
make PLATFORM=moore build
# 运行
./nf4_dequantizer
```

### 3.4 MetaX (沐曦)
沐曦平台基于 MACA 核心架构,使用 `mxcc` 编译并将自动使用 `.maca` 为扩展名的特化源码。
```bash
make clean
# 编译
make PLATFORM=metax build
# 运行
./nf4_dequantizer
```

## 4. 特性与修改点 (Changelog)

- 移除了裸写 `cudaMallocHost` 的硬编码,取而代之为宏包装,兼容各个平台的 Pinned Memory 分配(如 `mcMallocHost`)。
- 针对沐曦使用内置的 `maca_bfloat16.h` 进行完整支持。
- 针对于摩尔线程 `__halves2musa_bfloat162` 缺失情况,使用了寄存器级位运算拼接(`bitwise packing`)完成平替保护。
120 changes: 120 additions & 0 deletions 03_nf4_dequant/trudging/fix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#pragma once

#include <string>
#include <vector>
#include <fstream>
#include <iostream>
#include <memory>
#include <cmath>
#include <cstdint>
#include <stdexcept>
#include <cuda_runtime.h>

// Custom deleter so std::unique_ptr can release memory that was
// allocated with cudaMallocHost (pinned / page-locked host memory).
struct CudaHostDeleter {
    void operator()(void* ptr) const {
        if (ptr) {
            cudaFreeHost(ptr);
        }
    }
};

// Convenience alias: owning smart pointer to a pinned host array of T.
template <typename T>
using start_pinned_ptr = std::unique_ptr<T[], CudaHostDeleter>;

// Allocate `count` elements of pinned (page-locked) host memory with
// cudaMallocHost and hand ownership to a smart pointer.
// Throws std::runtime_error if the CUDA allocation fails.
template <typename T>
start_pinned_ptr<T> allocate_pinned(size_t count) {
    void* raw = nullptr;
    const cudaError_t status = cudaMallocHost(&raw, count * sizeof(T));
    if (status != cudaSuccess) {
        throw std::runtime_error(std::string("cudaMallocHost failed: ") +
                                 cudaGetErrorString(status));
    }
    return start_pinned_ptr<T>(static_cast<T*>(raw));
}

// In-memory representation of an NF4-quantized weight file, with all
// buffers held in pinned host memory (see load_weights for the layout).
struct QuantizedWeights {
    int64_t num_rows;    // matrix rows, read from the file header
    int64_t num_cols;    // matrix cols, read from the file header
    int32_t block_size;  // elements per quantization block, from the header

    size_t num_blocks;   // derived: ceil(num_rows*num_cols / block_size)
    size_t num_groups;   // derived: ceil(num_blocks / 256)
    size_t packed_size;  // derived: (num_rows*num_cols)/2 bytes of 4-bit data

    // Pinned host buffers managed through unique_ptr + CudaHostDeleter.
    start_pinned_ptr<uint8_t> packed_weights;  // packed 4-bit weights
    start_pinned_ptr<uint8_t> absmax_q;        // one byte per block
    start_pinned_ptr<uint16_t> absmax2;        // one value per group
    start_pinned_ptr<uint16_t> code2;          // fixed 256-entry table

    float offset; // single float read from the tail of the file
};

// Load NF4-quantized weights from a binary file.
//
// Expected layout (little-endian, as read sequentially below):
//   int64 num_rows, int64 num_cols, int32 block_size,
//   uint8  packed_weights[(rows*cols)/2],
//   uint8  absmax_q[num_blocks],
//   uint16 absmax2[num_groups],
//   uint16 code2[256],
//   float  offset
// Throws std::runtime_error on open failure or any short read.
inline QuantizedWeights load_weights(const std::string& filename) {
    std::ifstream file(filename, std::ios::binary);
    if (!file.is_open()) {
        throw std::runtime_error("Failed to open file: " + filename);
    }

    QuantizedWeights w;

    // 1. Read the fixed-size header fields.
    if (!file.read(reinterpret_cast<char*>(&w.num_rows), sizeof(w.num_rows))) throw std::runtime_error("Failed to read num_rows");
    if (!file.read(reinterpret_cast<char*>(&w.num_cols), sizeof(w.num_cols))) throw std::runtime_error("Failed to read num_cols");
    if (!file.read(reinterpret_cast<char*>(&w.block_size), sizeof(w.block_size))) throw std::runtime_error("Failed to read block_size");

    // 2. Derive buffer sizes from the header.
    // NOTE(review): this assumes num_rows * num_cols is even (two 4-bit
    // values per byte); an odd element count would truncate here — confirm
    // the generator always pads to an even count.
    w.packed_size = (w.num_rows * w.num_cols) / 2;

    // num_blocks = ceil(num_rows * num_cols / block_size)
    w.num_blocks = (w.num_rows * w.num_cols + w.block_size - 1) / w.block_size;

    // num_groups = ceil(num_blocks / 256).
    // NOTE(review): the group size is hard-coded to 256 to match the fixed
    // 256-entry code2 table below — confirm against the data generator if
    // the format ever makes this configurable.
    size_t group_size = 256;
    w.num_groups = (w.num_blocks + group_size - 1) / group_size;

    // 3. Allocate pinned host buffers for every array section.
    try {
        w.packed_weights = allocate_pinned<uint8_t>(w.packed_size);
        w.absmax_q = allocate_pinned<uint8_t>(w.num_blocks);
        w.absmax2 = allocate_pinned<uint16_t>(w.num_groups);
        w.code2 = allocate_pinned<uint16_t>(256); // fixed 256-entry table
    } catch (const std::exception& e) {
        file.close();
        throw;
    }

    // 4. Read each array, failing loudly on a short read.
    auto read_array = [&](char* dst, size_t size, const char* name) {
        file.read(dst, size);
        if (file.gcount() != static_cast<std::streamsize>(size)) {
            throw std::runtime_error(std::string("Failed to read ") + name + ". Expected " + std::to_string(size) + " bytes, got " + std::to_string(file.gcount()));
        }
    };

    read_array(reinterpret_cast<char*>(w.packed_weights.get()), w.packed_size * sizeof(uint8_t), "packed_weights");
    read_array(reinterpret_cast<char*>(w.absmax_q.get()), w.num_blocks * sizeof(uint8_t), "absmax_q");
    read_array(reinterpret_cast<char*>(w.absmax2.get()), w.num_groups * sizeof(uint16_t), "absmax2");
    read_array(reinterpret_cast<char*>(w.code2.get()), 256 * sizeof(uint16_t), "code2");

    // 5. Read the trailing scalar offset.
    if (!file.read(reinterpret_cast<char*>(&w.offset), sizeof(w.offset))) {
        throw std::runtime_error("Failed to read offset");
    }

    // 6. Warn (non-fatal) if the file contains unexpected trailing bytes.
    if (file.peek() != EOF) {
        std::cerr << "Warning: Extra data found at the end of the file " << filename << std::endl;
    }

    file.close();
    return w;
}
Loading