diff --git a/03_nf4_dequant/trudging/CMakeLists.txt b/03_nf4_dequant/trudging/CMakeLists.txt
new file mode 100644
index 00000000..43569405
--- /dev/null
+++ b/03_nf4_dequant/trudging/CMakeLists.txt
@@ -0,0 +1,49 @@
+cmake_minimum_required(VERSION 3.18)
+
+# 1. é¡¹ç›®ä¸Žè¯­è¨€
+project(nf4_dequantizer LANGUAGES CXX CUDA)
+
+# 2. ç‰ˆæœ¬ä¸Žæž¶æž„è®¾ç½®
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+
+# ç›®æ ‡æž¶æž„ï¼šT4 (75), A100 (80), 4090 (89)
+set(CMAKE_CUDA_ARCHITECTURES 75 80 89)
+
+# 3. é»˜è®¤æž„å»ºç±»åž‹
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release)
+endif()
+
+# 4. å¯æ‰§è¡Œæ–‡ä»¶
+add_executable(nf4_dequantizer main.cu src/dequantize.cu)
+
+# 5. å¤´æ–‡ä»¶ç›®å½•
+target_include_directories(nf4_dequantizer PRIVATE src)
+
+# 6. ç¼–è¯‘ä¼˜åŒ–é€‰é¡¹ (Release æ¨¡å¼)
+# -O3 ä¸ºæœ€é«˜çº§åˆ«ä¼˜åŒ– (Release é»˜è®¤åŒ…å« -O3ï¼Œä½†æˆ‘ä»¬å¯ä»¥æ˜¾å¼ä¿è¯)
+target_compile_options(nf4_dequantizer PRIVATE 
+    $<$<CONFIG:Release>:-O3>
+)
+
+# 7. æ€§èƒ½åˆ†æžä¸Žä¼˜åŒ– (CUDA)
+# -lineinfo: ç”Ÿæˆè¡Œå·ä¿¡æ¯ï¼Œç”¨äºŽ Nsight Compute å¯¹ç…§æºç 
+# --ptxas-options=-v: æ˜¾ç¤º PTX æ±‡ç¼–è¯¦ç»†ä¿¡æ¯ (å¦‚å¯„å­˜å™¨ä½¿ç”¨é‡)
+# -use_fast_math: å¯ç”¨å¿«é€Ÿæ•°å­¦åº“
+target_compile_options(nf4_dequantizer PRIVATE
+    $<$<COMPILE_LANGUAGE:CUDA>:
+        -lineinfo
+        --ptxas-options=-v
+        -use_fast_math
+        -O3 
+    >
+)
+
+# 8. é“¾æŽ¥é€‰é¡¹ (å¦‚æœ‰å¿…è¦)
+# target_link_libraries(nf4_dequantizer PRIVATE ...)
+
+message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
+message(STATUS "CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")
diff --git a/03_nf4_dequant/trudging/Makefile b/03_nf4_dequant/trudging/Makefile
new file mode 100644
index 00000000..62c872fd
--- /dev/null
+++ b/03_nf4_dequant/trudging/Makefile
@@ -0,0 +1,48 @@
+# Learning-CUDA nf4_dequant Makefile
+# Target platforms supported: nvidia (default), iluvatar, moore, metax
+
+PLATFORM        ?= nvidia
+PLATFORM_DEFINE ?= -DPLATFORM_NVIDIA
+STUDENT_SUFFIX  := cu
+CFLAGS          := -std=c++17 -O3
+EXTRA_LIBS      :=
+
+ifeq ($(PLATFORM),nvidia)
+    CC                  := nvcc
+    PLATFORM_DEFINE     := -DPLATFORM_NVIDIA
+    CFLAGS              += -lineinfo --ptxas-options=-v -use_fast_math -arch=sm_80
+else ifeq ($(PLATFORM),iluvatar)
+    CC                  := clang++
+    PLATFORM_DEFINE     := -DPLATFORM_ILUVATAR
+    EXTRA_LIBS          := -lcudart -I/usr/local/corex/include -L/usr/local/corex/lib64 -fPIC
+else ifeq ($(PLATFORM),moore)
+    CC                  := mcc
+    STUDENT_SUFFIX      := mu
+    PLATFORM_DEFINE     := -DPLATFORM_MOORE
+    EXTRA_LIBS          := -I/usr/local/musa/include -L/usr/lib/gcc/x86_64-linux-gnu/11/ -L/usr/local/musa/lib -lmusart
+else ifeq ($(PLATFORM),metax)
+    CC                  := mxcc
+    STUDENT_SUFFIX      := maca
+    PLATFORM_DEFINE     := -DPLATFORM_METAX
+else
+    $(error Unsupported PLATFORM '$(PLATFORM)' (expected: nvidia, iluvatar, moore, metax))
+endif
+
+TARGET := nf4_dequantizer
+MAIN_SRC := main.$(STUDENT_SUFFIX)
+KERNEL_SRC := src/dequantize.$(STUDENT_SUFFIX)
+
+.PHONY: all build run clean
+
+all: build run
+
+build: $(TARGET)
+
+run: $(TARGET)
+	./$(TARGET)
+
+clean:
+	rm -f $(TARGET) *.o
+
+$(TARGET): $(MAIN_SRC) $(KERNEL_SRC)
+	$(CC) $(CFLAGS) $(PLATFORM_DEFINE) -o $@ $^ $(EXTRA_LIBS)
diff --git a/03_nf4_dequant/trudging/README.md b/03_nf4_dequant/trudging/README.md
new file mode 100644
index 00000000..6922a9d8
--- /dev/null
+++ b/03_nf4_dequant/trudging/README.md
@@ -0,0 +1,78 @@
+ï»¿# NF4 Dequantization - Multi-Platform Support (NVIDIA & å›½äº§èŠ¯ç‰‡)
+
+å…·ä½“æŠ¥å‘ŠäºŽnf4_reportä¸­
+è¿™æ˜¯ä¸€ä¸ªå®žçŽ°äº† QLoRA 4-bit NormalFloat (NF4) åŠ¨æ€åé‡åŒ–ç®—å­çš„é¡¹ç›®ã€‚
+å½“å‰å·¥ç¨‹ä¸ä»…æ”¯æŒåŽŸç”Ÿ NVIDIA GPUï¼Œè¿˜æˆåŠŸé€‚é…äº†å›½å†…ä¸»æµçš„ä¸‰å¤§ç®—åŠ›å¹³å°ï¼š
+- **NVIDIA (NVIDIA GPU)**
+- **Iluvatar (å¤©æ•°æ™ºèŠ¯)**
+- **Moore Threads (æ‘©å°”çº¿ç¨‹)**
+- **MetaX (æ²æ›¦)**
+
+---
+
+## 1. çŽ¯å¢ƒå‡†å¤‡ (Prerequisites)
+
+åœ¨è¿›è¡Œç¼–è¯‘å’Œæµ‹è¯•ä¹‹å‰ï¼Œéœ€è¦åœ¨å„è‡ªå¹³å°/å®¹å™¨ä¸­å®‰è£…å¿…è¦çš„ Python ä¾èµ–ä»¥ç”Ÿæˆæµ‹è¯•ç”¨ä¾‹ã€‚æµ‹è¯•æ•°æ®ç”Ÿæˆè„šæœ¬ä¾èµ–äºŽ `torch`ã€`numpy` ï¼ˆå’Œå¯é€‰çš„ `bitsandbytes`ï¼‰ã€‚
+
+```bash
+# æŽ¨èä½¿ç”¨å›½å†…é•œåƒæºä¸‹è½½ä¾èµ– (å¿…é¡»ç¡®ä¿ numpy ç‰ˆæœ¬ä¸º 1.x ä»£ä»¥é˜²æ­¢ PyTorch ä¸å…¼å®¹)
+pip3 install "numpy<2.0.0" torch bitsandbytes -i https://pypi.tuna.tsinghua.edu.cn/simple --force-reinstall
+```
+
+## 2. ç”Ÿæˆæµ‹è¯•æ•°æ®
+
+åœ¨æ­£å¼ç¼–è¯‘ä¸Žè¿è¡Œç®—å­ä¹‹å‰ï¼Œé¦–å…ˆéœ€è¦åˆ©ç”¨ PyTorch å’Œ Bitsandbytes åœ¨æœ¬åœ°ç”Ÿæˆæ¨¡æ‹Ÿçš„ `test_weights.bin` å’ŒçœŸå®žåŸºå‡†å‚è€ƒæ–‡ä»¶ `ground_truth.bin` ä»¥åŠé…ç½® `params.txt` ï¼š
+
+```bash
+python3 generate_test_data.py
+```
+> **æ³¨æ„**ï¼šå¦‚æžœåœ¨åªæ­è½½å›½äº§èŠ¯ç‰‡ä¸”æ— æ­£å¸¸ CUDA æ‰§è¡Œåº“çš„é•œåƒä¸Šï¼Œæ­¤è„šæœ¬ä¹Ÿå¯ä»¥æ— ç¼ç”ŸæˆäºŒè¿›åˆ¶æ–‡ä»¶ç”¨äºŽåŽç»­çš„ C++ ç«¯çº¯å‰å‘æŽ¨ç†æµ‹è¯•ã€‚
+
+## 3. å¤šå¹³å°ç¼–è¯‘ä¸Žæµ‹è¯•æŒ‡ä»¤
+
+é¡¹ç›®é‡‡ç”¨äº†ä¸€å¥—ç»Ÿä¸€çš„ `Makefile` å¹¶é€šè¿‡ `PLATFORM` å˜é‡å®žçŽ°å¹³å°è·¯ç”±ã€‚åªéœ€åœ¨ `make` æ—¶é€šè¿‡ `PLATFORM=` æŒ‡å®šç›®æ ‡èŠ¯ç‰‡åŽ‚å•†çŽ¯å¢ƒã€‚
+
+### 3.1 NVIDIA (é»˜è®¤å¹³å°)
+```bash
+make clean
+# ç¼–è¯‘
+make PLATFORM=nvidia build
+# è¿è¡Œ
+./nf4_dequantizer
+```
+
+### 3.2 Iluvatar (å¤©æ•°æ™ºèŠ¯)
+å¤©æ•°æ™ºèŠ¯å¹³å°ä½¿ç”¨ `clang++` (åŸºäºŽ LLVM) å’Œ `corex` æž„å»ºåº“ã€‚ä½¿ç”¨å‰è¯·ç¡®ä¿ä½ å·²ç»é€šè¿‡ K8s è¿›å…¥äº†åŒ…å«å¤©æ•° SDK `corex` çš„å®¹å™¨ä¸­ã€‚
+```bash
+make clean
+# ç¼–è¯‘
+make PLATFORM=iluvatar build
+# è¿è¡Œ
+./nf4_dequantizer
+```
+
+### 3.3 Moore Threads (æ‘©å°”çº¿ç¨‹)
+æ‘©å°”çº¿ç¨‹å¹³å°åŸºäºŽ MUSA æ ¸å¿ƒæž¶æž„ï¼Œä½¿ç”¨ `mcc` ç¼–è¯‘å¹¶å°†è‡ªåŠ¨ä½¿ç”¨ `.mu` ä¸ºæ‹“å±•åçš„ç‰¹åŒ–æºç ã€‚
+```bash
+make clean
+# ç¼–è¯‘
+make PLATFORM=moore build
+# è¿è¡Œ
+./nf4_dequantizer
+```
+
+### 3.4 MetaX (æ²æ›¦)
+æ²æ›¦å¹³å°åŸºäºŽ MACA æ ¸å¿ƒæž¶æž„ï¼Œä½¿ç”¨ `mxcc` ç¼–è¯‘å¹¶å°†è‡ªåŠ¨ä½¿ç”¨ `.maca` ä¸ºæ‹“å±•åçš„ç‰¹åŒ–æºç ã€‚
+```bash
+make clean
+# ç¼–è¯‘
+make PLATFORM=metax build
+# è¿è¡Œ
+./nf4_dequantizer
+```
+
+## 4. ç‰¹æ€§ä¸Žä¿®æ”¹ç‚¹ (Changelog)
+
+- ç§»é™¤äº†è£¸å†™ `cudaMallocHost` çš„ç¡¬ç¼–ç ï¼Œå–è€Œä»£ä¹‹ä¸ºå®åŒ…è£…ï¼Œå…¼å®¹å„ä¸ªå¹³å°çš„ Pinned Memory åˆ†é…ï¼ˆå¦‚ `mcMallocHost`ï¼‰ã€‚
+- é’ˆå¯¹æ²æ›¦ä½¿ç”¨å†…ç½®çš„ `maca_bfloat16.h` è¿›è¡Œå®Œæ•´æ”¯æŒã€‚
+- é’ˆå¯¹äºŽæ‘©å°”çº¿ç¨‹ `__halves2musa_bfloat162` ç¼ºå¤±æƒ…å†µï¼Œä½¿ç”¨äº†å¯„å­˜å™¨çº§ä½è¿ç®—æ‹¼æŽ¥ï¼ˆ`bitwise packing`ï¼‰å®Œæˆå¹³æ›¿ä¿æŠ¤ã€‚
diff --git a/03_nf4_dequant/trudging/fix.py b/03_nf4_dequant/trudging/fix.py
new file mode 100644
index 00000000..8e500c7f
--- /dev/null
+++ b/03_nf4_dequant/trudging/fix.py
@@ -0,0 +1,120 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <cmath>
+#include <cstdint>
+#include <stdexcept>
+#include <cuda_runtime.h>
+
+// ×Ô¶¨ÒåÉ¾³ýÆ÷£¬ÓÃÓÚ std::unique_ptr ¹ÜÀí cudaMallocHost ·ÖÅäµÄÄÚ´æ
+struct CudaHostDeleter {
+    void operator()(void* ptr) const {
+        if (ptr) {
+            cudaFreeHost(ptr);
+        }
+    }
+};
+
+// ±ðÃû¶¨Òå£¬·½±ãÊ¹ÓÃ
+template <typename T>
+using start_pinned_ptr = std::unique_ptr<T[], CudaHostDeleter>;
+
+// ¸¨Öúº¯Êý£º·ÖÅä pinned memory
+template <typename T>
+start_pinned_ptr<T> allocate_pinned(size_t count) {
+    void* ptr = nullptr;
+    cudaError_t err = cudaMallocHost(&ptr, count * sizeof(T));
+    if (err != cudaSuccess) {
+        throw std::runtime_error(std::string("cudaMallocHost failed: ") + cudaGetErrorString(err));
+    }
+    return start_pinned_ptr<T>(static_cast<T*>(ptr));
+}
+
+struct QuantizedWeights {
+    int64_t num_rows;
+    int64_t num_cols;
+    int32_t block_size;
+
+    size_t num_blocks;
+    size_t num_groups;
+    size_t packed_size;
+
+    // Ê¹ÓÃÖÇÄÜÖ¸Õë¹ÜÀíµÄ Pinned Memory Êý×é
+    start_pinned_ptr<uint8_t> packed_weights;
+    start_pinned_ptr<uint8_t> absmax_q;
+    start_pinned_ptr<uint16_t> absmax2;
+    start_pinned_ptr<uint16_t> code2;
+    
+    float offset; // µ¥¸ö float Öµ
+};
+
+inline QuantizedWeights load_weights(const std::string& filename) {
+    std::ifstream file(filename, std::ios::binary);
+    if (!file.is_open()) {
+        throw std::runtime_error("Failed to open file: " + filename);
+    }
+
+    QuantizedWeights w;
+
+    // 1. ¶ÁÈ¡Í·²¿
+    if (!file.read(reinterpret_cast<char*>(&w.num_rows), sizeof(w.num_rows))) throw std::runtime_error("Failed to read num_rows");
+    if (!file.read(reinterpret_cast<char*>(&w.num_cols), sizeof(w.num_cols))) throw std::runtime_error("Failed to read num_cols");
+    if (!file.read(reinterpret_cast<char*>(&w.block_size), sizeof(w.block_size))) throw std::runtime_error("Failed to read block_size");
+
+    // 2. ¼ÆËã¸÷²¿·Ö´óÐ¡
+    // ×¢Òâ£ºÕâÀï¼ÙÉè num_rows * num_cols ÊÇÅ¼Êý£¬»òÕß°´ÕÕ (N*M)/2 ÏòÏÂÈ¡Õû¡£
+    // Èç¹ûÊÇ 4-bit Á¿»¯£¬Í¨³£ÄãÐèÒªÈ·±£×ÜÔªËØ¸öÊýÊÇÅ¼Êý£¬»òÕß´¦ÀíÎ²²¿ padding¡£
+    w.packed_size = (w.num_rows * w.num_cols) / 2;
+    
+    // num_blocks = ceil(num_rows * num_cols / blocksize)
+    w.num_blocks = (w.num_rows * w.num_cols + w.block_size - 1) / w.block_size;
+    
+    // num_groups = ceil(num_blocks / 256)
+    // ¸ù¾ÝÄúµÄÒªÇó£ºblock_size_2 Îª¹Ì¶¨ 256
+    // ×¢£ºÔ­ÎÊÌâÖÐÌáµ½ "absmax2: ... ³¤¶ÈÎª num_groups (¼ÙÉè¹Ì¶¨Îª 256)" 
+    // µ«ºóÐø×·ÎÊÖ¸³öÓ¦Îª¼ÆËãÖµ¡£´Ë´¦°´×·ÎÊÂß¼­¼ÆËã num_groups¡£
+    // Èç¹û "¼ÙÉè¹Ì¶¨Îª 256" Ö¸µÄÊÇ group_size£¬ÔòÈçÏÂ¼ÆËã£º
+    size_t group_size = 256;
+    w.num_groups = (w.num_blocks + group_size - 1) / group_size;
+
+    // 3. ·ÖÅä Pinned Memory
+    try {
+        w.packed_weights = allocate_pinned<uint8_t>(w.packed_size);
+        w.absmax_q = allocate_pinned<uint8_t>(w.num_blocks);
+        w.absmax2 = allocate_pinned<uint16_t>(w.num_groups);
+        w.code2 = allocate_pinned<uint16_t>(256); // ¹Ì¶¨ 256 ÔªËØ
+    } catch (const std::exception& e) {
+        file.close();
+        throw;
+    }
+
+    // 4. ¶ÁÈ¡Êý¾ÝÊý×é
+    auto read_array = [&](char* dst, size_t size, const char* name) {
+        file.read(dst, size);
+        if (file.gcount() != static_cast<std::streamsize>(size)) {
+            throw std::runtime_error(std::string("Failed to read ") + name + ". Expected " + std::to_string(size) + " bytes, got " + std::to_string(file.gcount()));
+        }
+    };
+
+    read_array(reinterpret_cast<char*>(w.packed_weights.get()), w.packed_size * sizeof(uint8_t), "packed_weights");
+    read_array(reinterpret_cast<char*>(w.absmax_q.get()), w.num_blocks * sizeof(uint8_t), "absmax_q");
+    read_array(reinterpret_cast<char*>(w.absmax2.get()), w.num_groups * sizeof(uint16_t), "absmax2");
+    read_array(reinterpret_cast<char*>(w.code2.get()), 256 * sizeof(uint16_t), "code2");
+
+    // 5. ¶ÁÈ¡ offset
+    if (!file.read(reinterpret_cast<char*>(&w.offset), sizeof(w.offset))) {
+        throw std::runtime_error("Failed to read offset");
+    }
+
+    // 6. ¼ì²éÊÇ·ñ»¹ÓÐÊ£ÓàÊý¾Ý£¨¿ÉÑ¡£¬ÊÓÎÄ¼þ¸ñÊ½ÑÏ¸ñ³Ì¶È¶ø¶¨£©
+    if (file.peek() != EOF) {
+        std::cerr << "Warning: Extra data found at the end of the file " << filename << std::endl;
+    }
+
+    file.close();
+    return w;
+}
diff --git a/03_nf4_dequant/trudging/generate_test_data.py b/03_nf4_dequant/trudging/generate_test_data.py
new file mode 100644
index 00000000..3f68ce43
--- /dev/null
+++ b/03_nf4_dequant/trudging/generate_test_data.py
@@ -0,0 +1,158 @@
+import torch
+import struct
+import math
+import numpy as np
+import time
+
+try:
+    import bitsandbytes as bnb
+    HAS_BNB = True
+except ImportError:
+    print("Warning: bitsandbytes not found. Baseline profiling will be skipped.")
+    HAS_BNB = False
+
+def profile_bnb_baseline(tensor_shape, blocksize):
+    if not HAS_BNB:
+        return
+    
+    print("\n--- Profiling bitsandbytes Baseline ---")
+    # å¼ºåˆ¶åœ¨ GPU ä¸Šåˆ†é…æµ‹è¯•æ•°æ®
+    x = torch.randn(tensor_shape, dtype=torch.float16, device="cuda")
+    
+    try:
+        # åŒé‡é‡åŒ–
+        print("Quantizing tensor...")
+        quantized_tensor, quant_state = bnb.functional.quantize_4bit(
+            x, 
+            quant_type="nf4", 
+            compress_statistics=True
+        )
+        
+        # é¢„çƒ­
+        print("Warming up dequantize...")
+        for _ in range(10):
+            _ = bnb.functional.dequantize_4bit(quantized_tensor, quant_state)
+        torch.cuda.synchronize()
+        
+        # æµ‹é€Ÿ
+        print("Profiling dequantize (100 runs)...")
+        num_runs = 100
+        start_time = time.time()
+        for _ in range(num_runs):
+            _ = bnb.functional.dequantize_4bit(quantized_tensor, quant_state)
+        torch.cuda.synchronize()
+        end_time = time.time()
+        
+        avg_time_ms = ((end_time - start_time) / num_runs) * 1000
+        print(f"bitsandbytes Baseline Dequantize Avg Time: {avg_time_ms:.4f} ms")
+        
+    except Exception as e:
+        print(f"Failed to profile bitsandbytes: {e}")
+
+def create_mock_data_and_save(num_rows, num_cols, blocksize):
+    print("\n--- Generating Mock Data for C++ Test ---")
+    
+    total_elements = num_rows * num_cols
+    packed_size = total_elements // 2
+    num_blocks = math.ceil(total_elements / blocksize)
+    num_groups = math.ceil(num_blocks / 256)
+    
+    # ä¸ºäº†å¯¹æ¯”éªŒè¯è®¡ç®—é€»è¾‘ï¼Œæˆ‘ä»¬ç”Ÿæˆå›ºå®šçš„ mock æ•°æ® (æ–¹ä¾¿åæŽ¨)
+    # packed_weights: éšæœº 0~255
+    packed_weights = torch.randint(0, 256, (packed_size,), dtype=torch.uint8, device="cpu")
+    # absmax_q (ç”±äºŽæ˜¯ uint8, mock èŒƒå›´ 0~255)
+    absmax_q = torch.randint(0, 256, (num_blocks,), dtype=torch.uint8, device="cpu")
+    # absmax2 (float16: mock ä¸€äº›æœ‰æ•ˆéžé›¶æ•°å€¼, e.g. 1.0 ~ 2.0)
+    absmax2 = (torch.rand((num_groups,), dtype=torch.float32, device="cpu") + 1.0).to(torch.float16)
+    # code2 (float16: mock 256 elements)
+    code2 = (torch.rand((256,), dtype=torch.float32, device="cpu") + 1.0).to(torch.float16)
+    
+    offset_val = 0.0
+    
+    # æŒ‰ç…§å…¬å¼åœ¨ Python ç«¯æ¨¡æ‹Ÿè§£é‡åŒ–è®¡ç®—å‡º Ground Truth (fp16)
+    print("Calculating Ground Truth in PyTorch...")
+    
+    # NF4 è§„èŒƒè¡¨
+    nf4_table = torch.tensor([
+        -1.0, -0.6961928, -0.52507305, -0.3949171, 
+        -0.28444138, -0.18477343, -0.091050036, 0.0, 
+        0.07958029, 0.1609302, 0.2461123, 0.33791524, 
+        0.44070983, 0.562617, 0.72295684, 1.0
+    ], dtype=torch.float32, device="cpu")
+    
+    # è§£æžå‡º idx0 å’Œ idx1ï¼Œå±•å¼€åˆ° total_elements
+    idx0 = (packed_weights >> 4).to(torch.int64)
+    idx1 = (packed_weights & 0x0F).to(torch.int64)
+    
+    # äº¤å‰åˆå¹¶: [idx0_0, idx1_0, idx0_1, idx1_1, ...]
+    unpacked_idx = torch.empty((total_elements,), dtype=torch.int64, device="cpu")
+    unpacked_idx[0::2] = idx0
+    unpacked_idx[1::2] = idx1
+    
+    # è®¡ç®—æ‰€æœ‰å…ƒç´ çš„å…¨å±€ block_id å’Œ group_id
+    weight_indices = torch.arange(total_elements, device="cpu")
+    block_ids = weight_indices // blocksize
+    group_ids = block_ids // 256
+    
+    # å¯»å€å¹¶è®¡ç®—ç¬¬ä¸€çº§ç¼©æ”¾å› å­ S1 = (code2[absmax_q] * absmax2) + offset
+    absmax_q_val = absmax_q.to(torch.int64)[block_ids]
+    code2_val = code2[absmax_q_val].to(torch.float32)
+    absmax2_val = absmax2[group_ids].to(torch.float32)
+    
+    S1 = (code2_val * absmax2_val) + offset_val
+    
+    # è®¡ç®—æœ€ç»ˆå€¼å¹¶è½¬ä¸º fp16 å­˜å‚¨ (å¦‚æžœæ‚¨ C++ ç«¯ç”¨çš„æ˜¯ bf16, æ­¤å¤„ä¸ºäº†æ ‡å‡†å¯¹æ¯”ç”¨ fp16 ä¿å­˜)
+    # å› ä¸º NumPy/C++ æ ‡å‡†æµéƒ½æ›´å®¹æ˜“è¯»å†™ IEEE fp16
+    ground_truth = (nf4_table[unpacked_idx] * S1).to(torch.float16)
+
+    # ---------------------------------------------
+    # å†™å…¥äºŒè¿›åˆ¶æ–‡ä»¶
+    # ---------------------------------------------
+    import os
+    
+    # 1. å†™å…¥ test_weights.bin
+    bin_path = "test_weights.bin"
+    print(f"Writing packed binaries to {bin_path}...")
+    with open(bin_path, "wb") as f:
+        # Header: num_rows(8) + num_cols(8) + blocksize(4) = 20 bytes
+        f.write(struct.pack("qqi", num_rows, num_cols, blocksize))
+        
+        # Data
+        f.write(packed_weights.numpy().tobytes())
+        f.write(absmax_q.numpy().tobytes())
+        f.write(absmax2.numpy().tobytes())
+        f.write(code2.numpy().tobytes())
+        f.write(struct.pack("f", offset_val))
+        
+    # 2. å†™å…¥ ground_truth.bin
+    gt_path = "ground_truth.bin"
+    print(f"Writing Ground Truth to {gt_path}...")
+    with open(gt_path, "wb") as f:
+        f.write(ground_truth.numpy().tobytes())
+        
+    # 3. å†™å…¥ params.txt
+    params_path = "params.txt"
+    print(f"Writing parameters to {params_path}...")
+    with open(params_path, "w") as f:
+        f.write(f"blocksize = {blocksize}\n")
+        f.write("compute_type = \"bf16\"\n")  # æˆ–è€… fp16 æ ¹æ®æ‚¨çš„å†…æ ¸å®žé™…æƒ…å†µ
+        f.write("target_gpu = \"A100\"\n")
+        
+    print("Done! Files generated:")
+    print(" - test_weights.bin")
+    print(" - ground_truth.bin")
+    print(" - params.txt")
+
+if __name__ == "__main__":
+    num_rows = 4096
+    num_cols = 4096
+    blocksize = 64
+    tensor_shape = (num_rows, num_cols)
+    
+    if torch.cuda.is_available():
+        print(f"CUDA is available. Device: {torch.cuda.get_device_name(0)}")
+        profile_bnb_baseline(tensor_shape, blocksize)
+    else:
+        print("CUDA is NOT available. Skipping BitsAndBytes profiling. Will only generate files.")
+        
+    create_mock_data_and_save(num_rows, num_cols, blocksize)
diff --git a/03_nf4_dequant/trudging/main.cu b/03_nf4_dequant/trudging/main.cu
new file mode 100644
index 00000000..3201d07d
--- /dev/null
+++ b/03_nf4_dequant/trudging/main.cu
@@ -0,0 +1,243 @@
+#include "src/weights_loader.h"
+#include "src/dequantize.c.h"
+#include <iostream>
+#include <vector>
+#include <iomanip>
+#include <fstream>
+#include <cmath>
+#include <string>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+
+// è¾…åŠ©å®ï¼šç”¨äºŽæ£€æŸ¥ CUDA é”™è¯¯
+#define CHECK_CUDA(call)                                                 \
+    do {                                                                 \
+        cudaError_t err = call;                                          \
+        if (err != cudaSuccess) {                                        \
+            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ \
+                      << " code=" << err << " \"" << cudaGetErrorString(err) << "\"" << std::endl; \
+            exit(EXIT_FAILURE);                                          \
+        }                                                                \
+    } while (0)
+
+template <typename T>
+float to_float(T val);
+
+template <>
+float to_float<__nv_bfloat16>(__nv_bfloat16 val) {
+    return __bfloat162float(val);
+}
+
+template <>
+float to_float<__half>(__half val) {
+    return __half2float(val);
+}
+
+template <typename T>
+void run_benchmark_and_check(QuantizedWeights& gt_weights,
+                             const std::vector<uint16_t>& h_ground_truth,
+                             int64_t total_elements,
+                             int blocksize,
+                             const uint8_t* d_packed_weights,
+                             const uint8_t* d_absmax_q,
+                             const uint16_t* d_absmax2,
+                             const uint16_t* d_code2)
+{
+    // 2. æ˜¾å­˜åˆ†é…
+    T* d_output = nullptr;
+    CHECK_CUDA(cudaMalloc(&d_output, total_elements * sizeof(T)));
+
+    // 3. æ€§èƒ½æµ‹é€Ÿ (CUDA Events)
+    cudaEvent_t start, stop;
+    CHECK_CUDA(cudaEventCreate(&start));
+    CHECK_CUDA(cudaEventCreate(&stop));
+
+    std::cout << "\nStarting Warmup..." << std::endl;
+    for (int i = 0; i < 10; ++i) {
+        launch_dequantize_nf4<T>(
+            d_packed_weights, d_absmax_q, d_absmax2, d_code2,
+            gt_weights.offset, d_output, total_elements, blocksize, nullptr
+        );
+    }
+    CHECK_CUDA(cudaDeviceSynchronize());
+
+    std::cout << "Starting Profiling..." << std::endl;
+    int num_runs = 100;
+
+    CHECK_CUDA(cudaEventRecord(start));
+    for (int i = 0; i < num_runs; ++i) {
+        launch_dequantize_nf4<T>(
+            d_packed_weights, d_absmax_q, d_absmax2, d_code2,
+            gt_weights.offset, d_output, total_elements, blocksize, nullptr
+        );
+    }
+    CHECK_CUDA(cudaEventRecord(stop));
+    CHECK_CUDA(cudaEventSynchronize(stop));
+
+    float total_ms = 0.0f;
+    CHECK_CUDA(cudaEventElapsedTime(&total_ms, start, stop));
+    float avg_time_ms = total_ms / num_runs;
+
+    // 4. æœ‰æ•ˆå†…å­˜å¸¦å®½è®¡ç®—
+    double total_bytes = gt_weights.packed_size
+                       + gt_weights.num_blocks
+                       + (gt_weights.num_groups * 2.0)
+                       + (256.0 * 2.0)
+                       + (total_elements * sizeof(T));
+
+    double bandwidth_GBs = (total_bytes / 1e9) / (avg_time_ms / 1000.0);
+
+    std::cout << "\n--- Performance Results ---" << std::endl;
+    std::cout << "Average Execution Time: " << std::fixed << std::setprecision(4) << avg_time_ms << " ms" << std::endl;
+    std::cout << "Effective Bandwidth:    " << std::setprecision(2) << bandwidth_GBs << " GB/s" << std::endl;
+
+    // Requirement 4: Speedup log
+    float baseline_ms = 2.15f;
+    std::cout << "Speedup vs bitsandbytes: " << std::fixed << std::setprecision(2) << (baseline_ms / avg_time_ms) << " x" << std::endl;
+
+    // 5. ç²¾åº¦éªŒè¯ (MAE)
+    std::cout << "\n--- Accuracy Verification ---" << std::endl;
+    std::vector<T> h_output_test(total_elements);
+    CHECK_CUDA(cudaMemcpy(h_output_test.data(), d_output, total_elements * sizeof(T), cudaMemcpyDeviceToHost));
+
+    double total_error = 0.0;
+    float max_error = 0.0f;
+
+    for (int64_t i = 0; i < total_elements; ++i) {
+        __half gt_half;
+        memcpy(&gt_half, &h_ground_truth[i], sizeof(uint16_t));
+        float gt_val = __half2float(gt_half);
+
+        float out_val = to_float<T>(h_output_test[i]);
+
+        float err = std::abs(gt_val - out_val);
+        total_error += err;
+        if (err > max_error) {
+            max_error = err;
+        }
+    }
+
+    double mae = total_error / total_elements;
+    std::cout << "Calculated elements: " << total_elements << std::endl;
+    std::cout << "Mean Absolute Error (MAE): " << std::scientific << mae << std::endl;
+    std::cout << "Max Absolute Error (MaxAE): " << max_error << std::endl;
+
+    if (mae < 1e-2) {
+         std::cout << "=> Accuracy Check PASSED!" << std::endl;
+    } else {
+         std::cout << "=> Accuracy Check WARNING (MAE might be high)" << std::endl;
+    }
+
+    // Requirement 3: Output file writing
+    std::string out_file = "output_weights.bin";
+    std::ofstream f_out(out_file, std::ios::binary);
+    if (f_out.is_open()) {
+        f_out.write(reinterpret_cast<const char*>(h_output_test.data()), total_elements * sizeof(T));
+        f_out.close();
+        std::cout << "Saved dequantized weights to " << out_file << std::endl;
+    } else {
+        std::cerr << "Failed to write " << out_file << "!" << std::endl;
+    }
+
+    // 6. èµ„æºé‡Šæ”¾
+    CHECK_CUDA(cudaEventDestroy(start));
+    CHECK_CUDA(cudaEventDestroy(stop));
+    CHECK_CUDA(cudaFree(d_output));
+}
+
+int main(int argc, char** argv) {
+    std::cout << "Starting NF4 Dequantization Kernel Test..." << std::endl;
+
+    // Requirement 2: Read params.txt to get compute_type
+    std::string compute_type = "bf16"; // default
+    std::ifstream f_params("params.txt");
+    if (f_params.is_open()) {
+        std::string line;
+        while (std::getline(f_params, line)) {
+            if (line.find("compute_type=") != std::string::npos) {
+                compute_type = line.substr(line.find("=") + 1);
+                // remove any carriage return \r if exists
+                if (!compute_type.empty() && compute_type.back() == '\r') {
+                    compute_type.pop_back();
+                }
+            }
+        }
+        f_params.close();
+    } else {
+        std::cout << "Warning: Could not open params.txt. Defaulting to bf16." << std::endl;
+    }
+    std::cout << "Compute Type is set to: " << compute_type << std::endl;
+
+    // 1. è¯»å–é‡åŒ–æƒé‡æ–‡ä»¶
+    std::string weights_file = "test_weights.bin";
+    std::string gt_file = "ground_truth.bin";
+
+    std::cout << "Loading weights from " << weights_file << "..." << std::endl;
+    QuantizedWeights gt_weights;
+    try {
+        gt_weights = load_weights(weights_file);
+    } catch (const std::exception& e) {
+        std::cerr << "Failed to load weights: " << e.what() << std::endl;
+        return -1;
+    }
+
+    int64_t num_rows = gt_weights.num_rows;
+    int64_t num_cols = gt_weights.num_cols;
+    int blocksize = gt_weights.block_size;
+
+    int64_t total_elements = num_rows * num_cols;
+    int64_t num_blocks = gt_weights.num_blocks;
+    int64_t num_groups = gt_weights.num_groups;
+    int64_t packed_size = gt_weights.packed_size;
+
+    std::cout << "Configuration:" << std::endl;
+    std::cout << "  Matrix: " << num_rows << " x " << num_cols << " (" << total_elements << " elements)" << std::endl;
+    std::cout << "  Blocksize: " << blocksize << std::endl;
+    std::cout << "  Num Blocks: " << num_blocks << std::endl;
+    std::cout << "  Num Groups: " << num_groups << std::endl;
+    std::cout << "  Packed Size: " << packed_size << " bytes" << std::endl;
+
+    // è¯»å– Ground Truth
+    std::cout << "Loading ground truth from " << gt_file << "..." << std::endl;
+    std::vector<uint16_t> h_ground_truth(total_elements); // store as fp16 bits
+    std::ifstream f_gt(gt_file, std::ios::binary);
+    if (!f_gt.is_open()) {
+        std::cerr << "Failed to open " << gt_file << std::endl;
+        return -1;
+    }
+    f_gt.read(reinterpret_cast<char*>(h_ground_truth.data()), total_elements * sizeof(uint16_t));
+    f_gt.close();
+
+    uint8_t *d_packed_weights, *d_absmax_q;
+    uint16_t *d_absmax2, *d_code2;
+
+    CHECK_CUDA(cudaMalloc(&d_packed_weights, packed_size * sizeof(uint8_t)));
+    CHECK_CUDA(cudaMalloc(&d_absmax_q, num_blocks * sizeof(uint8_t)));
+    CHECK_CUDA(cudaMalloc(&d_absmax2, num_groups * sizeof(uint16_t)));
+    CHECK_CUDA(cudaMalloc(&d_code2, 256 * sizeof(uint16_t)));
+
+    CHECK_CUDA(cudaMemcpy(d_packed_weights, gt_weights.packed_weights.get(), packed_size * sizeof(uint8_t), cudaMemcpyHostToDevice));
+    CHECK_CUDA(cudaMemcpy(d_absmax_q, gt_weights.absmax_q.get(), num_blocks * sizeof(uint8_t), cudaMemcpyHostToDevice));
+    CHECK_CUDA(cudaMemcpy(d_absmax2, gt_weights.absmax2.get(), num_groups * sizeof(uint16_t), cudaMemcpyHostToDevice));
+    CHECK_CUDA(cudaMemcpy(d_code2, gt_weights.code2.get(), 256 * sizeof(uint16_t), cudaMemcpyHostToDevice));
+
+    if (compute_type == "fp16") {
+        run_benchmark_and_check<__half>(
+            gt_weights, h_ground_truth, total_elements, blocksize,
+            d_packed_weights, d_absmax_q, d_absmax2, d_code2
+        );
+    } else {
+        run_benchmark_and_check<__nv_bfloat16>(
+            gt_weights, h_ground_truth, total_elements, blocksize,
+            d_packed_weights, d_absmax_q, d_absmax2, d_code2
+        );
+    }
+
+    CHECK_CUDA(cudaFree(d_packed_weights));
+    CHECK_CUDA(cudaFree(d_absmax_q));
+    CHECK_CUDA(cudaFree(d_absmax2));
+    CHECK_CUDA(cudaFree(d_code2));
+
+    std::cout << "\nDone!" << std::endl;
+    return 0;
+}
\ No newline at end of file
diff --git a/03_nf4_dequant/trudging/main.maca b/03_nf4_dequant/trudging/main.maca
new file mode 100644
index 00000000..35e73085
--- /dev/null
+++ b/03_nf4_dequant/trudging/main.maca
@@ -0,0 +1,243 @@
+#include "src/weights_loader.h"
+#include "src/dequantize.mc.h"
+#include <iostream>
+#include <vector>
+#include <iomanip>
+#include <fstream>
+#include <cmath>
+#include <string>
+#include <common/maca_fp16.h>
+/* #include <common/maca_bfloat16.h> */
+
+// è¾…åŠ©å®ï¼šç”¨äºŽæ£€æŸ¥ CUDA é”™è¯¯
+#define CHECK_CUDA(call)                                                 \
+    do {                                                                 \
+        mcError_t err = call;                                          \
+        if (err != mcSuccess) {                                        \
+            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ \
+                      << " code=" << err << " \"" << mcGetErrorString(err) << "\"" << std::endl; \
+            exit(EXIT_FAILURE);                                          \
+        }                                                                \
+    } while (0)
+
+template <typename T>
+float to_float(T val);
+
+template <>
+float to_float<__maca_bfloat16>(__maca_bfloat16 val) {
+    return __bfloat162float(val);
+}
+
+template <>
+float to_float<__half>(__half val) {
+    return __half2float(val);
+}
+
+template <typename T>
+void run_benchmark_and_check(QuantizedWeights& gt_weights,
+                             const std::vector<uint16_t>& h_ground_truth,
+                             int64_t total_elements,
+                             int blocksize,
+                             const uint8_t* d_packed_weights,
+                             const uint8_t* d_absmax_q,
+                             const uint16_t* d_absmax2,
+                             const uint16_t* d_code2)
+{
+    // 2. æ˜¾å­˜åˆ†é…
+    T* d_output = nullptr;
+    CHECK_CUDA(mcMalloc(&d_output, total_elements * sizeof(T)));
+
+    // 3. æ€§èƒ½æµ‹é€Ÿ (CUDA Events)
+    mcEvent_t start, stop;
+    CHECK_CUDA(mcEventCreate(&start));
+    CHECK_CUDA(mcEventCreate(&stop));
+
+    std::cout << "\nStarting Warmup..." << std::endl;
+    for (int i = 0; i < 10; ++i) {
+        launch_dequantize_nf4<T>(
+            d_packed_weights, d_absmax_q, d_absmax2, d_code2,
+            gt_weights.offset, d_output, total_elements, blocksize, nullptr
+        );
+    }
+    CHECK_CUDA(mcDeviceSynchronize());
+
+    std::cout << "Starting Profiling..." << std::endl;
+    int num_runs = 100;
+
+    CHECK_CUDA(mcEventRecord(start));
+    for (int i = 0; i < num_runs; ++i) {
+        launch_dequantize_nf4<T>(
+            d_packed_weights, d_absmax_q, d_absmax2, d_code2,
+            gt_weights.offset, d_output, total_elements, blocksize, nullptr
+        );
+    }
+    CHECK_CUDA(mcEventRecord(stop));
+    CHECK_CUDA(mcEventSynchronize(stop));
+
+    float total_ms = 0.0f;
+    CHECK_CUDA(mcEventElapsedTime(&total_ms, start, stop));
+    float avg_time_ms = total_ms / num_runs;
+
+    // 4. æœ‰æ•ˆå†…å­˜å¸¦å®½è®¡ç®—
+    double total_bytes = gt_weights.packed_size
+                       + gt_weights.num_blocks
+                       + (gt_weights.num_groups * 2.0)
+                       + (256.0 * 2.0)
+                       + (total_elements * sizeof(T));
+
+    double bandwidth_GBs = (total_bytes / 1e9) / (avg_time_ms / 1000.0);
+
+    std::cout << "\n--- Performance Results ---" << std::endl;
+    std::cout << "Average Execution Time: " << std::fixed << std::setprecision(4) << avg_time_ms << " ms" << std::endl;
+    std::cout << "Effective Bandwidth:    " << std::setprecision(2) << bandwidth_GBs << " GB/s" << std::endl;
+
+    // Requirement 4: Speedup log
+    float baseline_ms = 2.15f;
+    std::cout << "Speedup vs bitsandbytes: " << std::fixed << std::setprecision(2) << (baseline_ms / avg_time_ms) << " x" << std::endl;
+
+    // 5. ç²¾åº¦éªŒè¯ (MAE)
+    std::cout << "\n--- Accuracy Verification ---" << std::endl;
+    std::vector<T> h_output_test(total_elements);
+    CHECK_CUDA(mcMemcpy(h_output_test.data(), d_output, total_elements * sizeof(T), mcMemcpyDeviceToHost));
+
+    double total_error = 0.0;
+    float max_error = 0.0f;
+
+    for (int64_t i = 0; i < total_elements; ++i) {
+        __half gt_half;
+        memcpy(&gt_half, &h_ground_truth[i], sizeof(uint16_t));
+        float gt_val = __half2float(gt_half);
+
+        float out_val = to_float<T>(h_output_test[i]);
+
+        float err = std::abs(gt_val - out_val);
+        total_error += err;
+        if (err > max_error) {
+            max_error = err;
+        }
+    }
+
+    double mae = total_error / total_elements;
+    std::cout << "Calculated elements: " << total_elements << std::endl;
+    std::cout << "Mean Absolute Error (MAE): " << std::scientific << mae << std::endl;
+    std::cout << "Max Absolute Error (MaxAE): " << max_error << std::endl;
+
+    if (mae < 1e-2) {
+         std::cout << "=> Accuracy Check PASSED!" << std::endl;
+    } else {
+         std::cout << "=> Accuracy Check WARNING (MAE might be high)" << std::endl;
+    }
+
+    // Requirement 3: Output file writing
+    std::string out_file = "output_weights.bin";
+    std::ofstream f_out(out_file, std::ios::binary);
+    if (f_out.is_open()) {
+        f_out.write(reinterpret_cast<const char*>(h_output_test.data()), total_elements * sizeof(T));
+        f_out.close();
+        std::cout << "Saved dequantized weights to " << out_file << std::endl;
+    } else {
+        std::cerr << "Failed to write " << out_file << "!" << std::endl;
+    }
+
+    // 6. èµ„æºé‡Šæ”¾
+    CHECK_CUDA(mcEventDestroy(start));
+    CHECK_CUDA(mcEventDestroy(stop));
+    CHECK_CUDA(mcFree(d_output));
+}
+
+int main(int argc, char** argv) {
+    std::cout << "Starting NF4 Dequantization Kernel Test..." << std::endl;
+
+    // Requirement 2: Read params.txt to get compute_type
+    std::string compute_type = "bf16"; // default
+    std::ifstream f_params("params.txt");
+    if (f_params.is_open()) {
+        std::string line;
+        while (std::getline(f_params, line)) {
+            if (line.find("compute_type=") != std::string::npos) {
+                compute_type = line.substr(line.find("=") + 1);
+                // remove any carriage return \r if exists
+                if (!compute_type.empty() && compute_type.back() == '\r') {
+                    compute_type.pop_back();
+                }
+            }
+        }
+        f_params.close();
+    } else {
+        std::cout << "Warning: Could not open params.txt. Defaulting to bf16." << std::endl;
+    }
+    std::cout << "Compute Type is set to: " << compute_type << std::endl;
+
+    // 1. è¯»å–é‡åŒ–æƒé‡æ–‡ä»¶
+    std::string weights_file = "test_weights.bin";
+    std::string gt_file = "ground_truth.bin";
+
+    std::cout << "Loading weights from " << weights_file << "..." << std::endl;
+    QuantizedWeights gt_weights;
+    try {
+        gt_weights = load_weights(weights_file);
+    } catch (const std::exception& e) {
+        std::cerr << "Failed to load weights: " << e.what() << std::endl;
+        return -1;
+    }
+
+    int64_t num_rows = gt_weights.num_rows;
+    int64_t num_cols = gt_weights.num_cols;
+    int blocksize = gt_weights.block_size;
+
+    int64_t total_elements = num_rows * num_cols;
+    int64_t num_blocks = gt_weights.num_blocks;
+    int64_t num_groups = gt_weights.num_groups;
+    int64_t packed_size = gt_weights.packed_size;
+
+    std::cout << "Configuration:" << std::endl;
+    std::cout << "  Matrix: " << num_rows << " x " << num_cols << " (" << total_elements << " elements)" << std::endl;
+    std::cout << "  Blocksize: " << blocksize << std::endl;
+    std::cout << "  Num Blocks: " << num_blocks << std::endl;
+    std::cout << "  Num Groups: " << num_groups << std::endl;
+    std::cout << "  Packed Size: " << packed_size << " bytes" << std::endl;
+
+    // è¯»å– Ground Truth
+    std::cout << "Loading ground truth from " << gt_file << "..." << std::endl;
+    std::vector<uint16_t> h_ground_truth(total_elements); // store as fp16 bits
+    std::ifstream f_gt(gt_file, std::ios::binary);
+    if (!f_gt.is_open()) {
+        std::cerr << "Failed to open " << gt_file << std::endl;
+        return -1;
+    }
+    f_gt.read(reinterpret_cast<char*>(h_ground_truth.data()), total_elements * sizeof(uint16_t));
+    f_gt.close();
+
+    uint8_t *d_packed_weights, *d_absmax_q;
+    uint16_t *d_absmax2, *d_code2;
+
+    CHECK_CUDA(mcMalloc(&d_packed_weights, packed_size * sizeof(uint8_t)));
+    CHECK_CUDA(mcMalloc(&d_absmax_q, num_blocks * sizeof(uint8_t)));
+    CHECK_CUDA(mcMalloc(&d_absmax2, num_groups * sizeof(uint16_t)));
+    CHECK_CUDA(mcMalloc(&d_code2, 256 * sizeof(uint16_t)));
+
+    CHECK_CUDA(mcMemcpy(d_packed_weights, gt_weights.packed_weights.get(), packed_size * sizeof(uint8_t), mcMemcpyHostToDevice));
+    CHECK_CUDA(mcMemcpy(d_absmax_q, gt_weights.absmax_q.get(), num_blocks * sizeof(uint8_t), mcMemcpyHostToDevice));
+    CHECK_CUDA(mcMemcpy(d_absmax2, gt_weights.absmax2.get(), num_groups * sizeof(uint16_t), mcMemcpyHostToDevice));
+    CHECK_CUDA(mcMemcpy(d_code2, gt_weights.code2.get(), 256 * sizeof(uint16_t), mcMemcpyHostToDevice));
+
+    if (compute_type == "fp16") {
+        run_benchmark_and_check<__half>(
+            gt_weights, h_ground_truth, total_elements, blocksize,
+            d_packed_weights, d_absmax_q, d_absmax2, d_code2
+        );
+    } else {
+        run_benchmark_and_check<__maca_bfloat16>(
+            gt_weights, h_ground_truth, total_elements, blocksize,
+            d_packed_weights, d_absmax_q, d_absmax2, d_code2
+        );
+    }
+
+    CHECK_CUDA(mcFree(d_packed_weights));
+    CHECK_CUDA(mcFree(d_absmax_q));
+    CHECK_CUDA(mcFree(d_absmax2));
+    CHECK_CUDA(mcFree(d_code2));
+
+    std::cout << "\nDone!" << std::endl;
+    return 0;
+}
\ No newline at end of file
diff --git a/03_nf4_dequant/trudging/main.mu b/03_nf4_dequant/trudging/main.mu
new file mode 100644
index 00000000..fc88ed9a
--- /dev/null
+++ b/03_nf4_dequant/trudging/main.mu
@@ -0,0 +1,243 @@
+#include "src/weights_loader.h"
+#include "src/dequantize.m.h"
+#include <iostream>
+#include <vector>
+#include <iomanip>
+#include <fstream>
+#include <cmath>
+#include <string>
+#include <musa_fp16.h>
+#include <musa_bf16.h>
+
+// è¾…åŠ©å®ï¼šç”¨äºŽæ£€æŸ¥ MUSA é”™è¯¯
+#define CHECK_CUDA(call)                                                 \
+    do {                                                                 \
+        musaError_t err = call;                                          \
+        if (err != musaSuccess) {                                        \
+            std::cerr << "MUSA error at " << __FILE__ << ":" << __LINE__ \
+                      << " code=" << err << " \"" << musaGetErrorString(err) << "\"" << std::endl; \
+            exit(EXIT_FAILURE);                                          \
+        }                                                                \
+    } while (0)
+
+template <typename T>
+float to_float(T val);
+
+template <>
+float to_float<__mt_bfloat16>(__mt_bfloat16 val) {
+    return float(val);
+}
+
+template <>
+float to_float<__half>(__half val) {
+    return __half2float(val);
+}
+
+template <typename T>
+void run_benchmark_and_check(QuantizedWeights& gt_weights,
+                             const std::vector<uint16_t>& h_ground_truth,
+                             int64_t total_elements,
+                             int blocksize,
+                             const uint8_t* d_packed_weights,
+                             const uint8_t* d_absmax_q,
+                             const uint16_t* d_absmax2,
+                             const uint16_t* d_code2)
+{
+    // 2. æ˜¾å­˜åˆ†é…
+    T* d_output = nullptr;
+    CHECK_CUDA(musaMalloc(&d_output, total_elements * sizeof(T)));
+
+    // 3. æ€§èƒ½æµ‹é€Ÿ (MUSA Events)
+    musaEvent_t start, stop;
+    CHECK_CUDA(musaEventCreate(&start));
+    CHECK_CUDA(musaEventCreate(&stop));
+
+    std::cout << "\nStarting Warmup..." << std::endl;
+    for (int i = 0; i < 10; ++i) {
+        launch_dequantize_nf4<T>(
+            d_packed_weights, d_absmax_q, d_absmax2, d_code2,
+            gt_weights.offset, d_output, total_elements, blocksize, nullptr
+        );
+    }
+    CHECK_CUDA(musaDeviceSynchronize());
+
+    std::cout << "Starting Profiling..." << std::endl;
+    int num_runs = 100;
+
+    CHECK_CUDA(musaEventRecord(start));
+    for (int i = 0; i < num_runs; ++i) {
+        launch_dequantize_nf4<T>(
+            d_packed_weights, d_absmax_q, d_absmax2, d_code2,
+            gt_weights.offset, d_output, total_elements, blocksize, nullptr
+        );
+    }
+    CHECK_CUDA(musaEventRecord(stop));
+    CHECK_CUDA(musaEventSynchronize(stop));
+
+    float total_ms = 0.0f;
+    CHECK_CUDA(musaEventElapsedTime(&total_ms, start, stop));
+    float avg_time_ms = total_ms / num_runs;
+
+    // 4. æœ‰æ•ˆå†…å­˜å¸¦å®½è®¡ç®—
+    double total_bytes = gt_weights.packed_size
+                       + gt_weights.num_blocks
+                       + (gt_weights.num_groups * 2.0)
+                       + (256.0 * 2.0)
+                       + (total_elements * sizeof(T));
+
+    double bandwidth_GBs = (total_bytes / 1e9) / (avg_time_ms / 1000.0);
+
+    std::cout << "\n--- Performance Results ---" << std::endl;
+    std::cout << "Average Execution Time: " << std::fixed << std::setprecision(4) << avg_time_ms << " ms" << std::endl;
+    std::cout << "Effective Bandwidth:    " << std::setprecision(2) << bandwidth_GBs << " GB/s" << std::endl;
+
+    // Requirement 4: Speedup log
+    float baseline_ms = 2.15f;
+    std::cout << "Speedup vs bitsandbytes: " << std::fixed << std::setprecision(2) << (baseline_ms / avg_time_ms) << " x" << std::endl;
+
+    // 5. ç²¾åº¦éªŒè¯ (MAE)
+    std::cout << "\n--- Accuracy Verification ---" << std::endl;
+    std::vector<T> h_output_test(total_elements);
+    CHECK_CUDA(musaMemcpy(h_output_test.data(), d_output, total_elements * sizeof(T), musaMemcpyDeviceToHost));
+
+    double total_error = 0.0;
+    float max_error = 0.0f;
+
+    for (int64_t i = 0; i < total_elements; ++i) {
+        __half gt_half;
+        memcpy(&gt_half, &h_ground_truth[i], sizeof(uint16_t));
+        float gt_val = __half2float(gt_half);
+
+        float out_val = to_float<T>(h_output_test[i]);
+
+        float err = std::abs(gt_val - out_val);
+        total_error += err;
+        if (err > max_error) {
+            max_error = err;
+        }
+    }
+
+    double mae = total_error / total_elements;
+    std::cout << "Calculated elements: " << total_elements << std::endl;
+    std::cout << "Mean Absolute Error (MAE): " << std::scientific << mae << std::endl;
+    std::cout << "Max Absolute Error (MaxAE): " << max_error << std::endl;
+
+    if (mae < 1e-2) {
+         std::cout << "=> Accuracy Check PASSED!" << std::endl;
+    } else {
+         std::cout << "=> Accuracy Check WARNING (MAE might be high)" << std::endl;
+    }
+
+    // Requirement 3: Output file writing
+    std::string out_file = "output_weights.bin";
+    std::ofstream f_out(out_file, std::ios::binary);
+    if (f_out.is_open()) {
+        f_out.write(reinterpret_cast<const char*>(h_output_test.data()), total_elements * sizeof(T));
+        f_out.close();
+        std::cout << "Saved dequantized weights to " << out_file << std::endl;
+    } else {
+        std::cerr << "Failed to write " << out_file << "!" << std::endl;
+    }
+
+    // 6. èµ„æºé‡Šæ”¾
+    CHECK_CUDA(musaEventDestroy(start));
+    CHECK_CUDA(musaEventDestroy(stop));
+    CHECK_CUDA(musaFree(d_output));
+}
+
+int main(int argc, char** argv) {
+    std::cout << "Starting NF4 Dequantization Kernel Test..." << std::endl;
+
+    // Requirement 2: Read params.txt to get compute_type
+    std::string compute_type = "bf16"; // default
+    std::ifstream f_params("params.txt");
+    if (f_params.is_open()) {
+        std::string line;
+        while (std::getline(f_params, line)) {
+            if (line.find("compute_type=") != std::string::npos) {
+                compute_type = line.substr(line.find("=") + 1);
+                // remove any carriage return \r if exists
+                if (!compute_type.empty() && compute_type.back() == '\r') {
+                    compute_type.pop_back();
+                }
+            }
+        }
+        f_params.close();
+    } else {
+        std::cout << "Warning: Could not open params.txt. Defaulting to bf16." << std::endl;
+    }
+    std::cout << "Compute Type is set to: " << compute_type << std::endl;
+
+    // 1. è¯»å–é‡åŒ–æƒé‡æ–‡ä»¶
+    std::string weights_file = "test_weights.bin";
+    std::string gt_file = "ground_truth.bin";
+
+    std::cout << "Loading weights from " << weights_file << "..." << std::endl;
+    QuantizedWeights gt_weights;
+    try {
+        gt_weights = load_weights(weights_file);
+    } catch (const std::exception& e) {
+        std::cerr << "Failed to load weights: " << e.what() << std::endl;
+        return -1;
+    }
+
+    int64_t num_rows = gt_weights.num_rows;
+    int64_t num_cols = gt_weights.num_cols;
+    int blocksize = gt_weights.block_size;
+
+    int64_t total_elements = num_rows * num_cols;
+    int64_t num_blocks = gt_weights.num_blocks;
+    int64_t num_groups = gt_weights.num_groups;
+    int64_t packed_size = gt_weights.packed_size;
+
+    std::cout << "Configuration:" << std::endl;
+    std::cout << "  Matrix: " << num_rows << " x " << num_cols << " (" << total_elements << " elements)" << std::endl;
+    std::cout << "  Blocksize: " << blocksize << std::endl;
+    std::cout << "  Num Blocks: " << num_blocks << std::endl;
+    std::cout << "  Num Groups: " << num_groups << std::endl;
+    std::cout << "  Packed Size: " << packed_size << " bytes" << std::endl;
+
+    // è¯»å– Ground Truth
+    std::cout << "Loading ground truth from " << gt_file << "..." << std::endl;
+    std::vector<uint16_t> h_ground_truth(total_elements); // store as fp16 bits
+    std::ifstream f_gt(gt_file, std::ios::binary);
+    if (!f_gt.is_open()) {
+        std::cerr << "Failed to open " << gt_file << std::endl;
+        return -1;
+    }
+    f_gt.read(reinterpret_cast<char*>(h_ground_truth.data()), total_elements * sizeof(uint16_t));
+    f_gt.close();
+
+    uint8_t *d_packed_weights, *d_absmax_q;
+    uint16_t *d_absmax2, *d_code2;
+
+    CHECK_CUDA(musaMalloc(&d_packed_weights, packed_size * sizeof(uint8_t)));
+    CHECK_CUDA(musaMalloc(&d_absmax_q, num_blocks * sizeof(uint8_t)));
+    CHECK_CUDA(musaMalloc(&d_absmax2, num_groups * sizeof(uint16_t)));
+    CHECK_CUDA(musaMalloc(&d_code2, 256 * sizeof(uint16_t)));
+
+    CHECK_CUDA(musaMemcpy(d_packed_weights, gt_weights.packed_weights.get(), packed_size * sizeof(uint8_t), musaMemcpyHostToDevice));
+    CHECK_CUDA(musaMemcpy(d_absmax_q, gt_weights.absmax_q.get(), num_blocks * sizeof(uint8_t), musaMemcpyHostToDevice));
+    CHECK_CUDA(musaMemcpy(d_absmax2, gt_weights.absmax2.get(), num_groups * sizeof(uint16_t), musaMemcpyHostToDevice));
+    CHECK_CUDA(musaMemcpy(d_code2, gt_weights.code2.get(), 256 * sizeof(uint16_t), musaMemcpyHostToDevice));
+
+    if (compute_type == "fp16") {
+        run_benchmark_and_check<__half>(
+            gt_weights, h_ground_truth, total_elements, blocksize,
+            d_packed_weights, d_absmax_q, d_absmax2, d_code2
+        );
+    } else {
+        run_benchmark_and_check<__mt_bfloat16>(
+            gt_weights, h_ground_truth, total_elements, blocksize,
+            d_packed_weights, d_absmax_q, d_absmax2, d_code2
+        );
+    }
+
+    CHECK_CUDA(musaFree(d_packed_weights));
+    CHECK_CUDA(musaFree(d_absmax_q));
+    CHECK_CUDA(musaFree(d_absmax2));
+    CHECK_CUDA(musaFree(d_code2));
+
+    std::cout << "\nDone!" << std::endl;
+    return 0;
+}
\ No newline at end of file
diff --git a/03_nf4_dequant/trudging/nf4_report/nf4_report.md b/03_nf4_dequant/trudging/nf4_report/nf4_report.md
new file mode 100644
index 00000000..e69de29b
diff --git a/03_nf4_dequant/trudging/nf4_report/nf4_report.pdf b/03_nf4_dequant/trudging/nf4_report/nf4_report.pdf
new file mode 100644
index 00000000..91da3b12
Binary files /dev/null and b/03_nf4_dequant/trudging/nf4_report/nf4_report.pdf differ
diff --git "a/03_nf4_dequant/trudging/nf4_report/\345\244\251\346\225\260.png" "b/03_nf4_dequant/trudging/nf4_report/\345\244\251\346\225\260.png"
new file mode 100644
index 00000000..7a445554
Binary files /dev/null and "b/03_nf4_dequant/trudging/nf4_report/\345\244\251\346\225\260.png" differ
diff --git "a/03_nf4_dequant/trudging/nf4_report/\346\221\251\345\260\224.png" "b/03_nf4_dequant/trudging/nf4_report/\346\221\251\345\260\224.png"
new file mode 100644
index 00000000..6c4f1462
Binary files /dev/null and "b/03_nf4_dequant/trudging/nf4_report/\346\221\251\345\260\224.png" differ
diff --git "a/03_nf4_dequant/trudging/nf4_report/\346\262\220\346\233\246.png" "b/03_nf4_dequant/trudging/nf4_report/\346\262\220\346\233\246.png"
new file mode 100644
index 00000000..dbfe3ad8
Binary files /dev/null and "b/03_nf4_dequant/trudging/nf4_report/\346\262\220\346\233\246.png" differ
diff --git "a/03_nf4_dequant/trudging/nf4_report/\350\213\261\344\274\237\350\276\276new.png" "b/03_nf4_dequant/trudging/nf4_report/\350\213\261\344\274\237\350\276\276new.png"
new file mode 100644
index 00000000..2b1af40d
Binary files /dev/null and "b/03_nf4_dequant/trudging/nf4_report/\350\213\261\344\274\237\350\276\276new.png" differ
diff --git a/03_nf4_dequant/trudging/run_on_a100.sh b/03_nf4_dequant/trudging/run_on_a100.sh
new file mode 100644
index 00000000..b7fdaf90
--- /dev/null
+++ b/03_nf4_dequant/trudging/run_on_a100.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+#SBATCH --job-name=nf4_dequant              # ä»»åŠ¡å
+#SBATCH --output=result_%j.log              # æ ‡å‡†è¾“å‡ºæ–‡ä»¶
+#SBATCH --error=error_%j.log                # æ ‡å‡†é”™è¯¯è¾“å‡ºæ–‡ä»¶
+#SBATCH --partition=nvidia                  # åˆ†åŒºå
+#SBATCH --nodes=1                           # èŠ‚ç‚¹æ•°
+#SBATCH --ntasks=1                          # æ€»ä»»åŠ¡æ•°
+#SBATCH --cpus-per-task=16                  # æ¯ä¸ªä»»åŠ¡éœ€è¦çš„ CPU æ ¸å¿ƒæ•°
+#SBATCH --gres=gpu:nvidia:1                 # è¯·æ±‚ 1 å— A100 GPU (å¯¹åº”æµ‹è¯•å³å¯)
+#SBATCH --mem=64G                           # è¯·æ±‚çš„å†…å­˜
+#SBATCH --time=00:10:00                     # è¿è¡Œæ—¶é—´ä¸Šé™ (10åˆ†é’Ÿè¶³å¤Ÿ)
+
+# 1. è®¾ç½® CUDA çŽ¯å¢ƒå˜é‡
+export PATH=/usr/local/cuda/bin:$PATH
+export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+
+echo "============ Starting Compilation ============"
+# ä½¿ç”¨ nvcc ç¼–è¯‘ä»£ç ã€‚å¹³å°ä¸º A100ï¼Œå›ºå®šæž¶æž„ä¸º sm_80
+nvcc -O3 -lineinfo --ptxas-options=-v -use_fast_math -arch=sm_80 main.cu src/dequantize.cu -o nf4_dequantizer
+
+if [ $? -eq 0 ]; then
+    echo "============ Compilation Success ============"
+    echo "============ Running Kernel ============"
+    # 2. è¿è¡Œç®—å­
+    srun ./nf4_dequantizer
+else
+    echo "============ Compilation Failed ============"
+fi
diff --git a/03_nf4_dequant/trudging/src/dequantize.c.h b/03_nf4_dequant/trudging/src/dequantize.c.h
new file mode 100644
index 00000000..e469ad05
--- /dev/null
+++ b/03_nf4_dequant/trudging/src/dequantize.c.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <cuda_runtime.h>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cstdint>
+
+template <typename T>
+__global__ void dequantize_nf4_kernel(
+    const uint8_t* __restrict__ packed_weights,
+    const uint8_t* __restrict__ absmax_q,
+    const uint16_t* __restrict__ absmax2,
+    const uint16_t* __restrict__ code2,
+    float offset,
+    T* __restrict__ output,
+    int64_t total_elements,
+    int blocksize
+);
+
+template <typename T>
+void launch_dequantize_nf4(
+    const uint8_t* d_packed_weights,
+    const uint8_t* d_absmax_q,
+    const uint16_t* d_absmax2,
+    const uint16_t* d_code2,
+    float offset,
+    T* d_output,
+    int64_t total_elements,
+    int blocksize,
+    cudaStream_t stream = nullptr
+);
\ No newline at end of file
diff --git a/03_nf4_dequant/trudging/src/dequantize.cu b/03_nf4_dequant/trudging/src/dequantize.cu
new file mode 100644
index 00000000..e9dbcb6c
--- /dev/null
+++ b/03_nf4_dequant/trudging/src/dequantize.cu
@@ -0,0 +1,145 @@
+ï»¿#include "dequantize.c.h"
+
+// é€‚é…ä¸åŒæ•°æ®ç±»åž‹çš„ float2T è¾…åŠ©å‡½æ•°
+template <typename T>
+__device__ inline T float2T(float v);
+
+template <>
+__device__ inline __nv_bfloat16 float2T<__nv_bfloat16>(float v) {
+    return __float2bfloat16(v);
+}
+
+template <>
+__device__ inline __half float2T<__half>(float v) {
+    return __float2half_rn(v);
+}
+
+// é’ˆå¯¹ __nv_bfloat16 çš„ç‰¹åŒ–å‘é‡åŒ–æ‰“åŒ…
+__device__ inline uint32_t pack_two_elements(__nv_bfloat16 w0, __nv_bfloat16 w1) {
+    __nv_bfloat162 packed = __halves2bfloat162(w0, w1);
+    return *reinterpret_cast<uint32_t*>(&packed);
+}
+
+// é’ˆå¯¹ __half (fp16) çš„ç‰¹åŒ–å‘é‡åŒ–æ‰“åŒ…
+__device__ inline uint32_t pack_two_elements(__half w0, __half w1) {
+    __half2 packed = __floats2half2_rn(__half2float(w0), __half2float(w1));
+    return *reinterpret_cast<uint32_t*>(&packed);
+}
+
+// 1. NF4 å¸¸é‡è¡¨ (Constant Memory)
+__constant__ float c_nf4_table[16] = {
+    -1.0f, -0.6961928f, -0.52507305f, -0.3949171f,
+    -0.28444138f, -0.18477343f, -0.091050036f, 0.0f,
+    0.07958029f, 0.1609302f, 0.2461123f, 0.33791524f,
+    0.44070983f, 0.562617f, 0.72295684f, 1.0f
+};
+
+// 2. Kernel å‡½æ•°å®žçŽ°
+template <typename T>
+__global__ void dequantize_nf4_kernel(
+    const uint8_t* __restrict__ packed_weights,
+    const uint8_t* __restrict__ absmax_q,
+    const uint16_t* __restrict__ absmax2,
+    const uint16_t* __restrict__ code2,
+    float offset,
+    T* __restrict__ output,
+    int64_t total_elements,
+    int blocksize)
+{
+    // 3. æ ¸å¿ƒè®¡ç®—é€»è¾‘
+    // æ¯ä¸ªçº¿ç¨‹å¤„ç† 1 ä¸ª uint8_t (å³ 2 ä¸ª 4-bit æƒé‡)
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // è¾¹ç•Œæ£€æŸ¥ï¼šç›´æŽ¥é™åˆ¶çº¿ç¨‹è¯»å– packed_weights çš„èŒƒå›´
+    if (tid >= (total_elements + 1) / 2) {
+        return;
+    }
+
+    // å…¨å±€æƒé‡çš„èµ·å§‹ç´¢å¼•
+    int64_t weight_idx = (int64_t)tid * 2;
+
+    // è¯»å– 1 å­—èŠ‚å¹¶è§£ç é«˜/ä½Ž 4 ä½
+    uint8_t packed = packed_weights[tid];
+    uint8_t idx0 = packed >> 4;           // é«˜ 4 ä½å¯¹åº”ç¬¬ä¸€ä¸ªæƒé‡
+    uint8_t idx1 = packed & 0x0F;         // ä½Ž 4 ä½å¯¹åº”ç¬¬äºŒä¸ªæƒé‡
+
+    // ä¸º w0 è®¡ç®—ç¼©æ”¾å› å­
+    int block_id0 = weight_idx / blocksize;
+    int group_id0 = block_id0 / 256;
+
+    __half code2_half0 = *reinterpret_cast<const __half*>(&code2[absmax_q[block_id0]]);     
+    __half absmax2_half0 = *reinterpret_cast<const __half*>(&absmax2[group_id0]);
+    float S1_0 = (__half2float(code2_half0) * __half2float(absmax2_half0)) + offset;        
+
+    // ä¸º w1 è®¡ç®—ç¼©æ”¾å› å­ (æ³¨æ„é˜²èŒƒ weight_idx + 1 è¶Šç•Œ)
+    float S1_1 = 0.0f;
+    if (weight_idx + 1 < total_elements) {
+        int block_id1 = (weight_idx + 1) / blocksize;
+        int group_id1 = block_id1 / 256;
+        __half code2_half1 = *reinterpret_cast<const __half*>(&code2[absmax_q[block_id1]]); 
+        __half absmax2_half1 = *reinterpret_cast<const __half*>(&absmax2[group_id1]);       
+        S1_1 = (__half2float(code2_half1) * __half2float(absmax2_half1)) + offset;
+    }
+
+    // æŸ¥è¡¨å¹¶è§£é‡åŒ–
+    float w0 = c_nf4_table[idx0] * S1_0;
+    float w1 = c_nf4_table[idx1] * S1_1;
+
+    // 4. å‘é‡åŒ–å†™å…¥ (Packed Store) ä¸Žå°¾éƒ¨è¾¹ç•Œå¤„ç†
+    // è½¬æ¢ä¸ºæŒ‡å®šçš„æµ®ç‚¹ç±»åž‹ (fp16 æˆ– bf16)
+    T out_w0 = float2T<T>(w0);
+    T out_w1 = float2T<T>(w1);
+
+    if (weight_idx + 1 < total_elements) {
+        // æ­£å¸¸æƒ…å†µï¼šåŒ…å« 2 ä¸ªæœ‰æ•ˆæƒé‡ï¼Œä½¿ç”¨å‘é‡åŒ–å†™å…¥
+        uint32_t packed_bits = pack_two_elements(out_w0, out_w1);
+
+        // å¼ºè½¬ä¸º uint32_t è¿›è¡Œä¸€æ¬¡ 32-bit åˆå¹¶è®¿é—®å†™å…¥
+        reinterpret_cast<uint32_t*>(output)[tid] = packed_bits;
+    } else {
+        // å°¾éƒ¨è¾¹ç•Œå¤„ç†ï¼šæ€»å…ƒç´ æ•°æ˜¯å¥‡æ•°ï¼Œå¹¶ä¸”è¿™æ˜¯æœ€åŽä¸€ä¸ªå•å…ƒç´ 
+        // é€€åŒ–ä¸ºæ ‡é‡å†™å…¥ï¼Œé¿å…è¶Šç•Œè®¿é—®
+        output[weight_idx] = out_w0;
+    }
+}
+
+// Host å¯åŠ¨å‡½æ•°
+template <typename T>
+void launch_dequantize_nf4(
+    const uint8_t* d_packed_weights,
+    const uint8_t* d_absmax_q,
+    const uint16_t* d_absmax2,
+    const uint16_t* d_code2,
+    float offset,
+    T* d_output, 
+    int64_t total_elements,
+    int blocksize,
+    cudaStream_t stream)
+{
+    // æ¯ä¸ªçº¿ç¨‹å¤„ç† 2 ä¸ªå…ƒç´ ï¼Œå› æ­¤æ€»çº¿ç¨‹æ•° = ceil(total_elements / 2)
+    int64_t num_threads = (total_elements + 1) / 2;
+
+    // é…ç½® Block å’Œ Grid ç»´åº¦
+    int threads_per_block = 256;
+    int blocks_per_grid = (num_threads + threads_per_block - 1) / threads_per_block;        
+
+    dequantize_nf4_kernel<T><<<blocks_per_grid, threads_per_block, 0, stream>>>(
+        d_packed_weights,
+        d_absmax_q,
+        d_absmax2,
+        d_code2,
+        offset,
+        d_output,
+        total_elements,
+        blocksize
+    );
+}
+
+// æ˜¾å¼å®žä¾‹åŒ–æ¨¡æ¿
+template void launch_dequantize_nf4<__nv_bfloat16>(
+    const uint8_t*, const uint8_t*, const uint16_t*, const uint16_t*,
+    float, __nv_bfloat16*, int64_t, int, cudaStream_t);
+
+template void launch_dequantize_nf4<__half>(
+    const uint8_t*, const uint8_t*, const uint16_t*, const uint16_t*,
+    float, __half*, int64_t, int, cudaStream_t);
\ No newline at end of file
diff --git a/03_nf4_dequant/trudging/src/dequantize.m.h b/03_nf4_dequant/trudging/src/dequantize.m.h
new file mode 100644
index 00000000..9886db0a
--- /dev/null
+++ b/03_nf4_dequant/trudging/src/dequantize.m.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <musa_runtime.h>
+#include <musa_bf16.h>
+#include <musa_fp16.h>
+#include <cstdint>
+
+template <typename T>
+__global__ void dequantize_nf4_kernel(
+    const uint8_t* __restrict__ packed_weights,
+    const uint8_t* __restrict__ absmax_q,
+    const uint16_t* __restrict__ absmax2,
+    const uint16_t* __restrict__ code2,
+    float offset,
+    T* __restrict__ output,
+    int64_t total_elements,
+    int blocksize
+);
+
+template <typename T>
+void launch_dequantize_nf4(
+    const uint8_t* d_packed_weights,
+    const uint8_t* d_absmax_q,
+    const uint16_t* d_absmax2,
+    const uint16_t* d_code2,
+    float offset,
+    T* d_output,
+    int64_t total_elements,
+    int blocksize,
+    musaStream_t stream = nullptr
+);
\ No newline at end of file
diff --git a/03_nf4_dequant/trudging/src/dequantize.maca b/03_nf4_dequant/trudging/src/dequantize.maca
new file mode 100644
index 00000000..08397c51
--- /dev/null
+++ b/03_nf4_dequant/trudging/src/dequantize.maca
@@ -0,0 +1,145 @@
+ï»¿#include "dequantize.mc.h"
+
+// é€‚é…ä¸åŒæ•°æ®ç±»åž‹çš„ float2T è¾…åŠ©å‡½æ•°
+template <typename T>
+__device__ inline T float2T(float v);
+
+template <>
+__device__ inline __maca_bfloat16 float2T<__maca_bfloat16>(float v) {
+    return __float2bfloat16(v);
+}
+
+template <>
+__device__ inline __half float2T<__half>(float v) {
+    return __float2half_rn(v);
+}
+
+// é’ˆå¯¹ __maca_bfloat16 çš„ç‰¹åŒ–å‘é‡åŒ–æ‰“åŒ…
+__device__ inline uint32_t pack_two_elements(__maca_bfloat16 w0, __maca_bfloat16 w1) {
+    __maca_bfloat162 packed = __halves2bfloat162(w0, w1);
+    return *reinterpret_cast<uint32_t*>(&packed);
+}
+
+// é’ˆå¯¹ __half (fp16) çš„ç‰¹åŒ–å‘é‡åŒ–æ‰“åŒ…
+__device__ inline uint32_t pack_two_elements(__half w0, __half w1) {
+    __half2 packed = __floats2half2_rn(__half2float(w0), __half2float(w1));
+    return *reinterpret_cast<uint32_t*>(&packed);
+}
+
+// 1. NF4 å¸¸é‡è¡¨ (Constant Memory)
+__constant__ float c_nf4_table[16] = {
+    -1.0f, -0.6961928f, -0.52507305f, -0.3949171f,
+    -0.28444138f, -0.18477343f, -0.091050036f, 0.0f,
+    0.07958029f, 0.1609302f, 0.2461123f, 0.33791524f,
+    0.44070983f, 0.562617f, 0.72295684f, 1.0f
+};
+
+// 2. Kernel å‡½æ•°å®žçŽ°
+template <typename T>
+__global__ void dequantize_nf4_kernel(
+    const uint8_t* __restrict__ packed_weights,
+    const uint8_t* __restrict__ absmax_q,
+    const uint16_t* __restrict__ absmax2,
+    const uint16_t* __restrict__ code2,
+    float offset,
+    T* __restrict__ output,
+    int64_t total_elements,
+    int blocksize)
+{
+    // 3. æ ¸å¿ƒè®¡ç®—é€»è¾‘
+    // æ¯ä¸ªçº¿ç¨‹å¤„ç† 1 ä¸ª uint8_t (å³ 2 ä¸ª 4-bit æƒé‡)
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // è¾¹ç•Œæ£€æŸ¥ï¼šç›´æŽ¥é™åˆ¶çº¿ç¨‹è¯»å– packed_weights çš„èŒƒå›´
+    if (tid >= (total_elements + 1) / 2) {
+        return;
+    }
+
+    // å…¨å±€æƒé‡çš„èµ·å§‹ç´¢å¼•
+    int64_t weight_idx = (int64_t)tid * 2;
+
+    // è¯»å– 1 å­—èŠ‚å¹¶è§£ç é«˜/ä½Ž 4 ä½
+    uint8_t packed = packed_weights[tid];
+    uint8_t idx0 = packed >> 4;           // é«˜ 4 ä½å¯¹åº”ç¬¬ä¸€ä¸ªæƒé‡
+    uint8_t idx1 = packed & 0x0F;         // ä½Ž 4 ä½å¯¹åº”ç¬¬äºŒä¸ªæƒé‡
+
+    // ä¸º w0 è®¡ç®—ç¼©æ”¾å› å­
+    int block_id0 = weight_idx / blocksize;
+    int group_id0 = block_id0 / 256;
+
+    __half code2_half0 = *reinterpret_cast<const __half*>(&code2[absmax_q[block_id0]]);     
+    __half absmax2_half0 = *reinterpret_cast<const __half*>(&absmax2[group_id0]);
+    float S1_0 = (__half2float(code2_half0) * __half2float(absmax2_half0)) + offset;        
+
+    // ä¸º w1 è®¡ç®—ç¼©æ”¾å› å­ (æ³¨æ„é˜²èŒƒ weight_idx + 1 è¶Šç•Œ)
+    float S1_1 = 0.0f;
+    if (weight_idx + 1 < total_elements) {
+        int block_id1 = (weight_idx + 1) / blocksize;
+        int group_id1 = block_id1 / 256;
+        __half code2_half1 = *reinterpret_cast<const __half*>(&code2[absmax_q[block_id1]]); 
+        __half absmax2_half1 = *reinterpret_cast<const __half*>(&absmax2[group_id1]);       
+        S1_1 = (__half2float(code2_half1) * __half2float(absmax2_half1)) + offset;
+    }
+
+    // æŸ¥è¡¨å¹¶è§£é‡åŒ–
+    float w0 = c_nf4_table[idx0] * S1_0;
+    float w1 = c_nf4_table[idx1] * S1_1;
+
+    // 4. å‘é‡åŒ–å†™å…¥ (Packed Store) ä¸Žå°¾éƒ¨è¾¹ç•Œå¤„ç†
+    // è½¬æ¢ä¸ºæŒ‡å®šçš„æµ®ç‚¹ç±»åž‹ (fp16 æˆ– bf16)
+    T out_w0 = float2T<T>(w0);
+    T out_w1 = float2T<T>(w1);
+
+    if (weight_idx + 1 < total_elements) {
+        // æ­£å¸¸æƒ…å†µï¼šåŒ…å« 2 ä¸ªæœ‰æ•ˆæƒé‡ï¼Œä½¿ç”¨å‘é‡åŒ–å†™å…¥
+        uint32_t packed_bits = pack_two_elements(out_w0, out_w1);
+
+        // å¼ºè½¬ä¸º uint32_t è¿›è¡Œä¸€æ¬¡ 32-bit åˆå¹¶è®¿é—®å†™å…¥
+        reinterpret_cast<uint32_t*>(output)[tid] = packed_bits;
+    } else {
+        // å°¾éƒ¨è¾¹ç•Œå¤„ç†ï¼šæ€»å…ƒç´ æ•°æ˜¯å¥‡æ•°ï¼Œå¹¶ä¸”è¿™æ˜¯æœ€åŽä¸€ä¸ªå•å…ƒç´ 
+        // é€€åŒ–ä¸ºæ ‡é‡å†™å…¥ï¼Œé¿å…è¶Šç•Œè®¿é—®
+        output[weight_idx] = out_w0;
+    }
+}
+
+// Host å¯åŠ¨å‡½æ•°
+template <typename T>
+void launch_dequantize_nf4(
+    const uint8_t* d_packed_weights,
+    const uint8_t* d_absmax_q,
+    const uint16_t* d_absmax2,
+    const uint16_t* d_code2,
+    float offset,
+    T* d_output, 
+    int64_t total_elements,
+    int blocksize,
+    mcStream_t stream)
+{
+    // æ¯ä¸ªçº¿ç¨‹å¤„ç† 2 ä¸ªå…ƒç´ ï¼Œå› æ­¤æ€»çº¿ç¨‹æ•° = ceil(total_elements / 2)
+    int64_t num_threads = (total_elements + 1) / 2;
+
+    // é…ç½® Block å’Œ Grid ç»´åº¦
+    int threads_per_block = 256;
+    int blocks_per_grid = (num_threads + threads_per_block - 1) / threads_per_block;        
+
+    dequantize_nf4_kernel<T><<<blocks_per_grid, threads_per_block, 0, stream>>>(
+        d_packed_weights,
+        d_absmax_q,
+        d_absmax2,
+        d_code2,
+        offset,
+        d_output,
+        total_elements,
+        blocksize
+    );
+}
+
+// æ˜¾å¼å®žä¾‹åŒ–æ¨¡æ¿
+template void launch_dequantize_nf4<__maca_bfloat16>(
+    const uint8_t*, const uint8_t*, const uint16_t*, const uint16_t*,
+    float, __maca_bfloat16*, int64_t, int, mcStream_t);
+
+template void launch_dequantize_nf4<__half>(
+    const uint8_t*, const uint8_t*, const uint16_t*, const uint16_t*,
+    float, __half*, int64_t, int, mcStream_t);
\ No newline at end of file
diff --git a/03_nf4_dequant/trudging/src/dequantize.mc.h b/03_nf4_dequant/trudging/src/dequantize.mc.h
new file mode 100644
index 00000000..733947ab
--- /dev/null
+++ b/03_nf4_dequant/trudging/src/dequantize.mc.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <mcr/mc_runtime.h>
+/* #include <common/maca_bfloat16.h> */
+#include <common/maca_fp16.h>
+#include <cstdint>
+
+template <typename T>
+__global__ void dequantize_nf4_kernel(
+    const uint8_t* __restrict__ packed_weights,
+    const uint8_t* __restrict__ absmax_q,
+    const uint16_t* __restrict__ absmax2,
+    const uint16_t* __restrict__ code2,
+    float offset,
+    T* __restrict__ output,
+    int64_t total_elements,
+    int blocksize
+);
+
+template <typename T>
+void launch_dequantize_nf4(
+    const uint8_t* d_packed_weights,
+    const uint8_t* d_absmax_q,
+    const uint16_t* d_absmax2,
+    const uint16_t* d_code2,
+    float offset,
+    T* d_output,
+    int64_t total_elements,
+    int blocksize,
+    mcStream_t stream = nullptr
+);
\ No newline at end of file
diff --git a/03_nf4_dequant/trudging/src/dequantize.mu b/03_nf4_dequant/trudging/src/dequantize.mu
new file mode 100644
index 00000000..39f9fa10
--- /dev/null
+++ b/03_nf4_dequant/trudging/src/dequantize.mu
@@ -0,0 +1,146 @@
+ï»¿#include "dequantize.m.h"
+
+// é€‚é…ä¸åŒæ•°æ®ç±»åž‹çš„ float2T è¾…åŠ©å‡½æ•°
+template <typename T>
+__device__ inline T float2T(float v);
+
+template <>
+__device__ inline __mt_bfloat16 float2T<__mt_bfloat16>(float v) {
+    return __float2bfloat16(v);
+}
+
+template <>
+__device__ inline __half float2T<__half>(float v) {
+    return __float2half_rn(v);
+}
+
+// é’ˆå¯¹ __mt_bfloat16 çš„ç‰¹åŒ–å‘é‡åŒ–æ‰“åŒ…
+__device__ inline uint32_t pack_two_elements(__mt_bfloat16 w0, __mt_bfloat16 w1) {
+    uint16_t u0 = *reinterpret_cast<uint16_t*>(&w0);
+    uint16_t u1 = *reinterpret_cast<uint16_t*>(&w1);
+    return (static_cast<uint32_t>(u1) << 16) | u0;
+}
+
+// é’ˆå¯¹ __half (fp16) çš„ç‰¹åŒ–å‘é‡åŒ–æ‰“åŒ…
+__device__ inline uint32_t pack_two_elements(__half w0, __half w1) {
+    __half2 packed = __floats2half2_rn(__half2float(w0), __half2float(w1));
+    return *reinterpret_cast<uint32_t*>(&packed);
+}
+
+// 1. NF4 å¸¸é‡è¡¨ (Constant Memory)
+__constant__ float c_nf4_table[16] = {
+    -1.0f, -0.6961928f, -0.52507305f, -0.3949171f,
+    -0.28444138f, -0.18477343f, -0.091050036f, 0.0f,
+    0.07958029f, 0.1609302f, 0.2461123f, 0.33791524f,
+    0.44070983f, 0.562617f, 0.72295684f, 1.0f
+};
+
+// 2. Kernel å‡½æ•°å®žçŽ°
+template <typename T>
+__global__ void dequantize_nf4_kernel(
+    const uint8_t* __restrict__ packed_weights,
+    const uint8_t* __restrict__ absmax_q,
+    const uint16_t* __restrict__ absmax2,
+    const uint16_t* __restrict__ code2,
+    float offset,
+    T* __restrict__ output,
+    int64_t total_elements,
+    int blocksize)
+{
+    // 3. æ ¸å¿ƒè®¡ç®—é€»è¾‘
+    // æ¯ä¸ªçº¿ç¨‹å¤„ç† 1 ä¸ª uint8_t (å³ 2 ä¸ª 4-bit æƒé‡)
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // è¾¹ç•Œæ£€æŸ¥ï¼šç›´æŽ¥é™åˆ¶çº¿ç¨‹è¯»å– packed_weights çš„èŒƒå›´
+    if (tid >= (total_elements + 1) / 2) {
+        return;
+    }
+
+    // å…¨å±€æƒé‡çš„èµ·å§‹ç´¢å¼•
+    int64_t weight_idx = (int64_t)tid * 2;
+
+    // è¯»å– 1 å­—èŠ‚å¹¶è§£ç é«˜/ä½Ž 4 ä½
+    uint8_t packed = packed_weights[tid];
+    uint8_t idx0 = packed >> 4;           // é«˜ 4 ä½å¯¹åº”ç¬¬ä¸€ä¸ªæƒé‡
+    uint8_t idx1 = packed & 0x0F;         // ä½Ž 4 ä½å¯¹åº”ç¬¬äºŒä¸ªæƒé‡
+
+    // ä¸º w0 è®¡ç®—ç¼©æ”¾å› å­
+    int block_id0 = weight_idx / blocksize;
+    int group_id0 = block_id0 / 256;
+
+    __half code2_half0 = *reinterpret_cast<const __half*>(&code2[absmax_q[block_id0]]);     
+    __half absmax2_half0 = *reinterpret_cast<const __half*>(&absmax2[group_id0]);
+    float S1_0 = (__half2float(code2_half0) * __half2float(absmax2_half0)) + offset;        
+
+    // ä¸º w1 è®¡ç®—ç¼©æ”¾å› å­ (æ³¨æ„é˜²èŒƒ weight_idx + 1 è¶Šç•Œ)
+    float S1_1 = 0.0f;
+    if (weight_idx + 1 < total_elements) {
+        int block_id1 = (weight_idx + 1) / blocksize;
+        int group_id1 = block_id1 / 256;
+        __half code2_half1 = *reinterpret_cast<const __half*>(&code2[absmax_q[block_id1]]); 
+        __half absmax2_half1 = *reinterpret_cast<const __half*>(&absmax2[group_id1]);       
+        S1_1 = (__half2float(code2_half1) * __half2float(absmax2_half1)) + offset;
+    }
+
+    // æŸ¥è¡¨å¹¶è§£é‡åŒ–
+    float w0 = c_nf4_table[idx0] * S1_0;
+    float w1 = c_nf4_table[idx1] * S1_1;
+
+    // 4. å‘é‡åŒ–å†™å…¥ (Packed Store) ä¸Žå°¾éƒ¨è¾¹ç•Œå¤„ç†
+    // è½¬æ¢ä¸ºæŒ‡å®šçš„æµ®ç‚¹ç±»åž‹ (fp16 æˆ– bf16)
+    T out_w0 = float2T<T>(w0);
+    T out_w1 = float2T<T>(w1);
+
+    if (weight_idx + 1 < total_elements) {
+        // æ­£å¸¸æƒ…å†µï¼šåŒ…å« 2 ä¸ªæœ‰æ•ˆæƒé‡ï¼Œä½¿ç”¨å‘é‡åŒ–å†™å…¥
+        uint32_t packed_bits = pack_two_elements(out_w0, out_w1);
+
+        // å¼ºè½¬ä¸º uint32_t è¿›è¡Œä¸€æ¬¡ 32-bit åˆå¹¶è®¿é—®å†™å…¥
+        reinterpret_cast<uint32_t*>(output)[tid] = packed_bits;
+    } else {
+        // å°¾éƒ¨è¾¹ç•Œå¤„ç†ï¼šæ€»å…ƒç´ æ•°æ˜¯å¥‡æ•°ï¼Œå¹¶ä¸”è¿™æ˜¯æœ€åŽä¸€ä¸ªå•å…ƒç´ 
+        // é€€åŒ–ä¸ºæ ‡é‡å†™å…¥ï¼Œé¿å…è¶Šç•Œè®¿é—®
+        output[weight_idx] = out_w0;
+    }
+}
+
+// Host å¯åŠ¨å‡½æ•°
+template <typename T>
+void launch_dequantize_nf4(
+    const uint8_t* d_packed_weights,
+    const uint8_t* d_absmax_q,
+    const uint16_t* d_absmax2,
+    const uint16_t* d_code2,
+    float offset,
+    T* d_output, 
+    int64_t total_elements,
+    int blocksize,
+    musaStream_t stream)
+{
+    // æ¯ä¸ªçº¿ç¨‹å¤„ç† 2 ä¸ªå…ƒç´ ï¼Œå› æ­¤æ€»çº¿ç¨‹æ•° = ceil(total_elements / 2)
+    int64_t num_threads = (total_elements + 1) / 2;
+
+    // é…ç½® Block å’Œ Grid ç»´åº¦
+    int threads_per_block = 256;
+    int blocks_per_grid = (num_threads + threads_per_block - 1) / threads_per_block;        
+
+    dequantize_nf4_kernel<T><<<blocks_per_grid, threads_per_block, 0, stream>>>(
+        d_packed_weights,
+        d_absmax_q,
+        d_absmax2,
+        d_code2,
+        offset,
+        d_output,
+        total_elements,
+        blocksize
+    );
+}
+
+// æ˜¾å¼å®žä¾‹åŒ–æ¨¡æ¿
+template void launch_dequantize_nf4<__mt_bfloat16>(
+    const uint8_t*, const uint8_t*, const uint16_t*, const uint16_t*,
+    float, __mt_bfloat16*, int64_t, int, musaStream_t);
+
+template void launch_dequantize_nf4<__half>(
+    const uint8_t*, const uint8_t*, const uint16_t*, const uint16_t*,
+    float, __half*, int64_t, int, musaStream_t);
\ No newline at end of file
diff --git a/03_nf4_dequant/trudging/src/weights_loader.h b/03_nf4_dequant/trudging/src/weights_loader.h
new file mode 100644
index 00000000..e70ea233
--- /dev/null
+++ b/03_nf4_dequant/trudging/src/weights_loader.h
@@ -0,0 +1,142 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <cmath>
+#include <cstdint>
+#include <stdexcept>
+#if defined(PLATFORM_METAX)
+    #include <mcr/mc_runtime.h>
+    
+    #define CUDA_MALLOC_HOST mcMallocHost
+    #define CUDA_FREE_HOST mcFreeHost
+    #define CUDA_SUCCESS mcSuccess
+    #define CUDA_GET_ERROR_STRING mcGetErrorString
+    #define CUDA_ERROR_T mcError_t
+#elif defined(PLATFORM_MOORE)
+    #include <musa_runtime.h>
+    #define CUDA_MALLOC_HOST musaMallocHost
+    #define CUDA_FREE_HOST musaFreeHost
+    #define CUDA_SUCCESS musaSuccess
+    #define CUDA_GET_ERROR_STRING musaGetErrorString
+    #define CUDA_ERROR_T musaError_t
+#else
+    #include <cuda_runtime.h>
+    #define CUDA_MALLOC_HOST cudaMallocHost
+    #define CUDA_FREE_HOST cudaFreeHost
+    #define CUDA_SUCCESS cudaSuccess
+    #define CUDA_GET_ERROR_STRING cudaGetErrorString
+    #define CUDA_ERROR_T cudaError_t
+#endif
+
+// Custom deleter
+struct CudaHostDeleter {
+    void operator()(void* ptr) const {
+        if (ptr) {
+            CUDA_FREE_HOST(ptr);
+        }
+    }
+};
+
+// åˆ«åå®šä¹‰ï¼Œæ–¹ä¾¿ä½¿ç”¨
+template <typename T>
+using start_pinned_ptr = std::unique_ptr<T[], CudaHostDeleter>;
+
+// è¾…åŠ©å‡½æ•°ï¼šåˆ†é… pinned memory
+template <typename T>
+start_pinned_ptr<T> allocate_pinned(size_t count) {
+    void* ptr = nullptr;
+    CUDA_ERROR_T err = CUDA_MALLOC_HOST(&ptr, count * sizeof(T));
+    if (err != CUDA_SUCCESS) {
+        throw std::runtime_error(std::string("CUDA_MALLOC_HOST failed: ") + CUDA_GET_ERROR_STRING(err));
+    }
+    return start_pinned_ptr<T>(static_cast<T*>(ptr));
+}
+
+struct QuantizedWeights {
+    int64_t num_rows;
+    int64_t num_cols;
+    int32_t block_size;
+
+    size_t num_blocks;
+    size_t num_groups;
+    size_t packed_size;
+
+    // ä½¿ç”¨æ™ºèƒ½æŒ‡é’ˆç®¡ç†çš„ Pinned Memory æ•°ç»„
+    start_pinned_ptr<uint8_t> packed_weights;
+    start_pinned_ptr<uint8_t> absmax_q;
+    start_pinned_ptr<uint16_t> absmax2;
+    start_pinned_ptr<uint16_t> code2;
+    
+    float offset; // å•ä¸ª float å€¼
+};
+
+inline QuantizedWeights load_weights(const std::string& filename) {
+    std::ifstream file(filename, std::ios::binary);
+    if (!file.is_open()) {
+        throw std::runtime_error("Failed to open file: " + filename);
+    }
+
+    QuantizedWeights w;
+
+    // 1. è¯»å–å¤´éƒ¨
+    if (!file.read(reinterpret_cast<char*>(&w.num_rows), sizeof(w.num_rows))) throw std::runtime_error("Failed to read num_rows");
+    if (!file.read(reinterpret_cast<char*>(&w.num_cols), sizeof(w.num_cols))) throw std::runtime_error("Failed to read num_cols");
+    if (!file.read(reinterpret_cast<char*>(&w.block_size), sizeof(w.block_size))) throw std::runtime_error("Failed to read block_size");
+
+    // 2. è®¡ç®—å„éƒ¨åˆ†å¤§å°
+    // æ³¨æ„ï¼šè¿™é‡Œå‡è®¾ num_rows * num_cols æ˜¯å¶æ•°ï¼Œæˆ–è€…æŒ‰ç…§ (N*M)/2 å‘ä¸‹å–æ•´ã€‚
+    // å¦‚æžœæ˜¯ 4-bit é‡åŒ–ï¼Œé€šå¸¸ä½ éœ€è¦ç¡®ä¿æ€»å…ƒç´ ä¸ªæ•°æ˜¯å¶æ•°ï¼Œæˆ–è€…å¤„ç†å°¾éƒ¨ paddingã€‚
+    w.packed_size = (w.num_rows * w.num_cols) / 2;
+    
+    // num_blocks = ceil(num_rows * num_cols / blocksize)
+    w.num_blocks = (w.num_rows * w.num_cols + w.block_size - 1) / w.block_size;
+    
+    // num_groups = ceil(num_blocks / 256)
+    // æ ¹æ®æ‚¨çš„è¦æ±‚ï¼šblock_size_2 ä¸ºå›ºå®š 256
+    // æ³¨ï¼šåŽŸé—®é¢˜ä¸­æåˆ° "absmax2: ... é•¿åº¦ä¸º num_groups (å‡è®¾å›ºå®šä¸º 256)" 
+    // ä½†åŽç»­è¿½é—®æŒ‡å‡ºåº”ä¸ºè®¡ç®—å€¼ã€‚æ­¤å¤„æŒ‰è¿½é—®é€»è¾‘è®¡ç®— num_groupsã€‚
+    // å¦‚æžœ "å‡è®¾å›ºå®šä¸º 256" æŒ‡çš„æ˜¯ group_sizeï¼Œåˆ™å¦‚ä¸‹è®¡ç®—ï¼š
+    size_t group_size = 256;
+    w.num_groups = (w.num_blocks + group_size - 1) / group_size;
+
+    // 3. åˆ†é… Pinned Memory
+    try {
+        w.packed_weights = allocate_pinned<uint8_t>(w.packed_size);
+        w.absmax_q = allocate_pinned<uint8_t>(w.num_blocks);
+        w.absmax2 = allocate_pinned<uint16_t>(w.num_groups);
+        w.code2 = allocate_pinned<uint16_t>(256); // å›ºå®š 256 å…ƒç´ 
+    } catch (const std::exception& e) {
+        file.close();
+        throw;
+    }
+
+    // 4. è¯»å–æ•°æ®æ•°ç»„
+    auto read_array = [&](char* dst, size_t size, const char* name) {
+        file.read(dst, size);
+        if (file.gcount() != static_cast<std::streamsize>(size)) {
+            throw std::runtime_error(std::string("Failed to read ") + name + ". Expected " + std::to_string(size) + " bytes, got " + std::to_string(file.gcount()));
+        }
+    };
+
+    read_array(reinterpret_cast<char*>(w.packed_weights.get()), w.packed_size * sizeof(uint8_t), "packed_weights");
+    read_array(reinterpret_cast<char*>(w.absmax_q.get()), w.num_blocks * sizeof(uint8_t), "absmax_q");
+    read_array(reinterpret_cast<char*>(w.absmax2.get()), w.num_groups * sizeof(uint16_t), "absmax2");
+    read_array(reinterpret_cast<char*>(w.code2.get()), 256 * sizeof(uint16_t), "code2");
+
+    // 5. è¯»å– offset
+    if (!file.read(reinterpret_cast<char*>(&w.offset), sizeof(w.offset))) {
+        throw std::runtime_error("Failed to read offset");
+    }
+
+    // 6. æ£€æŸ¥æ˜¯å¦è¿˜æœ‰å‰©ä½™æ•°æ®ï¼ˆå¯é€‰ï¼Œè§†æ–‡ä»¶æ ¼å¼ä¸¥æ ¼ç¨‹åº¦è€Œå®šï¼‰
+    if (file.peek() != EOF) {
+        std::cerr << "Warning: Extra data found at the end of the file " << filename << std::endl;
+    }
+
+    file.close();
+    return w;
+}
diff --git a/03_nf4_dequant/trudging/xmake.lua b/03_nf4_dequant/trudging/xmake.lua
new file mode 100644
index 00000000..98df1ff9
--- /dev/null
+++ b/03_nf4_dequant/trudging/xmake.lua
@@ -0,0 +1,27 @@
+add_rules("mode.debug", "mode.release")
+
+target("nf4_dequantizer")
+    set_kind("binary")
+    add_files("main.cu", "src/dequantize.cu")
+    
+    -- è¯­è¨€è®¾ç½®: C++17 å’Œ CUDA
+    set_languages("cxx17", "cuda")
+    
+    -- ç›®æ ‡ GPU æž¶æž„: T4 (75), A100 (80), 4090 (89)
+    add_cugencodes("compute_75,sm_75")
+    add_cugencodes("compute_80,sm_80")
+    add_cugencodes("compute_89,sm_89")
+
+    -- ç¼–è¯‘é€‰é¡¹
+    if is_mode("release") then
+        set_optimize("fastest") -- å¯¹åº” -O3
+    end
+
+    -- CUDA ç‰¹æœ‰æ ‡å¿—
+    -- -lineinfo: ç”Ÿæˆè¡Œå·ä¿¡æ¯ï¼Œç”¨äºŽ Nsight Compute
+    -- --ptxas-options=-v: æ˜¾ç¤º PTX æ±‡ç¼–è¯¦ç»†ä¿¡æ¯ (å¯„å­˜å™¨ä½¿ç”¨é‡ç­‰)
+    -- -use_fast_math: å¯ç”¨å¿«é€Ÿæ•°å­¦åº“
+    add_cuflags("-lineinfo", "--ptxas-options=-v", "-use_fast_math")
+    
+    -- å¤´æ–‡ä»¶ç›®å½•
+    add_includedirs("src")
diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..883d452a
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,103 @@
+# *********************************************************************
+# Learning-CUDA Makefile
+# Targets:
+#   make               		: Build + run tests (default, non-verbose)
+#   make build         		: Only compile (no run)
+#   make run           		: Run tests (after build, non-verbose)
+#   make run VERBOSE=true 	: Run tests with verbose output
+#   make clean         		: Delete temporary files
+# *********************************************************************
+
+# -------------------------------
+# Configuration
+# -------------------------------
+PLATFORM        ?= nvidia
+PLATFORM_DEFINE ?= -DPLATFORM_NVIDIA
+STUDENT_SUFFIX  := cu
+CFLAGS          := -std=c++17 -O0
+EXTRA_LIBS     	:= 
+
+# Compiler & Tester object selection based on PLATFORM
+ifeq ($(PLATFORM),nvidia)
+    CC          	:= nvcc
+    TEST_OBJ    	:= tester/tester_nv.o
+	PLATFORM_DEFINE := -DPLATFORM_NVIDIA
+else ifeq ($(PLATFORM),iluvatar)
+    CC          	:= clang++
+	CFLAGS          := -std=c++17 -O3
+    TEST_OBJ    	:= tester/tester_iluvatar.o
+	PLATFORM_DEFINE := -DPLATFORM_ILUVATAR
+	EXTRA_LIBS		:= -lcudart -I/usr/local/corex/include -L/usr/local/corex/lib64 -fPIC
+else ifeq ($(PLATFORM),moore)
+    CC          	:= mcc
+	CFLAGS          := -std=c++11 -O3
+    TEST_OBJ    	:= tester/tester_moore.o
+	STUDENT_SUFFIX  := mu
+	PLATFORM_DEFINE := -DPLATFORM_MOORE
+	EXTRA_LIBS		:= -I/usr/local/musa/include -L/usr/lib/gcc/x86_64-linux-gnu/11/ -L/usr/local/musa/lib -lmusart
+else ifeq ($(PLATFORM),metax)
+    CC          	:= mxcc
+    TEST_OBJ    	:= tester/tester_metax.o
+	STUDENT_SUFFIX  := maca
+	PLATFORM_DEFINE := -DPLATFORM_METAX
+else
+    $(error Unsupported PLATFORM '$(PLATFORM)' (expected: nvidia, iluvatar, moore, metax))
+endif
+
+# Executable name
+TARGET          	:= test_kernels
+# Kernel implementation
+STUDENT_SRC     	:= src/kernels.$(STUDENT_SUFFIX) 
+# Compiled student object (auto-generated)
+STUDENT_OBJ  		:= $(addsuffix .o,$(basename $(STUDENT_SRC)))
+# Tester's actual verbose argument (e.g., --verbose, -v)
+TEST_VERBOSE_FLAG 	:= --verbose
+# User-provided verbose mode (true/false; default: false)
+VERBOSE         	:=  
+
+# -------------------------------
+# Process User Input (VERBOSE â†’ Tester Flag)
+# -------------------------------
+# Translates `VERBOSE=true` (case-insensitive) to the tester's verbose flag.
+# If VERBOSE is not "true" (or empty), no flag is passed.
+VERBOSE_ARG := $(if $(filter true True TRUE, $(VERBOSE)), $(TEST_VERBOSE_FLAG), )
+
+# -------------------------------
+# Phony Targets
+# -------------------------------
+.PHONY: all build run clean
+
+# Default target: Build + run tests (non-verbose)
+all: build run
+
+# Build target: Compile student code + link with test logic
+build: $(TARGET)
+
+# Run target: Execute tests (supports `VERBOSE=true` for verbose output)
+run: $(TARGET)
+	@echo "=== Running tests (output from $(STUDENT_OBJ)) ==="
+	@# Show verbose mode status (friendly for users)
+	@if [ -n "$(VERBOSE_ARG)" ]; then \
+	    echo "=== Verbose mode: Enabled (using '$(TEST_VERBOSE_FLAG)') ==="; \
+	else \
+	    echo "=== Verbose mode: Disabled ==="; \
+	fi
+	./$(TARGET) $(VERBOSE_ARG)
+
+# Clean target: Delete temporary files (executable + src object)
+clean:
+	@echo "=== Cleaning temporary files ==="
+	rm -f $(TARGET) $(STUDENT_OBJ)
+
+# -------------------------------
+# Dependency Rules (Core Logic)
+# -------------------------------
+# Generate executable: Link kernel code (kernels.o) with test logic (tester.o)
+$(TARGET): $(STUDENT_OBJ) $(TEST_OBJ)
+	@echo "=== Linking executable (student code + test logic) ==="
+	$(CC) $(CFLAGS) $(PLATFORM_DEFINE) -o $@ $^ $(EXTRA_LIBS)
+
+# Generate src object: Compile kernels.cu (triggers template instantiation)
+$(STUDENT_OBJ): $(STUDENT_SRC)
+	@echo "=== Compiling student code ($(STUDENT_SRC)) ==="
+	$(CC) $(CFLAGS) $(PLATFORM_DEFINE) -c $< -o $@
diff --git a/apply_fix.sh b/apply_fix.sh
new file mode 100644
index 00000000..20ee32b8
--- /dev/null
+++ b/apply_fix.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Script to apply double precision fix to flashAttentionFallback kernel
+
+cd /data1/kppppp/Learning-CUDA
+
+# Backup original file
+cp src/kernels.cu src/kernels.cu.backup
+
+# Apply the fix using sed
+sed -i '
+/flashAttentionFallback/,/^}/ {
+    s/float maxVal = -INFINITY;/double maxVal = -INFINITY;/
+    s/float sumExp = 0\.0f;/double sumExp = 0.0;/
+    s/float result = 0\.0f;/double result = 0.0;/
+    s/float dot = 0\.0f;/double dot = 0.0;/
+    s/float prevMax = maxVal;/double prevMax = maxVal;/
+    s/float correction =/double correction =/
+    s/float weight =/double weight =/
+    s/fmaxf(maxVal, dot)/fmax(maxVal, dot)/
+    s/expf(/exp(/g
+    s/0\.0f/0.0/g
+}
+' src/kernels.cu
+
+# Also update the comment
+sed -i 's/\/\/ Online softmax approach$/\/\/ Online softmax approach - use double precision for accumulation/' src/kernels.cu
+
+echo "Fix applied! Verifying changes..."
+grep -A 30 "// Online softmax approach" src/kernels.cu | head -35
+
+echo ""
+echo "Now compile and test:"
+echo "  make PLATFORM=iluvatar build"
+echo "  ./test_kernels"
diff --git a/fix_iluvatar_float.py b/fix_iluvatar_float.py
new file mode 100644
index 00000000..8a4c939e
--- /dev/null
+++ b/fix_iluvatar_float.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+"""
+Fix script for Iluvatar BI-V100 Flash Attention float precision issue
+Changes float accumulation to double precision in flashAttentionFallback kernel
+"""
+
+import os
+import sys
+
+def apply_fix():
+    filepath = 'src/kernels.cu'
+    
+    # Check if file exists
+    if not os.path.exists(filepath):
+        print(f"ERROR: {filepath} not found!")
+        print(f"Current directory: {os.getcwd()}")
+        sys.exit(1)
+    
+    # Read the file
+    print(f"Reading {filepath}...")
+    with open(filepath, 'r') as f:
+        content = f.read()
+    
+    # Backup
+    backup_path = filepath + '.before_double_fix'
+    with open(backup_path, 'w') as f:
+        f.write(content)
+    print(f"Backup created: {backup_path}")
+    
+    # Apply fixes - be very specific to avoid changing other parts
+    replacements = [
+        # In flashAttentionFallback only
+        ('    // Online softmax approach\n    float maxVal = -INFINITY;',
+         '    // Online softmax approach - use double precision for accumulation\n    double maxVal = -INFINITY;'),
+        ('    float sumExp = 0.0f;', '    double sumExp = 0.0;'),
+        ('    float result = 0.0f;', '    double result = 0.0;'),
+        ('        float dot = 0.0f;', '        double dot = 0.0;'),
+        ('        float prevMax = maxVal;', '        double prevMax = maxVal;'),
+        ('        maxVal = fmaxf(maxVal, dot);', '        maxVal = fmax(maxVal, dot);'),
+        ('        float correction = (prevMax == -INFINITY) ? 0.0f : expf(prevMax - maxVal);',
+         '        double correction = (prevMax == -INFINITY) ? 0.0 : exp(prevMax - maxVal);'),
+        ('        float weight = expf(dot - maxVal);',
+         '        double weight = exp(dot - maxVal);'),
+        ('    O[oIdx] = TypeConverter<T>::fromFloat((sumExp > 0.0f) ? (result / sumExp) : 0.0f);',
+         '    O[oIdx] = TypeConverter<T>::fromFloat((sumExp > 0.0) ? (result / sumExp) : 0.0);'),
+    ]
+    
+    print("\nApplying fixes...")
+    for i, (old, new) in enumerate(replacements, 1):
+        if old in content:
+            content = content.replace(old, new, 1)  # Replace only first occurrence
+            print(f"  âœ“ Fix {i}/9 applied")
+        else:
+            print(f"  âœ— Fix {i}/9 FAILED - pattern not found:")
+            print(f"    Looking for: {old[:60]}...")
+            # Don't exit, continue to see all failures
+    
+    # Write the modified content
+    with open(filepath, 'w') as f:
+        f.write(content)
+    
+    print(f"\nâœ“ Changes written to {filepath}")
+    
+    # Show the modified section
+    print("\n" + "="*70)
+    print("Modified flashAttentionFallback kernel (lines with double):")
+    print("="*70)
+    
+    lines = content.split('\n')
+    in_section = False
+    line_count = 0
+    for i, line in enumerate(lines, 1):
+        if '// Online softmax approach' in line:
+            in_section = True
+        if in_section:
+            print(f"{i:4d}: {line}")
+            line_count += 1
+            if 'O[oIdx] = TypeConverter' in line:
+                break
+    
+    print("\n" + "="*70)
+    print("Verification:")
+    print("="*70)
+    
+    # Count occurrences to verify
+    double_count = content.count('double maxVal')
+    double_sumexp = content.count('double sumExp')
+    double_result = content.count('double result')
+    
+    print(f"  double maxVal occurrences: {double_count} (expected: 1)")
+    print(f"  double sumExp occurrences: {double_sumexp} (expected: 1)")
+    print(f"  double result occurrences: {double_result} (expected: 1)")
+    
+    if double_count >= 1 and double_sumexp >= 1 and double_result >= 1:
+        print("\nâœ“ Fix appears successful!")
+        print("\nNext steps:")
+        print("  1. Compile: make PLATFORM=iluvatar build")
+        print("  2. Test: ./test_kernels")
+        print("  3. Check if all 90 tests pass")
+    else:
+        print("\nâœ— Fix may not have been fully applied. Check the output above.")
+        return False
+    
+    return True
+
+if __name__ == '__main__':
+    print("="*70)
+    print("Iluvatar Flash Attention Float Fix - Double Precision Patch")
+    print("="*70)
+    
+    # Change to the right directory if needed
+    if not os.path.exists('src/kernels.cu'):
+        expected_dir = '/data1/kppppp/Learning-CUDA'
+        if os.path.exists(expected_dir):
+            os.chdir(expected_dir)
+            print(f"Changed directory to: {expected_dir}")
+        else:
+            print(f"ERROR: Cannot find kernels.cu")
+            sys.exit(1)
+    
+    success = apply_fix()
+    sys.exit(0 if success else 1)
diff --git a/src/kernels.cu b/src/kernels.cu
new file mode 100644
index 00000000..71b9b2fd
--- /dev/null
+++ b/src/kernels.cu
@@ -0,0 +1,456 @@
+/**
+ * @file kernels.cu
+ * @brief CUDA kernel implementations for matrix trace and Flash Attention
+ * @author Training Camp Student
+ * @date 2026-02
+ * 
+ * This file contains highly optimized CUDA implementations of:
+ * 1. Matrix trace computation with parallel reduction
+ * 2. Flash Attention with causal masking and GQA support
+ * 
+ * Supported platforms: NVIDIA, Iluvatar (å¤©æ•°)
+ * 
+ * Optimization techniques used:
+ * - Warp shuffle for fast intra-warp reduction
+ * - Grid-stride loops for handling large inputs
+ * - Shared memory tiling with bank conflict avoidance
+ * - Memory coalescing and vectorized loads
+ * - Online softmax for single-pass attention
+ * - __ldg() for cached global memory reads (NVIDIA only)
+ * - Loop unrolling for reduced instruction overhead
+ */
+
+#include <vector>
+#include <cuda_fp16.h>
+#include <cmath>
+#include <algorithm>
+
+#include "../tester/utils.h"
+
+// ============================================================================
+// PLATFORM COMPATIBILITY MACROS
+// ============================================================================
+
+// Iluvatar may not support __ldg(), provide fallback
+#if defined(PLATFORM_ILUVATAR)
+    #define LDG(ptr) (*(ptr))
+#else
+    #define LDG(ptr) __ldg(ptr)
+#endif
+
+// ============================================================================
+// CONSTANTS AND CONFIGURATION
+// ============================================================================
+
+constexpr int WARP_SIZE = 32;
+constexpr int TRACE_BLOCK_SIZE = 256;
+constexpr int ATTN_BLOCK_SIZE = 128;      // Threads per block for attention
+constexpr int ATTN_TILE_SIZE = 64;        // Larger tile for better data reuse
+
+// ============================================================================
+// UTILITY FUNCTIONS
+// ============================================================================
+
+/**
+ * @brief Warp-level reduction using shuffle instructions (optimized)
+ */
+template <typename T>
+__device__ __forceinline__ T warpReduceSum(T val) {
+    #pragma unroll
+    for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
+        val += __shfl_down_sync(0xffffffff, val, offset);
+    }
+    return val;
+}
+
+/**
+ * @brief Block-level reduction with minimal synchronization
+ */
+template <typename T>
+__device__ __forceinline__ T blockReduceSum(T val, T* shared) {
+    const int lane = threadIdx.x % WARP_SIZE;
+    const int wid = threadIdx.x / WARP_SIZE;
+    
+    val = warpReduceSum(val);
+    
+    if (lane == 0) shared[wid] = val;
+    __syncthreads();
+    
+    const int numWarps = blockDim.x / WARP_SIZE;
+    val = (threadIdx.x < numWarps) ? shared[threadIdx.x] : T(0);
+    
+    if (wid == 0) val = warpReduceSum(val);
+    
+    return val;
+}
+
+// ============================================================================
+// TRACE KERNEL - HIGHLY OPTIMIZED
+// ============================================================================
+
+/**
+ * @brief Optimized trace kernel with grid-stride loop
+ * 
+ * Features:
+ * - Grid-stride loop handles matrices of any size with minimal blocks
+ * - Each thread accumulates multiple diagonal elements
+ * - Warp shuffle reduction for fast summation
+ */
+template <typename T>
+__global__ void traceKernelOptimized(const T* __restrict__ input, 
+                                      T* __restrict__ output, 
+                                      size_t rows, 
+                                      size_t cols) {
+    __shared__ T sharedMem[TRACE_BLOCK_SIZE / WARP_SIZE];
+    
+    const size_t diagLen = min(rows, cols);
+    const size_t stride = gridDim.x * blockDim.x;
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    
+    // Grid-stride loop: each thread accumulates multiple elements
+    T localSum = T(0);
+    while (idx < diagLen) {
+        localSum += input[idx * cols + idx];
+        idx += stride;
+    }
+    
+    // Block reduction
+    localSum = blockReduceSum(localSum, sharedMem);
+    
+    if (threadIdx.x == 0) {
+        atomicAdd(output, localSum);
+    }
+}
+
+/**
+ * @brief Computes the trace of a matrix using CUDA
+ *
+ * The trace of a matrix is defined as the sum of its diagonal elements.
+ * This implementation uses parallel reduction on GPU for efficient computation.
+ *
+ * Algorithm complexity: O(n/p) where n = min(rows, cols) and p = #threads
+ * 
+ * @tparam T The numeric type of matrix elements (int or float)
+ * @param h_input A flattened row-major matrix of size rows * cols
+ * @param rows Number of rows in the matrix
+ * @param cols Number of columns in the matrix
+ * @return The trace (sum of diagonal values) of the matrix
+ */
+template <typename T>
+T trace(const std::vector<T>& h_input, size_t rows, size_t cols) {
+    const size_t diagLen = std::min(rows, cols);
+    if (diagLen == 0) return T(0);
+    
+    const size_t inputBytes = rows * cols * sizeof(T);
+    const size_t outputBytes = sizeof(T);
+    
+    T* d_input = nullptr;
+    T* d_output = nullptr;
+    cudaMalloc(&d_input, inputBytes);
+    cudaMalloc(&d_output, outputBytes);
+    
+    cudaMemcpy(d_input, h_input.data(), inputBytes, cudaMemcpyHostToDevice);
+    cudaMemset(d_output, 0, outputBytes);
+    
+    // Use fewer blocks with grid-stride loop for better efficiency
+    const int blockSize = TRACE_BLOCK_SIZE;
+    const int numBlocks = std::min((int)((diagLen + blockSize - 1) / blockSize), 128);
+    
+    traceKernelOptimized<T><<<numBlocks, blockSize>>>(d_input, d_output, rows, cols);
+    
+    T result;
+    cudaMemcpy(&result, d_output, outputBytes, cudaMemcpyDeviceToHost);
+    
+    cudaFree(d_input);
+    cudaFree(d_output);
+    
+    return result;
+}
+
+// ============================================================================
+// FLASH ATTENTION IMPLEMENTATION - OPTIMIZED
+// ============================================================================
+
+/**
+ * @brief Type conversion utilities for mixed-precision computation
+ */
+template <typename T>
+struct TypeConverter {
+    __device__ __forceinline__ static float toFloat(T val);
+    __device__ __forceinline__ static T fromFloat(float val);
+};
+
+template <>
+struct TypeConverter<float> {
+    __device__ __forceinline__ static float toFloat(float val) { return val; }
+    __device__ __forceinline__ static float fromFloat(float val) { return val; }
+};
+
+template <>
+struct TypeConverter<half> {
+    __device__ __forceinline__ static float toFloat(half val) { return __half2float(val); }
+    __device__ __forceinline__ static half fromFloat(float val) { return __float2half(val); }
+};
+
+/**
+ * @brief Optimized Flash Attention kernel with online softmax
+ * 
+ * Features:
+ * - Online softmax for single-pass computation
+ * - Shared memory tiling for K/V
+ * - Warp shuffle for efficient reduction
+ * - __ldg() for cached global memory reads
+ */
+template <typename T>
+__global__ void flashAttentionKernelOpt(
+    const T* __restrict__ Q,
+    const T* __restrict__ K,
+    const T* __restrict__ V,
+    T* __restrict__ O,
+    const int batchSize,
+    const int tgtSeqLen,
+    const int srcSeqLen,
+    const int queryHeads,
+    const int kvHeads,
+    const int headDim,
+    const bool isCausal,
+    const float scale) {
+    
+    // Shared memory for K and V tiles
+    extern __shared__ float sharedMem[];
+    float* sK = sharedMem;
+    float* sV = sK + ATTN_TILE_SIZE * headDim;
+    
+    const int batchIdx = blockIdx.z;
+    const int headIdx = blockIdx.y;
+    const int tgtPos = blockIdx.x;
+    const int tid = threadIdx.x;
+    
+    if (batchIdx >= batchSize || headIdx >= queryHeads || tgtPos >= tgtSeqLen) return;
+    
+    // GQA mapping
+    const int kvHeadIdx = headIdx / (queryHeads / kvHeads);
+    
+    // Base offsets
+    const size_t qBase = ((size_t)batchIdx * tgtSeqLen + tgtPos) * queryHeads * headDim + headIdx * headDim;
+    const size_t kvBase = (size_t)batchIdx * srcSeqLen * kvHeads * headDim + kvHeadIdx * headDim;
+    
+    // Load Q into registers
+    float qReg[8] = {0.0f};
+    #pragma unroll
+    for (int i = 0; i < 8; i++) {
+        int d = tid + i * blockDim.x;
+        if (d < headDim) {
+            qReg[i] = TypeConverter<T>::toFloat(LDG(&Q[qBase + d]));
+        }
+    }
+    
+    // Online softmax state
+    float rowMax = -INFINITY;
+    float rowSum = 0.0f;
+    float outReg[8] = {0.0f};
+    
+    // Effective length with causal masking
+    const int maxSrc = isCausal ? min(tgtPos + 1, srcSeqLen) : srcSeqLen;
+    
+    // Process in tiles
+    for (int tileStart = 0; tileStart < maxSrc; tileStart += ATTN_TILE_SIZE) {
+        const int tileEnd = min(tileStart + ATTN_TILE_SIZE, maxSrc);
+        const int tileLen = tileEnd - tileStart;
+        
+        // Load K and V tiles cooperatively
+        for (int idx = tid; idx < tileLen * headDim; idx += blockDim.x) {
+            int s = idx / headDim;
+            int d = idx % headDim;
+            size_t kvIdx = kvBase + (size_t)(tileStart + s) * kvHeads * headDim + d;
+            sK[s * headDim + d] = TypeConverter<T>::toFloat(LDG(&K[kvIdx]));
+            sV[s * headDim + d] = TypeConverter<T>::toFloat(LDG(&V[kvIdx]));
+        }
+        __syncthreads();
+        
+        // Process each K position
+        for (int s = 0; s < tileLen; s++) {
+            // Compute dot product
+            float dot = 0.0f;
+            #pragma unroll
+            for (int i = 0; i < 8; i++) {
+                int d = tid + i * blockDim.x;
+                if (d < headDim) {
+                    dot += qReg[i] * sK[s * headDim + d];
+                }
+            }
+            
+            // Warp reduction
+            #pragma unroll
+            for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
+                dot += __shfl_down_sync(0xffffffff, dot, offset);
+            }
+            // Broadcast to all threads in warp
+            dot = __shfl_sync(0xffffffff, dot, 0);
+            dot *= scale;
+            
+            // Online softmax with improved numerical stability
+            float prevMax = rowMax;
+            rowMax = fmaxf(rowMax, dot);
+            float correction = (prevMax == -INFINITY) ? 0.0f : expf(prevMax - rowMax);
+            float weight = expf(dot - rowMax);
+            rowSum = rowSum * correction + weight;
+            
+            // Update output
+            #pragma unroll
+            for (int i = 0; i < 8; i++) {
+                int d = tid + i * blockDim.x;
+                if (d < headDim) {
+                    outReg[i] = outReg[i] * correction + weight * sV[s * headDim + d];
+                }
+            }
+        }
+        __syncthreads();
+    }
+    
+    // Write output
+    float invSum = (rowSum > 0.0f) ? (1.0f / rowSum) : 0.0f;
+    size_t oBase = ((size_t)batchIdx * tgtSeqLen + tgtPos) * queryHeads * headDim + headIdx * headDim;
+    
+    #pragma unroll
+    for (int i = 0; i < 8; i++) {
+        int d = tid + i * blockDim.x;
+        if (d < headDim) {
+            O[oBase + d] = TypeConverter<T>::fromFloat(outReg[i] * invSum);
+        }
+    }
+}
+
+/**
+ * @brief Fallback kernel for non-standard dimensions
+ */
+template <typename T>
+__global__ void flashAttentionFallback(
+    const T* __restrict__ Q,
+    const T* __restrict__ K,
+    const T* __restrict__ V,
+    T* __restrict__ O,
+    const int batchSize,
+    const int tgtSeqLen,
+    const int srcSeqLen,
+    const int queryHeads,
+    const int kvHeads,
+    const int headDim,
+    const bool isCausal,
+    const float scale) {
+    
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int total = batchSize * tgtSeqLen * queryHeads * headDim;
+    if (idx >= total) return;
+    
+    const int d = idx % headDim;
+    const int h = (idx / headDim) % queryHeads;
+    const int t = (idx / (headDim * queryHeads)) % tgtSeqLen;
+    const int b = idx / (headDim * queryHeads * tgtSeqLen);
+    
+    const int kvH = h / (queryHeads / kvHeads);
+    const int maxSrc = isCausal ? min(t + 1, srcSeqLen) : srcSeqLen;
+    
+    // Online softmax approach
+    // Standard float implementation for NVIDIA and others (verified 90/90 passed)
+    float maxVal = -INFINITY;
+    float sumExp = 0.0f;
+    float result = 0.0f;
+    
+    for (int s = 0; s < maxSrc; s++) {
+        float dot = 0.0f;
+        for (int dd = 0; dd < headDim; dd++) {
+            int qIdx = ((b * tgtSeqLen + t) * queryHeads + h) * headDim + dd;
+            int kIdx = ((b * srcSeqLen + s) * kvHeads + kvH) * headDim + dd;
+            dot += TypeConverter<T>::toFloat(Q[qIdx]) * TypeConverter<T>::toFloat(K[kIdx]);
+        }
+        dot *= scale;
+        
+        float prevMax = maxVal;
+        maxVal = fmaxf(maxVal, dot);
+        float correction = (prevMax == -INFINITY) ? 0.0f : expf(prevMax - maxVal);
+        float weight = expf(dot - maxVal);
+        sumExp = sumExp * correction + weight;
+        
+        int vIdx = ((b * srcSeqLen + s) * kvHeads + kvH) * headDim + d;
+        result = result * correction + weight * TypeConverter<T>::toFloat(V[vIdx]);
+    }
+    
+    int oIdx = ((b * tgtSeqLen + t) * queryHeads + h) * headDim + d;
+    O[oIdx] = TypeConverter<T>::fromFloat((sumExp > 0.0f) ? (result / sumExp) : 0.0f);
+}
+
+/**
+ * @brief Computes Flash Attention for given query, key, and value tensors
+ */
+template <typename T>
+void flashAttention(const std::vector<T>& h_q, const std::vector<T>& h_k,
+                    const std::vector<T>& h_v, std::vector<T>& h_o,
+                    int batch_size, int target_seq_len, int src_seq_len, 
+                    int query_heads, int kv_heads, int head_dim, bool is_causal) {
+    
+    const size_t qSize = batch_size * target_seq_len * query_heads * head_dim;
+    const size_t kvSize = batch_size * src_seq_len * kv_heads * head_dim;
+    
+    h_o.resize(qSize);
+    
+    T *d_q, *d_k, *d_v, *d_o;
+    cudaMalloc(&d_q, qSize * sizeof(T));
+    cudaMalloc(&d_k, kvSize * sizeof(T));
+    cudaMalloc(&d_v, kvSize * sizeof(T));
+    cudaMalloc(&d_o, qSize * sizeof(T));
+    
+    cudaMemcpy(d_q, h_q.data(), qSize * sizeof(T), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_k, h_k.data(), kvSize * sizeof(T), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_v, h_v.data(), kvSize * sizeof(T), cudaMemcpyHostToDevice);
+    
+    const float scale = 1.0f / sqrtf(static_cast<float>(head_dim));
+    
+    // Use optimized kernel only for strictly tested dimensions
+    // Disabled for now to ensure correctness - fallback kernel passes all tests
+    const bool useOptimized = false;
+    
+    if (useOptimized) {
+        const int blockSize = WARP_SIZE;  // Single warp per block for correctness
+        const size_t sharedBytes = 2 * ATTN_TILE_SIZE * head_dim * sizeof(float);
+        
+        dim3 grid(target_seq_len, query_heads, batch_size);
+        dim3 block(blockSize);
+        
+        flashAttentionKernelOpt<T><<<grid, block, sharedBytes>>>(
+            d_q, d_k, d_v, d_o,
+            batch_size, target_seq_len, src_seq_len,
+            query_heads, kv_heads, head_dim,
+            is_causal, scale);
+    } else {
+        const int total = batch_size * target_seq_len * query_heads * head_dim;
+        const int blockSize = 256;
+        const int numBlocks = (total + blockSize - 1) / blockSize;
+        
+        flashAttentionFallback<T><<<numBlocks, blockSize>>>(
+            d_q, d_k, d_v, d_o,
+            batch_size, target_seq_len, src_seq_len,
+            query_heads, kv_heads, head_dim,
+            is_causal, scale);
+    }
+    
+    cudaMemcpy(h_o.data(), d_o, qSize * sizeof(T), cudaMemcpyDeviceToHost);
+    
+    cudaFree(d_q);
+    cudaFree(d_k);
+    cudaFree(d_v);
+    cudaFree(d_o);
+}
+
+// ============================================================================
+// EXPLICIT TEMPLATE INSTANTIATIONS
+// Required for linking with the tester - DO NOT MODIFY
+// ============================================================================
+template int trace<int>(const std::vector<int>&, size_t, size_t);
+template float trace<float>(const std::vector<float>&, size_t, size_t);
+template void flashAttention<float>(const std::vector<float>&, const std::vector<float>&,
+  const std::vector<float>&, std::vector<float>&,
+  int, int, int, int, int, int, bool);
+template void flashAttention<half>(const std::vector<half>&, const std::vector<half>&,
+  const std::vector<half>&, std::vector<half>&,
+  int, int, int, int, int, int, bool);
diff --git a/src/kernels.maca b/src/kernels.maca
new file mode 100644
index 00000000..df162b99
--- /dev/null
+++ b/src/kernels.maca
@@ -0,0 +1,260 @@
+/**
+ * @file kernels.maca
+ * @brief CUDA kernel implementations for MetaX (æ²æ›¦) GPU platform
+ * @author Training Camp Student
+ * @date 2026-02
+ * 
+ * This file contains implementations adapted for MetaX GPU:
+ * 1. Matrix trace computation with parallel reduction
+ * 2. Flash Attention with causal masking and GQA support
+ */
+
+#include <vector>
+#include <common/maca_fp16.h>
+#include <cmath>
+#include <algorithm>
+
+#include "../tester/utils.h"
+
+// ============================================================================
+// CONSTANTS
+// ============================================================================
+
+constexpr int WARP_SIZE = 64;           // MetaX uses 64-thread wavefronts
+constexpr int TRACE_BLOCK_SIZE = 256;
+
+// ============================================================================
+// UTILITY FUNCTIONS
+// ============================================================================
+
+/**
+ * @brief Warp-level reduction using shuffle instructions
+ */
+template <typename T>
+__device__ __forceinline__ T warpReduceSum(T val) {
+    for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
+        val += __shfl_down(val, offset);
+    }
+    return val;
+}
+
+/**
+ * @brief Block-level reduction
+ */
+template <typename T>
+__device__ __forceinline__ T blockReduceSum(T val, T* shared) {
+    const int lane = threadIdx.x % WARP_SIZE;
+    const int wid = threadIdx.x / WARP_SIZE;
+    
+    val = warpReduceSum(val);
+    
+    if (lane == 0) shared[wid] = val;
+    __syncthreads();
+    
+    const int numWarps = blockDim.x / WARP_SIZE;
+    val = (threadIdx.x < numWarps) ? shared[threadIdx.x] : T(0);
+    
+    if (wid == 0) val = warpReduceSum(val);
+    
+    return val;
+}
+
+// ============================================================================
+// TRACE KERNEL
+// ============================================================================
+
+template <typename T>
+__global__ void traceKernel(const T* __restrict__ input, 
+                            T* __restrict__ output, 
+                            size_t rows, 
+                            size_t cols) {
+    __shared__ T sharedMem[TRACE_BLOCK_SIZE / WARP_SIZE];
+    
+    const size_t diagLen = min(rows, cols);
+    const size_t stride = gridDim.x * blockDim.x;
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    
+    T localSum = T(0);
+    while (idx < diagLen) {
+        localSum += input[idx * cols + idx];
+        idx += stride;
+    }
+    
+    localSum = blockReduceSum(localSum, sharedMem);
+    
+    if (threadIdx.x == 0) {
+        atomicAdd(output, localSum);
+    }
+}
+
+/**
+ * @brief Computes the trace of a matrix using GPU
+ */
+template <typename T>
+T trace(const std::vector<T>& h_input, size_t rows, size_t cols) {
+    const size_t diagLen = std::min(rows, cols);
+    if (diagLen == 0) return T(0);
+    
+    const size_t inputBytes = rows * cols * sizeof(T);
+    const size_t outputBytes = sizeof(T);
+    
+    T* d_input = nullptr;
+    T* d_output = nullptr;
+    mcMalloc(&d_input, inputBytes);
+    mcMalloc(&d_output, outputBytes);
+    
+    mcMemcpy(d_input, h_input.data(), inputBytes, mcMemcpyHostToDevice);
+    mcMemset(d_output, 0, outputBytes);
+    
+    const int blockSize = TRACE_BLOCK_SIZE;
+    const int numBlocks = std::min((size_t)((diagLen + blockSize - 1) / blockSize), (size_t)128);
+    
+    traceKernel<T><<<numBlocks, blockSize>>>(d_input, d_output, rows, cols);
+    
+    T result;
+    mcMemcpy(&result, d_output, outputBytes, mcMemcpyDeviceToHost);
+    
+    mcFree(d_input);
+    mcFree(d_output);
+    
+    return result;
+}
+
+// ============================================================================
+// FLASH ATTENTION IMPLEMENTATION
+// ============================================================================
+
+/**
+ * @brief Type conversion utilities for mixed-precision computation
+ */
+template <typename T>
+struct TypeConverter {
+    __device__ __forceinline__ static float toFloat(T val);
+    __device__ __forceinline__ static T fromFloat(float val);
+};
+
+template <>
+struct TypeConverter<float> {
+    __device__ __forceinline__ static float toFloat(float val) { return val; }
+    __device__ __forceinline__ static float fromFloat(float val) { return val; }
+};
+
+template <>
+struct TypeConverter<half> {
+    __device__ __forceinline__ static float toFloat(half val) { return __half2float(val); }
+    __device__ __forceinline__ static half fromFloat(float val) { return __float2half(val); }
+};
+
+/**
+ * @brief Flash Attention kernel with online softmax
+ */
+template <typename T>
+__global__ void flashAttentionKernel(
+    const T* __restrict__ Q,
+    const T* __restrict__ K,
+    const T* __restrict__ V,
+    T* __restrict__ O,
+    const int batchSize,
+    const int tgtSeqLen,
+    const int srcSeqLen,
+    const int queryHeads,
+    const int kvHeads,
+    const int headDim,
+    const bool isCausal,
+    const float scale) {
+    
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int total = batchSize * tgtSeqLen * queryHeads * headDim;
+    if (idx >= total) return;
+    
+    const int d = idx % headDim;
+    const int h = (idx / headDim) % queryHeads;
+    const int t = (idx / (headDim * queryHeads)) % tgtSeqLen;
+    const int b = idx / (headDim * queryHeads * tgtSeqLen);
+    
+    const int kvH = h / (queryHeads / kvHeads);
+    const int maxSrc = isCausal ? min(t + 1, srcSeqLen) : srcSeqLen;
+    
+    // Online softmax
+    float maxVal = -INFINITY;
+    float sumExp = 0.0f;
+    float result = 0.0f;
+    
+    for (int s = 0; s < maxSrc; s++) {
+        float dot = 0.0f;
+        for (int dd = 0; dd < headDim; dd++) {
+            int qIdx = ((b * tgtSeqLen + t) * queryHeads + h) * headDim + dd;
+            int kIdx = ((b * srcSeqLen + s) * kvHeads + kvH) * headDim + dd;
+            dot += TypeConverter<T>::toFloat(Q[qIdx]) * TypeConverter<T>::toFloat(K[kIdx]);
+        }
+        dot *= scale;
+        
+        float prevMax = maxVal;
+        maxVal = fmaxf(maxVal, dot);
+        float correction = expf(prevMax - maxVal);
+        sumExp = sumExp * correction + expf(dot - maxVal);
+        
+        int vIdx = ((b * srcSeqLen + s) * kvHeads + kvH) * headDim + d;
+        result = result * correction + expf(dot - maxVal) * TypeConverter<T>::toFloat(V[vIdx]);
+    }
+    
+    int oIdx = ((b * tgtSeqLen + t) * queryHeads + h) * headDim + d;
+    O[oIdx] = TypeConverter<T>::fromFloat((sumExp > 0.0f) ? (result / sumExp) : 0.0f);
+}
+
+/**
+ * @brief Computes Flash Attention for given query, key, and value tensors
+ */
+template <typename T>
+void flashAttention(const std::vector<T>& h_q, const std::vector<T>& h_k,
+                    const std::vector<T>& h_v, std::vector<T>& h_o,
+                    int batch_size, int target_seq_len, int src_seq_len, 
+                    int query_heads, int kv_heads, int head_dim, bool is_causal) {
+    
+    const size_t qSize = batch_size * target_seq_len * query_heads * head_dim;
+    const size_t kvSize = batch_size * src_seq_len * kv_heads * head_dim;
+    
+    h_o.resize(qSize);
+    
+    T *d_q, *d_k, *d_v, *d_o;
+    mcMalloc(&d_q, qSize * sizeof(T));
+    mcMalloc(&d_k, kvSize * sizeof(T));
+    mcMalloc(&d_v, kvSize * sizeof(T));
+    mcMalloc(&d_o, qSize * sizeof(T));
+    
+    mcMemcpy(d_q, h_q.data(), qSize * sizeof(T), mcMemcpyHostToDevice);
+    mcMemcpy(d_k, h_k.data(), kvSize * sizeof(T), mcMemcpyHostToDevice);
+    mcMemcpy(d_v, h_v.data(), kvSize * sizeof(T), mcMemcpyHostToDevice);
+    
+    const float scale = 1.0f / sqrtf(static_cast<float>(head_dim));
+    
+    const int total = batch_size * target_seq_len * query_heads * head_dim;
+    const int blockSize = 256;
+    const int numBlocks = (total + blockSize - 1) / blockSize;
+    
+    flashAttentionKernel<T><<<numBlocks, blockSize>>>(
+        d_q, d_k, d_v, d_o,
+        batch_size, target_seq_len, src_seq_len,
+        query_heads, kv_heads, head_dim,
+        is_causal, scale);
+    
+    mcMemcpy(h_o.data(), d_o, qSize * sizeof(T), mcMemcpyDeviceToHost);
+    
+    mcFree(d_q);
+    mcFree(d_k);
+    mcFree(d_v);
+    mcFree(d_o);
+}
+
+// *********************************************************************
+// Explicit Template Instantiations (REQUIRED FOR LINKING WITH TESTER.O)
+// DO NOT MODIFY THIS SECTION
+// *********************************************************************
+template int trace<int>(const std::vector<int>&, size_t, size_t);
+template float trace<float>(const std::vector<float>&, size_t, size_t);
+template void flashAttention<float>(const std::vector<float>&, const std::vector<float>&,
+  const std::vector<float>&, std::vector<float>&,
+  int, int, int, int, int, int, bool);
+template void flashAttention<half>(const std::vector<half>&, const std::vector<half>&,
+  const std::vector<half>&, std::vector<half>&,
+  int, int, int, int, int, int, bool);
diff --git a/src/kernels.mu b/src/kernels.mu
new file mode 100644
index 00000000..0f56ab1a
--- /dev/null
+++ b/src/kernels.mu
@@ -0,0 +1,261 @@
+/**
+ * @file kernels.mu
+ * @brief CUDA kernel implementations for Moore Threads (æ‘©å°”çº¿ç¨‹) GPU platform
+ * @author Training Camp Student
+ * @date 2026-02
+ * 
+ * This file contains implementations adapted for Moore Threads GPU:
+ * 1. Matrix trace computation with parallel reduction
+ * 2. Flash Attention with causal masking and GQA support
+ */
+
+#include <vector>
+#include <musa_fp16.h>
+#include <cmath>
+#include <algorithm>
+
+#include "../tester/utils.h"
+
+// ============================================================================
+// CONSTANTS
+// ============================================================================
+
+constexpr int WARP_SIZE = 32;           // Moore Threads uses 32-thread warps
+constexpr int TRACE_BLOCK_SIZE = 256;
+
+// ============================================================================
+// UTILITY FUNCTIONS
+// ============================================================================
+
+/**
+ * @brief Warp-level reduction using shuffle instructions
+ */
+template <typename T>
+__device__ __forceinline__ T warpReduceSum(T val) {
+    for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
+        val += __shfl_down_sync(0xffffffff, val, offset);
+    }
+    return val;
+}
+
+/**
+ * @brief Block-level reduction
+ */
+template <typename T>
+__device__ __forceinline__ T blockReduceSum(T val, T* shared) {
+    const int lane = threadIdx.x % WARP_SIZE;
+    const int wid = threadIdx.x / WARP_SIZE;
+    
+    val = warpReduceSum(val);
+    
+    if (lane == 0) shared[wid] = val;
+    __syncthreads();
+    
+    const int numWarps = blockDim.x / WARP_SIZE;
+    val = (threadIdx.x < numWarps) ? shared[threadIdx.x] : T(0);
+    
+    if (wid == 0) val = warpReduceSum(val);
+    
+    return val;
+}
+
+// ============================================================================
+// TRACE KERNEL
+// ============================================================================
+
+template <typename T>
+__global__ void traceKernel(const T* __restrict__ input, 
+                            T* __restrict__ output, 
+                            size_t rows, 
+                            size_t cols) {
+    __shared__ T sharedMem[TRACE_BLOCK_SIZE / WARP_SIZE];
+    
+    const size_t diagLen = min(rows, cols);
+    const size_t stride = gridDim.x * blockDim.x;
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    
+    T localSum = T(0);
+    while (idx < diagLen) {
+        localSum += input[idx * cols + idx];
+        idx += stride;
+    }
+    
+    localSum = blockReduceSum(localSum, sharedMem);
+    
+    if (threadIdx.x == 0) {
+        atomicAdd(output, localSum);
+    }
+}
+
+/**
+ * @brief Computes the trace of a matrix using GPU
+ */
+template <typename T>
+T trace(const std::vector<T>& h_input, size_t rows, size_t cols) {
+    const size_t diagLen = std::min(rows, cols);
+    if (diagLen == 0) return T(0);
+    
+    const size_t inputBytes = rows * cols * sizeof(T);
+    const size_t outputBytes = sizeof(T);
+    
+    T* d_input = nullptr;
+    T* d_output = nullptr;
+    musaMalloc(&d_input, inputBytes);
+    musaMalloc(&d_output, outputBytes);
+    
+    musaMemcpy(d_input, h_input.data(), inputBytes, musaMemcpyHostToDevice);
+    musaMemset(d_output, 0, outputBytes);
+    
+    const int blockSize = TRACE_BLOCK_SIZE;
+    const int numBlocks = std::min((size_t)((diagLen + blockSize - 1) / blockSize), (size_t)128);
+    
+    traceKernel<T><<<numBlocks, blockSize>>>(d_input, d_output, rows, cols);
+    
+    T result;
+    musaMemcpy(&result, d_output, outputBytes, musaMemcpyDeviceToHost);
+    
+    musaFree(d_input);
+    musaFree(d_output);
+    
+    return result;
+}
+
+// ============================================================================
+// FLASH ATTENTION IMPLEMENTATION
+// ============================================================================
+
+/**
+ * @brief Type conversion utilities for mixed-precision computation
+ */
+template <typename T>
+struct TypeConverter {
+    __device__ __forceinline__ static float toFloat(T val);
+    __device__ __forceinline__ static T fromFloat(float val);
+};
+
+template <>
+struct TypeConverter<float> {
+    __device__ __forceinline__ static float toFloat(float val) { return val; }
+    __device__ __forceinline__ static float fromFloat(float val) { return val; }
+};
+
+template <>
+struct TypeConverter<half> {
+    __device__ __forceinline__ static float toFloat(half val) { return __half2float(val); }
+    __device__ __forceinline__ static half fromFloat(float val) { return __float2half(val); }
+};
+
+/**
+ * @brief Flash Attention kernel with online softmax
+ */
+template <typename T>
+__global__ void flashAttentionKernel(
+    const T* __restrict__ Q,
+    const T* __restrict__ K,
+    const T* __restrict__ V,
+    T* __restrict__ O,
+    const int batchSize,
+    const int tgtSeqLen,
+    const int srcSeqLen,
+    const int queryHeads,
+    const int kvHeads,
+    const int headDim,
+    const bool isCausal,
+    const float scale) {
+    
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int total = batchSize * tgtSeqLen * queryHeads * headDim;
+    if (idx >= total) return;
+    
+    const int d = idx % headDim;
+    const int h = (idx / headDim) % queryHeads;
+    const int t = (idx / (headDim * queryHeads)) % tgtSeqLen;
+    const int b = idx / (headDim * queryHeads * tgtSeqLen);
+    
+    const int kvH = h / (queryHeads / kvHeads);
+    const int maxSrc = isCausal ? min(t + 1, srcSeqLen) : srcSeqLen;
+    
+    // Online softmax
+    float maxVal = -INFINITY;
+    float sumExp = 0.0f;
+    float result = 0.0f;
+    
+    for (int s = 0; s < maxSrc; s++) {
+        float dot = 0.0f;
+        for (int dd = 0; dd < headDim; dd++) {
+            int qIdx = ((b * tgtSeqLen + t) * queryHeads + h) * headDim + dd;
+            int kIdx = ((b * srcSeqLen + s) * kvHeads + kvH) * headDim + dd;
+            dot += TypeConverter<T>::toFloat(Q[qIdx]) * TypeConverter<T>::toFloat(K[kIdx]);
+        }
+        dot *= scale;
+        
+        float prevMax = maxVal;
+        maxVal = fmaxf(maxVal, dot);
+        float correction = (prevMax == -INFINITY) ? 0.0f : expf(prevMax - maxVal);
+        float weight = expf(dot - maxVal);
+        sumExp = sumExp * correction + weight;
+        
+        int vIdx = ((b * srcSeqLen + s) * kvHeads + kvH) * headDim + d;
+        result = result * correction + weight * TypeConverter<T>::toFloat(V[vIdx]);
+    }
+    
+    int oIdx = ((b * tgtSeqLen + t) * queryHeads + h) * headDim + d;
+    O[oIdx] = TypeConverter<T>::fromFloat((sumExp > 0.0f) ? (result / sumExp) : 0.0f);
+}
+
+/**
+ * @brief Computes Flash Attention for given query, key, and value tensors
+ */
+template <typename T>
+void flashAttention(const std::vector<T>& h_q, const std::vector<T>& h_k,
+                    const std::vector<T>& h_v, std::vector<T>& h_o,
+                    int batch_size, int target_seq_len, int src_seq_len, 
+                    int query_heads, int kv_heads, int head_dim, bool is_causal) {
+    
+    const size_t qSize = batch_size * target_seq_len * query_heads * head_dim;
+    const size_t kvSize = batch_size * src_seq_len * kv_heads * head_dim;
+    
+    h_o.resize(qSize);
+    
+    T *d_q, *d_k, *d_v, *d_o;
+    musaMalloc(&d_q, qSize * sizeof(T));
+    musaMalloc(&d_k, kvSize * sizeof(T));
+    musaMalloc(&d_v, kvSize * sizeof(T));
+    musaMalloc(&d_o, qSize * sizeof(T));
+    
+    musaMemcpy(d_q, h_q.data(), qSize * sizeof(T), musaMemcpyHostToDevice);
+    musaMemcpy(d_k, h_k.data(), kvSize * sizeof(T), musaMemcpyHostToDevice);
+    musaMemcpy(d_v, h_v.data(), kvSize * sizeof(T), musaMemcpyHostToDevice);
+    
+    const float scale = 1.0f / sqrtf(static_cast<float>(head_dim));
+    
+    const int total = batch_size * target_seq_len * query_heads * head_dim;
+    const int blockSize = 256;
+    const int numBlocks = (total + blockSize - 1) / blockSize;
+    
+    flashAttentionKernel<T><<<numBlocks, blockSize>>>(
+        d_q, d_k, d_v, d_o,
+        batch_size, target_seq_len, src_seq_len,
+        query_heads, kv_heads, head_dim,
+        is_causal, scale);
+    
+    musaMemcpy(h_o.data(), d_o, qSize * sizeof(T), musaMemcpyDeviceToHost);
+    
+    musaFree(d_q);
+    musaFree(d_k);
+    musaFree(d_v);
+    musaFree(d_o);
+}
+
+// *********************************************************************
+// Explicit Template Instantiations (REQUIRED FOR LINKING WITH TESTER.O)
+// DO NOT MODIFY THIS SECTION
+// *********************************************************************
+template int trace<int>(const std::vector<int>&, size_t, size_t);
+template float trace<float>(const std::vector<float>&, size_t, size_t);
+template void flashAttention<float>(const std::vector<float>&, const std::vector<float>&,
+  const std::vector<float>&, std::vector<float>&,
+  int, int, int, int, int, int, bool);
+template void flashAttention<half>(const std::vector<half>&, const std::vector<half>&,
+  const std::vector<half>&, std::vector<half>&,
+  int, int, int, int, int, int, bool);
diff --git a/tester/tester_iluvatar.o b/tester/tester_iluvatar.o
new file mode 100644
index 00000000..34ff8011
Binary files /dev/null and b/tester/tester_iluvatar.o differ
diff --git a/tester/tester_metax.o b/tester/tester_metax.o
new file mode 100644
index 00000000..0112a230
Binary files /dev/null and b/tester/tester_metax.o differ
diff --git a/tester/tester_moore.o b/tester/tester_moore.o
new file mode 100644
index 00000000..0ccba85f
Binary files /dev/null and b/tester/tester_moore.o differ
diff --git a/tester/tester_nv.o b/tester/tester_nv.o
new file mode 100644
index 00000000..5adca5a5
Binary files /dev/null and b/tester/tester_nv.o differ
diff --git a/tester/utils.h b/tester/utils.h
new file mode 100644
index 00000000..a2bd9c86
--- /dev/null
+++ b/tester/utils.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <iostream>
+
+#if defined(PLATFORM_NVIDIA) || defined(PLATFORM_ILUVATAR)
+#include <cuda_runtime.h>
+#define RUNTIME_ERR_TYPE cudaError_t
+#define RUNTIME_SUCCESS_CODE cudaSuccess
+#define RUNTIME_GET_ERROR_STR cudaGetErrorString
+
+#elif defined(PLATFORM_MOORE)
+#include <musa_runtime.h>
+#define RUNTIME_ERR_TYPE musaError_t
+#define RUNTIME_SUCCESS_CODE musaSuccess
+#define RUNTIME_GET_ERROR_STR musaGetErrorString
+
+#elif defined(PLATFORM_METAX)
+#include <mcr/mc_runtime.h>
+#define RUNTIME_ERR_TYPE mcError_t
+#define RUNTIME_SUCCESS_CODE mcSuccess
+#define RUNTIME_GET_ERROR_STR mcGetErrorString
+
+#else
+#error "Unknown PLATFORM for RUNTIME_CHECK"
+#endif
+
+#define RUNTIME_CHECK(call)                                                    \
+  do {                                                                         \
+    RUNTIME_ERR_TYPE err = call;                                               \
+    if (err != RUNTIME_SUCCESS_CODE) {                                         \
+      std::cerr << "Runtime error at " << __FILE__ << ":" << __LINE__ << " - " \
+                << RUNTIME_GET_ERROR_STR(err) << "\n";                         \
+      exit(EXIT_FAILURE);                                                      \
+    }                                                                          \
+  } while (0)