diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..185649a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+.venv/
+*.pyc
+*.sqlite
+*.nsys-rep
+*.bin
\ No newline at end of file
diff --git a/03_nf4_dequant/ikko/gen_data.py b/03_nf4_dequant/ikko/gen_data.py
new file mode 100644
index 0000000..b894d47
--- /dev/null
+++ b/03_nf4_dequant/ikko/gen_data.py
@@ -0,0 +1,94 @@
+import torch
+import bitsandbytes as bnb
+import struct
+import os
+import numpy as np
+
+def generate_inputs(rows=16384, cols=16384, blocksize=64, output_dir="03_nf4_dequant/ikko/data", compute_type="bf16"):
+    os.makedirs(output_dir, exist_ok=True)
+    
+    compute_type = compute_type.lower()
+    if compute_type not in {"bf16", "fp16"}:
+        raise ValueError("compute_type must be 'bf16' or 'fp16'")
+
+    print(f"Generating data: {rows}x{cols}, blocksize={blocksize}, compute_type={compute_type}")
+    
+    # 1. 准备原始权重 (使用 GPU 和 BF16)
+    device = torch.device("cuda")
+    # 模拟真实权重分布 (Normal Float 4 针对正态分布优化)
+    orig_weight = torch.randn(rows, cols, dtype=torch.bfloat16, device=device)
+    
+    # 2. 使用 bitsandbytes 进行 NF4 + Double Quantization
+    # quant_type='nf4', compress_statistics=True 开启双重量化
+    packed_weight, quant_state = bnb.functional.quantize_4bit(
+        orig_weight, 
+        blocksize=blocksize, 
+        quant_type='nf4', 
+        compress_statistics=True
+    )
+    
+    # 3. 生成官方参考结果 (Ground Truth)
+    # CUDA Kernel 输出必须逼近这个结果
+    ref_output = bnb.functional.dequantize_4bit(
+        packed_weight, 
+        quant_state, 
+        quant_type='nf4', 
+        blocksize=blocksize
+    )
+    
+    # 4. 提取双重量化参数 (为了写入 input bin 文件)
+    # bitsandbytes 的 QuantState 结构解析:
+    # - absmax: 一级量化因子 (已被二级量化，uint8)
+    # - nested quant state: 兼容 nested_quant_state / state2
+    absmax_q = quant_state.absmax.to(torch.uint8)  # uint8
+    nested_state = getattr(quant_state, "nested_quant_state", None)
+    if nested_state is None:
+        nested_state = getattr(quant_state, "state2", None)
+    if nested_state is None:
+        raise RuntimeError(
+            "Double-quantization state not found. "
+            "Please ensure bitsandbytes supports compress_statistics=True "
+            "and provides nested quantization fields."
+        )
+    absmax2 = nested_state.absmax  # float32 (需转 float16)
+    code2 = nested_state.code      # float32 (需转 float16)
+    offset = getattr(quant_state, "offset", 0.0)
+    if isinstance(offset, torch.Tensor):
+        offset = float(offset.item())
+    
+    # 5. 写入题目要求的二进制输入文件 (weight.bin)
+    input_path = os.path.join(output_dir, "weight_data.bin")
+    with open(input_path, "wb") as f:
+        # [Header]
+        f.write(struct.pack("qqi", rows, cols, blocksize))
+        
+        # [Data]
+        # packed_weights (uint8)
+        f.write(packed_weight.cpu().numpy().tobytes())
+        # absmax_q (uint8)
+        f.write(absmax_q.cpu().numpy().tobytes())
+        # absmax2 (float16)
+        f.write(absmax2.to(torch.float16).cpu().numpy().tobytes())
+        # code2 (float16)
+        f.write(code2.to(torch.float16).cpu().numpy().tobytes())
+        # offset (float32)
+        f.write(struct.pack("f", float(offset)))
+        
+    print(f"-> Input file saved to: {input_path}")
+    
+    # 6. 保存 Ground Truth 用于后续验证 (truth.bin)
+    truth_path = os.path.join(output_dir, "ground_truth.bin")
+    with open(truth_path, "wb") as f:
+        # 保存为纯二进制流 (row-major, bf16/fp16)
+        if compute_type == "bf16":
+            ref_out = ref_output.to(torch.bfloat16)
+            ref_bytes = ref_out.view(torch.int16).cpu().numpy().tobytes()
+        else:
+            ref_out = ref_output.to(torch.float16)
+            ref_bytes = ref_out.cpu().numpy().tobytes()
+        f.write(ref_bytes)
+    
+    print(f"-> Ground truth saved to: {truth_path}")
+
+if __name__ == "__main__":  # run as a script: generate the data set with the default sizes and paths
+    generate_inputs(compute_type="bf16")  # ground truth is stored as bf16
\ No newline at end of file
diff --git a/03_nf4_dequant/ikko/main.cu b/03_nf4_dequant/ikko/main.cu
new file mode 100644
index 0000000..fd42929
--- /dev/null
+++ b/03_nf4_dequant/ikko/main.cu
@@ -0,0 +1,244 @@
+#include <cuda_runtime.h>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+struct Header {  // fixed-size fields at the start of the input file, in file order
+    int64_t num_rows;   // matrix rows
+    int64_t num_cols;   // matrix columns
+    int32_t blocksize;  // weights per quantization block
+};
+__constant__ float NF4_LUT[16] = {  // NF4 code table in constant memory, indexed by the 4-bit code
+    -1.0f,                 // 0b0000
+    -0.6961928009986877f,  // 0b0001
+    -0.5250730514526367f,  // 0b0010
+    -0.39491748809814453f, // 0b0011
+    -0.28444138169288635f, // 0b0100
+    -0.18477343022823334f, // 0b0101
+    -0.09105003625154495f, // 0b0110
+    0.0f,                  // 0b0111
+    0.07958029955625534f,  // 0b1000
+    0.16093020141124725f,  // 0b1001
+    0.24611230194568634f,  // 0b1010
+    0.33791524171829224f,  // 0b1011
+    0.44070982933044434f,  // 0b1100
+    0.5626170039176941f,   // 0b1101
+    0.7229568362236023f,   // 0b1110
+    1.0f                   // 0b1111
+};
+// Report a failed CUDA runtime call on stderr and terminate with exit code 99.
+void checkCuda(cudaError_t result, const char *func, const char *file, int line) {
+    if (result == cudaSuccess) return;  // success: nothing to report
+    std::cerr << "CUDA error at " << file << ":" << line << " code=" << result << " \"" << func << "\" \n"
+              << "Error string: " << cudaGetErrorString(result) << std::endl;
+    exit(99);
+}
+#define CHECK_CUDA(val) checkCuda((val), #val, __FILE__, __LINE__)
+
+// Reference NF4 dequantization kernel: each thread decodes one packed byte
+// (two 4-bit codes) into two bf16 values. Block scales are double-quantized:
+//   scale = code2[absmax_q[block]] * absmax2[block / 256] + offset
+// Only num_elements / 2 whole bytes are decoded; an odd trailing element is skipped.
+__global__ void nf4_decode_kernel_native(
+    const uint8_t* __restrict__ packed_weights,
+    const uint8_t* __restrict__ absmax_q,
+    const half* __restrict__ absmax2,
+    const half* __restrict__ code2,
+    const float offset, // added to the decoded block scale
+    __nv_bfloat16* __restrict__ output,
+    int64_t num_elements,
+    int blocksize
+) {
+    // 1. One thread per packed byte; 64-bit math so the index cannot wrap.
+    int64_t tid = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+    int64_t total_bytes = num_elements / 2;
+    // Surplus threads have nothing to decode.
+    if (tid >= total_bytes) return;
+
+    // 2. Unpack. bitsandbytes keeps element 2*i in the high nibble and
+    //    element 2*i+1 in the low nibble of byte i.
+    uint8_t packed = packed_weights[tid];
+    uint8_t idx1 = (packed >> 4) & 0x0F;
+    uint8_t idx2 = packed & 0x0F;
+
+    // 3. Locate the quantization block (blocksize weights = blocksize / 2 bytes)
+    //    and its second-level group.
+    int bytes_per_block = blocksize / 2;
+    int block_id = static_cast<int>(tid / bytes_per_block);
+    int group_id = block_id / 256; // assumes 256 blocks share one absmax2 entry
+
+    // 4. Rebuild the block scale; the offset is part of the scale, not of
+    //    the individual weights.
+    float a2 = __half2float(absmax2[group_id]);
+    uint8_t qa = absmax_q[block_id];
+    float c2 = __half2float(code2[qa]);
+    float real_absmax = c2 * a2 + offset;
+
+    // 5. Table lookup, scale, and store both outputs.
+    output[tid * 2]     = __float2bfloat16(NF4_LUT[idx1] * real_absmax);
+    output[tid * 2 + 1] = __float2bfloat16(NF4_LUT[idx2] * real_absmax);
+}
+
+__global__ void nf4_decode_kernel(  // same parameter list as nf4_decode_kernel_native; body not implemented
+    const uint8_t* __restrict__ packed_weights,
+    const uint8_t* __restrict__ absmax_q,
+    const half* __restrict__ absmax2,
+    const half* __restrict__ code2,
+    const float offset, // usually 0 according to the original note (TODO confirm)
+    __nv_bfloat16* __restrict__ output,
+    int64_t num_elements,
+    int blocksize
+) {
+    // TODO: not implemented; a launch of this kernel writes nothing to output.
+}
+
+
+
+int main(int argc, char** argv) {
+    // 1. Input parsing: open the binary input file (paths are fixed; argc/argv are unused).
+    std::string input_file = "test_data/weight_data.bin";
+    std::string output_file = "test_data/output.bin";
+    std::ifstream infile(input_file, std::ios::binary);
+    if (!infile) {
+        std::cerr << "Error: Cannot open input file." << std::endl;
+        return 1;
+    }
+
+    // Header: num_rows (int64), num_cols (int64), blocksize (int32), read field by field.
+    int64_t num_rows, num_cols;
+    int32_t blocksize;
+    infile.read(reinterpret_cast<char*>(&num_rows), sizeof(int64_t));
+    infile.read(reinterpret_cast<char*>(&num_cols), sizeof(int64_t));
+    infile.read(reinterpret_cast<char*>(&blocksize), sizeof(int32_t));
+    if (!infile || num_rows <= 0 || num_cols <= 0 || blocksize < 2) { std::cerr << "Error: Invalid or truncated header." << std::endl; return 1; }
+    // 2. Size planning: element, block and group counts.
+    int64_t num_elements = num_rows * num_cols;
+    int64_t num_blocks = (num_elements + blocksize - 1) / blocksize;
+    int64_t num_groups = (num_blocks + 255) / 256; // one absmax2 entry per 256 blocks
+
+    // 3. Load the payload and allocate device memory.
+    size_t size_packed = num_elements >>1; // num_elements / 2 bytes: one byte holds two weights
+    size_t size_absmax_q = num_blocks * sizeof(uint8_t);
+    size_t size_absmax2 = num_groups * sizeof(half); // float16
+    size_t size_code2 = 256 * sizeof(half); // float16
+    float offset; // float32
+
+    std::vector<uint8_t> h_packed(size_packed);
+    std::vector<uint8_t> h_absmax_q(num_blocks);
+    std::vector<half> h_absmax2(num_groups);
+    std::vector<half> h_code2(256);
+
+    infile.read(reinterpret_cast<char*>(h_packed.data()), size_packed);
+    infile.read(reinterpret_cast<char*>(h_absmax_q.data()), size_absmax_q);
+    infile.read(reinterpret_cast<char*>(h_absmax2.data()), size_absmax2);
+    infile.read(reinterpret_cast<char*>(h_code2.data()), size_code2);
+    infile.read(reinterpret_cast<char*>(&offset), sizeof(float));
+    if (!infile) { std::cerr << "Error: Input file is truncated." << std::endl; return 1; }
+    infile.close();
+    // Device buffers.
+    uint8_t* d_packed = nullptr;
+    uint8_t* d_absmax_q = nullptr;
+    half* d_absmax2 = nullptr;
+    half* d_code2 = nullptr;
+    __nv_bfloat16 *d_output = nullptr;
+
+    CHECK_CUDA(cudaMalloc(&d_packed, size_packed));
+    CHECK_CUDA(cudaMalloc(&d_absmax_q, size_absmax_q));
+    CHECK_CUDA(cudaMalloc(&d_absmax2, size_absmax2));
+    CHECK_CUDA(cudaMalloc(&d_code2, size_code2));
+    CHECK_CUDA(cudaMalloc(&d_output, num_elements * sizeof(__nv_bfloat16)));
+    CHECK_CUDA(cudaMemcpy(d_packed, h_packed.data(), size_packed, cudaMemcpyHostToDevice));
+    CHECK_CUDA(cudaMemcpy(d_absmax_q, h_absmax_q.data(), size_absmax_q, cudaMemcpyHostToDevice));
+    CHECK_CUDA(cudaMemcpy(d_absmax2, h_absmax2.data(), size_absmax2, cudaMemcpyHostToDevice));
+    CHECK_CUDA(cudaMemcpy(d_code2, h_code2.data(), size_code2, cudaMemcpyHostToDevice));
+
+
+
+    // 4. Launch configuration and kernel launch.
+    dim3 blockDim(256);
+    // Each thread is budgeted 16 packed bytes when the grid size is capped below.
+    int64_t total_bytes = (num_elements + 1) / 2;
+    int64_t total_words = (total_bytes + 15) / 16;
+    int sm_count = 0;
+    CHECK_CUDA(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, 0));
+    int max_active_blocks = 0;
+    CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks,
+        nf4_decode_kernel,
+        blockDim.x,
+        0));
+    int grid_x = sm_count * max_active_blocks;
+    int64_t max_grid = (total_words + blockDim.x - 1) / blockDim.x;
+    if (grid_x > max_grid) {
+        grid_x = static_cast<int>(max_grid);
+    }
+    if (grid_x < 1) {
+        grid_x = 1;
+    }
+    dim3 gridDim(grid_x);
+    std::cout << "SM count: " << sm_count
+              << ", max active blocks/SM: " << max_active_blocks
+              << ", grid_x: " << grid_x << std::endl;
+    int group_size = static_cast<int>((num_blocks + num_groups - 1) / num_groups); // not passed to any kernel yet
+    // NOTE: nf4_decode_kernel is still an empty stub in this file, so d_output is never written.
+    // Timing events.
+    cudaEvent_t start, stop;
+    CHECK_CUDA(cudaEventCreate(&start));
+    CHECK_CUDA(cudaEventCreate(&stop));
+
+    // Warmup
+    nf4_decode_kernel<<<gridDim, blockDim>>>(
+        d_packed, d_absmax_q, d_absmax2, d_code2, offset, d_output, num_elements, blocksize
+    );
+    CHECK_CUDA(cudaDeviceSynchronize());
+
+    const int iters = 100;
+    CHECK_CUDA(cudaEventRecord(start));
+    for (int i = 0; i < iters; ++i) {
+        nf4_decode_kernel<<<gridDim, blockDim>>>(
+            d_packed, d_absmax_q, d_absmax2, d_code2, offset, d_output, num_elements, blocksize
+        );
+    }
+    CHECK_CUDA(cudaEventRecord(stop));
+    // 5. Collect launch errors and wait for the timed launches to finish.
+    CHECK_CUDA(cudaGetLastError());
+    CHECK_CUDA(cudaEventSynchronize(stop));
+
+    float milliseconds = 0;
+    CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop));
+    milliseconds /= iters;
+
+    // 6. Copy the result back to the host.
+    std::vector<__nv_bfloat16> h_output(num_elements);
+    CHECK_CUDA(cudaMemcpy(h_output.data(), d_output, num_elements * sizeof(__nv_bfloat16), cudaMemcpyDeviceToHost));
+
+    // 7. Report performance (named total_io_bytes to avoid redeclaring total_bytes).
+    double total_io_bytes = static_cast<double>(size_packed + size_absmax_q + size_absmax2 + size_code2) +
+                            static_cast<double>(num_elements * 2);
+    double bandwidth = total_io_bytes / (milliseconds / 1000.0) / 1e9;
+    std::cout << "Kernel Time: " << milliseconds << " ms" << std::endl;
+    std::cout << "Effective Bandwidth (approx): " << bandwidth << " GB/s" << std::endl;
+
+    // 8. Write the output file.
+    std::ofstream outfile(output_file, std::ios::binary);
+    outfile.write(reinterpret_cast<char*>(h_output.data()), num_elements * sizeof(__nv_bfloat16));
+    outfile.close();
+    std::cout << "Output written to " << output_file << std::endl;
+
+    // Cleanup
+    cudaFree(d_packed);
+    cudaFree(d_absmax_q);
+    cudaFree(d_absmax2);
+    cudaFree(d_code2);
+    cudaFree(d_output);
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+
+    return 0;
+}
\ No newline at end of file
diff --git a/03_nf4_dequant/ikko/mainla b/03_nf4_dequant/ikko/mainla
new file mode 100755
index 0000000..62abc85
Binary files /dev/null and b/03_nf4_dequant/ikko/mainla differ
diff --git a/03_nf4_dequant/ikko/mainla.cu b/03_nf4_dequant/ikko/mainla.cu
new file mode 100644
index 0000000..2614b56
--- /dev/null
+++ b/03_nf4_dequant/ikko/mainla.cu
@@ -0,0 +1,406 @@
+#include <cuda_runtime.h>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <cstring>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <unistd.h>
+// struct Header {
+//     int64_t num_rows;
+//     int64_t num_cols;
+//     int32_t blocksize;
+// };
+__constant__ float NF4_LUT[16] = {  // NF4 code table in constant memory, indexed by the 4-bit code
+    -1.0f,                 // 0b0000
+    -0.6961928009986877f,  // 0b0001
+    -0.5250730514526367f,  // 0b0010
+    -0.39491748809814453f, // 0b0011
+    -0.28444138169288635f, // 0b0100
+    -0.18477343022823334f, // 0b0101
+    -0.09105003625154495f, // 0b0110
+    0.0f,                  // 0b0111
+    0.07958029955625534f,  // 0b1000
+    0.16093020141124725f,  // 0b1001
+    0.24611230194568634f,  // 0b1010
+    0.33791524171829224f,  // 0b1011
+    0.44070982933044434f,  // 0b1100
+    0.5626170039176941f,   // 0b1101
+    0.7229568362236023f,   // 0b1110
+    1.0f                   // 0b1111
+};
+// Report a failed CUDA runtime call on stderr and terminate with exit code 99.
+void checkCuda(cudaError_t result, const char *func, const char *file, int line) {
+    if (result == cudaSuccess) return;  // success: nothing to report
+    std::cerr << "CUDA error at " << file << ":" << line << " code=" << result << " \"" << func << "\" \n"
+              << "Error string: " << cudaGetErrorString(result) << std::endl;
+    exit(99);
+}
+#define CHECK_CUDA(val) checkCuda((val), #val, __FILE__, __LINE__)
+
+// Convert a float to the kernel's output element type; only the explicit
+// specializations below (__nv_bfloat16 and half) are defined.
+template <typename T> __device__ __forceinline__ T float_to_out(float value);
+
+template <> __device__ __forceinline__
+__nv_bfloat16 float_to_out<__nv_bfloat16>(float value) {
+    return __float2bfloat16(value);
+}
+template <> __device__ __forceinline__
+half float_to_out<half>(float value) {
+    return __float2half(value);
+}
+
+// Convert two floats into one packed pair of T and return the pair's raw
+// 32 bits, so a caller can store both values with a single 32-bit write.
+template <typename T> __device__ __forceinline__ uint32_t pack_pair_to_u32(float v1, float v2);
+
+template <> __device__ __forceinline__
+uint32_t pack_pair_to_u32<__nv_bfloat16>(float v1, float v2) {
+    __nv_bfloat162 pair = __floats2bfloat162_rn(v1, v2);
+    return reinterpret_cast<const uint32_t&>(pair);
+}
+template <> __device__ __forceinline__
+uint32_t pack_pair_to_u32<half>(float v1, float v2) {
+    half2 pair = __floats2half2_rn(v1, v2);
+    return reinterpret_cast<const uint32_t&>(pair);
+}
+
+// __global__ void nf4_decode_kernel_native(
+//     const uint8_t* __restrict__ packed_weights,
+//     const uint8_t* __restrict__ absmax_q,
+//     const half* __restrict__ absmax2,
+//     const half* __restrict__ code2,
+//     const float offset, // 通常为 0
+//     __nv_bfloat16* __restrict__ output,
+//     int64_t num_elements,
+//     int blocksize,
+//     int group_size
+// ) {
+//     // 1. 全局一维线程索引
+//     // 每个线程负责 1 个字节（即 2 个 4-bit 权重）
+//     int64_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+//     int64_t total_bytes = (num_elements + 1) / 2;
+//     int64_t stride = static_cast<int64_t>(gridDim.x) * blockDim.x;
+
+//     for (int64_t byte_idx = tid; byte_idx < total_bytes; byte_idx += stride) {
+//         // 2. 读取这 1 个字节，并解包成两个 4-bit 索引
+//         uint8_t packed = packed_weights[byte_idx];
+//         uint8_t idx1 = (packed >> 4) & 0x0F; // 高 4 位对应 output[byte_idx * 2]
+//         uint8_t idx2 = packed & 0x0F;        // 低 4 位对应 output[byte_idx * 2 + 1]
+
+//         // 3. 计算当前字节属于哪一个量化 Block 和 Group
+//         // 每一个 Block 包含 blocksize 个权重，即 blocksize / 2 个字节
+//         int bytes_per_block = blocksize / 2;
+//         int block_id = static_cast<int>(byte_idx / bytes_per_block);
+//         int group_id = block_id / group_size;
+
+//         // 4. 暴力从全局内存 (Global Memory) 读取双重量化参数
+//         float a2 = __half2float(absmax2[group_id]);
+//         uint8_t qa = absmax_q[block_id];
+//         float c2 = __half2float(code2[qa]);
+//         float real_absmax = c2 * a2 + offset;
+
+//         // 5. 结合 NF4 查表，计算真实的浮点权重
+//         float w1_fp32 = NF4_LUT[idx1] * real_absmax;
+//         float w2_fp32 = NF4_LUT[idx2] * real_absmax;
+
+//         // 6. 分别写回 
+//         int64_t out_idx = byte_idx * 2;
+//         if (out_idx < num_elements) {
+//             output[out_idx] = __float2bfloat16(w1_fp32);
+//         }
+//         if (out_idx + 1 < num_elements) {
+//             output[out_idx + 1] = __float2bfloat16(w2_fp32);
+//         }
+//     }
+// }
+
+// NF4 dequantization kernel: threads stride over the packed bytes, decode two
+// 4-bit codes per byte and store both results with one 32-bit write. Block
+// scale = code2[absmax_q[block]] * absmax2[block / group_size] + offset.
+template <typename OutT>
+__global__ void nf4_decode_kernel(
+    const uint8_t* __restrict__ packed_weights,
+    const uint8_t* __restrict__ absmax_q,
+    const half* __restrict__ absmax2,
+    const half* __restrict__ code2,
+    const float offset,
+    OutT* __restrict__ output,
+    int64_t num_elements,
+    int blocksize,
+    int group_size
+) {
+    // Widen before multiplying so the 32-bit built-ins cannot wrap.
+    int64_t tid = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+    int64_t stride = static_cast<int64_t>(gridDim.x) * blockDim.x;
+    int64_t total_bytes = (num_elements + 1) / 2;
+    int64_t full_pair_bytes = num_elements / 2;
+    int bytes_per_block = blocksize / 2;
+
+    // Stage the 16-entry NF4 table in shared memory once per thread block.
+    __shared__ float s_LUT[16];
+    if (threadIdx.x < 16) {
+        s_LUT[threadIdx.x] = NF4_LUT[threadIdx.x];
+    }
+    __syncthreads();
+    uint32_t* out_u32 = reinterpret_cast<uint32_t*>(output);
+
+    for (int64_t byte_idx = tid; byte_idx < full_pair_bytes; byte_idx += stride) {
+        uint8_t packed = packed_weights[byte_idx];
+        int block_id = static_cast<int>(byte_idx / bytes_per_block);
+        int group_id = block_id / group_size;
+        // Per-thread scale lookup: a lane-0 broadcast is only right when a whole warp
+        // sits in one quantization block, and a full-mask shuffle is unsafe in this loop.
+        float real_absmax = (__half2float(absmax2[group_id]) * __half2float(code2[absmax_q[block_id]])) + offset;
+        float v1 = s_LUT[packed >> 4] * real_absmax;
+        float v2 = s_LUT[packed & 0x0F] * real_absmax;
+        out_u32[byte_idx] = pack_pair_to_u32<OutT>(v1, v2);
+    }
+
+    // Odd element count: the last byte carries a single value in its high nibble.
+    if ((num_elements & 1) != 0 && tid == 0) {
+        int64_t tail_byte = total_bytes - 1;
+        uint8_t packed = packed_weights[tail_byte];
+        int block_id = static_cast<int>(tail_byte / bytes_per_block);
+        int group_id = block_id / group_size;
+        float real_absmax = (__half2float(absmax2[group_id]) * __half2float(code2[absmax_q[block_id]])) + offset;
+        output[num_elements - 1] = float_to_out<OutT>(s_LUT[packed >> 4] * real_absmax);
+    }
+}
+
+int main(int argc, char** argv) {
+    // 1. Argument parsing: optional output dtype (argv[1]) and output path (argv[2]).
+    std::string input_file = "03_nf4_dequant/ikko/data/weight_data.bin";
+    std::string output_file = "03_nf4_dequant/ikko/data/output.bin";
+    enum class OutputType { BF16, FP16 };
+    OutputType output_type = OutputType::BF16;
+    if (argc >= 2) {
+        if (std::strcmp(argv[1], "bf16") == 0) {
+            output_type = OutputType::BF16;
+            output_file = "03_nf4_dequant/ikko/data/output_bf16.bin";
+        } else if (std::strcmp(argv[1], "fp16") == 0) {
+            output_type = OutputType::FP16;
+            output_file = "03_nf4_dequant/ikko/data/output_fp16.bin";
+        } else {
+            std::cerr << "Usage: " << argv[0] << " [bf16|fp16] [output_file]" << std::endl;
+            return 1;
+        }
+    }
+    if (argc >= 3) {
+        output_file = argv[2];
+    }
+    std::ifstream infile(input_file, std::ios::binary);
+    if (!infile) {
+        char cwd[4096];
+        if (getcwd(cwd, sizeof(cwd)) != nullptr) {
+            std::cerr << "CWD: " << cwd << std::endl;
+        }
+        std::cerr << "Error: Cannot open input file: " << input_file << std::endl;
+        std::string fallback_file = "03_nf4_dequant/ikko/data/weight_data.bin"; // NOTE: same path as input_file
+        infile.open(fallback_file, std::ios::binary);
+        if (!infile) {
+            std::cerr << "Error: Cannot open fallback input file: " << fallback_file << std::endl;
+            return 1;
+        }
+        input_file = fallback_file;
+    }
+
+    // Header: num_rows (int64), num_cols (int64), blocksize (int32); then measure the file size.
+    int64_t num_rows, num_cols;
+    int32_t blocksize;
+    infile.read(reinterpret_cast<char*>(&num_rows), sizeof(int64_t));
+    infile.read(reinterpret_cast<char*>(&num_cols), sizeof(int64_t));
+    infile.read(reinterpret_cast<char*>(&blocksize), sizeof(int32_t));
+    std::streampos data_start = infile.tellg();
+    infile.seekg(0, std::ios::end);
+    int64_t file_size = static_cast<int64_t>(infile.tellg());
+    infile.seekg(data_start, std::ios::beg);
+    if (!infile || num_rows <= 0 || num_cols <= 0 || blocksize < 2) { std::cerr << "Error: Invalid or truncated header." << std::endl; return 1; }
+    // 2. Size planning.
+    int64_t num_elements = num_rows * num_cols;
+    int64_t num_blocks = (num_elements + blocksize - 1) / blocksize;
+    int64_t num_groups = 0;
+
+    // 3. Load the payload and allocate device memory.
+    size_t size_packed = (num_elements + 1) >> 1; // two weights per byte, rounded up so an odd tail byte is read
+    size_t size_packed_padded = (size_packed + 15) & ~static_cast<size_t>(15);
+    size_t size_absmax_q = num_blocks * sizeof(uint8_t);
+    size_t size_absmax2 = 0; // float16 entries; sized once num_groups is known
+    size_t size_code2 = 256 * sizeof(half); // float16
+    float offset; // float32
+
+    std::vector<uint8_t> h_packed(size_packed_padded, 0);
+    std::vector<uint8_t> h_absmax_q(num_blocks);
+    // Infer num_groups from the file size (tolerates different group sizes); otherwise assume 256 blocks per group.
+    int64_t header_size = static_cast<int64_t>(sizeof(int64_t) * 2 + sizeof(int32_t));
+    int64_t remaining = file_size - header_size -
+                        static_cast<int64_t>(size_packed + size_absmax_q + size_code2 + sizeof(float));
+    if (remaining > 0 && (remaining % sizeof(half) == 0)) {
+        num_groups = remaining / static_cast<int64_t>(sizeof(half));
+        size_absmax2 = static_cast<size_t>(num_groups) * sizeof(half);
+    } else {
+        num_groups = (num_blocks + 255) / 256;
+        size_absmax2 = static_cast<size_t>(num_groups) * sizeof(half);
+    }
+
+    std::vector<half> h_absmax2(num_groups);
+    std::vector<half> h_code2(256);
+
+    infile.read(reinterpret_cast<char*>(h_packed.data()), size_packed);
+    infile.read(reinterpret_cast<char*>(h_absmax_q.data()), size_absmax_q);
+    infile.read(reinterpret_cast<char*>(h_absmax2.data()), size_absmax2);
+    infile.read(reinterpret_cast<char*>(h_code2.data()), size_code2);
+    infile.read(reinterpret_cast<char*>(&offset), sizeof(float));
+    if (!infile) { std::cerr << "Error: Input file is truncated: " << input_file << std::endl; return 1; }
+    infile.close();
+    // Device buffers; only the output buffer matching output_type is allocated.
+    uint8_t* d_packed = nullptr;
+    uint8_t* d_absmax_q = nullptr;
+    half* d_absmax2 = nullptr;
+    half* d_code2 = nullptr;
+    __nv_bfloat16 *d_output_bf16 = nullptr;
+    half *d_output_fp16 = nullptr;
+
+    CHECK_CUDA(cudaMalloc(&d_packed, size_packed_padded));
+    CHECK_CUDA(cudaMalloc(&d_absmax_q, size_absmax_q));
+    CHECK_CUDA(cudaMalloc(&d_absmax2, size_absmax2));
+    CHECK_CUDA(cudaMalloc(&d_code2, size_code2));
+    if (output_type == OutputType::BF16) {
+        CHECK_CUDA(cudaMalloc(&d_output_bf16, num_elements * sizeof(__nv_bfloat16)));
+    } else {
+        CHECK_CUDA(cudaMalloc(&d_output_fp16, num_elements * sizeof(half)));
+    }
+    CHECK_CUDA(cudaMemcpy(d_packed, h_packed.data(), size_packed_padded, cudaMemcpyHostToDevice));
+    CHECK_CUDA(cudaMemcpy(d_absmax_q, h_absmax_q.data(), size_absmax_q, cudaMemcpyHostToDevice));
+    CHECK_CUDA(cudaMemcpy(d_absmax2, h_absmax2.data(), size_absmax2, cudaMemcpyHostToDevice));
+    CHECK_CUDA(cudaMemcpy(d_code2, h_code2.data(), size_code2, cudaMemcpyHostToDevice));
+
+
+
+    // 4. Launch configuration: grid sized from occupancy, capped at one thread per packed byte.
+    dim3 blockDim(256);
+    int64_t total_bytes = (num_elements + 1) / 2;
+    int sm_count = 0;
+    CHECK_CUDA(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, 0));
+    int max_active_blocks = 0;
+    if (output_type == OutputType::BF16) {
+        CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+            &max_active_blocks,
+            nf4_decode_kernel<__nv_bfloat16>,
+            blockDim.x,
+            0));
+    } else {
+        CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+            &max_active_blocks,
+            nf4_decode_kernel<half>,
+            blockDim.x,
+            0));
+    }
+    int grid_x = sm_count * max_active_blocks;
+    int64_t max_grid = (total_bytes + blockDim.x - 1) / blockDim.x;
+    if (grid_x > max_grid) {
+        grid_x = static_cast<int>(max_grid);
+    }
+    if (grid_x < 1) {
+        grid_x = 1;
+    }
+    dim3 gridDim(grid_x);
+    std::cout << "SM count: " << sm_count
+              << ", max active blocks/SM: " << max_active_blocks
+              << ", grid_x: " << grid_x << std::endl;
+    int group_size = static_cast<int>((num_blocks + num_groups - 1) / num_groups);
+    // group_size = quantization blocks per absmax2 entry, passed to the kernel.
+    // Timing events.
+    cudaEvent_t start, stop;
+    CHECK_CUDA(cudaEventCreate(&start));
+    CHECK_CUDA(cudaEventCreate(&stop));
+
+    // Warmup
+    if (output_type == OutputType::BF16) {
+        nf4_decode_kernel<__nv_bfloat16><<<gridDim, blockDim>>>(
+            d_packed, d_absmax_q, d_absmax2, d_code2, offset, d_output_bf16, num_elements, blocksize, group_size
+        );
+    } else {
+        nf4_decode_kernel<half><<<gridDim, blockDim>>>(
+            d_packed, d_absmax_q, d_absmax2, d_code2, offset, d_output_fp16, num_elements, blocksize, group_size
+        );
+    }
+    CHECK_CUDA(cudaDeviceSynchronize());
+
+    const int iters = 100;
+    CHECK_CUDA(cudaEventRecord(start));
+    for (int i = 0; i < iters; ++i) {
+        if (output_type == OutputType::BF16) {
+            nf4_decode_kernel<__nv_bfloat16><<<gridDim, blockDim>>>(
+                d_packed, d_absmax_q, d_absmax2, d_code2, offset, d_output_bf16, num_elements, blocksize, group_size
+            );
+        } else {
+            nf4_decode_kernel<half><<<gridDim, blockDim>>>(
+                d_packed, d_absmax_q, d_absmax2, d_code2, offset, d_output_fp16, num_elements, blocksize, group_size
+            );
+        }
+    }
+    CHECK_CUDA(cudaEventRecord(stop));
+    // 5. Collect launch errors and wait for the timed launches to finish.
+    CHECK_CUDA(cudaGetLastError());
+    CHECK_CUDA(cudaEventSynchronize(stop));
+
+    float milliseconds = 0;
+    CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop));
+    milliseconds /= iters;
+
+    // 6. Copy the result back to the host.
+    std::vector<__nv_bfloat16> h_output_bf16;
+    std::vector<half> h_output_fp16;
+    if (output_type == OutputType::BF16) {
+        h_output_bf16.resize(num_elements);
+        CHECK_CUDA(cudaMemcpy(h_output_bf16.data(), d_output_bf16, num_elements * sizeof(__nv_bfloat16), cudaMemcpyDeviceToHost));
+    } else {
+        h_output_fp16.resize(num_elements);
+        CHECK_CUDA(cudaMemcpy(h_output_fp16.data(), d_output_fp16, num_elements * sizeof(half), cudaMemcpyDeviceToHost));
+    }
+
+    // 7. Report performance; the bitsandbytes reference numbers are hard-coded constants.
+    double total_io_bytes = static_cast<double>(size_packed + size_absmax_q + size_absmax2 + size_code2) +
+                            static_cast<double>(num_elements * 2);
+    double bandwidth = total_io_bytes / (milliseconds / 1000.0) / 1e9;
+    constexpr double bnb_ref_ms = 1.243360;
+    constexpr double bnb_ref_bw = 543.14;
+    double speedup_vs_bnb = bnb_ref_ms / static_cast<double>(milliseconds);
+    double bw_ratio_vs_bnb = bandwidth / bnb_ref_bw;
+    std::cout << "Kernel Time: " << milliseconds << " ms" << std::endl;
+    std::cout << "Effective Bandwidth (approx): " << bandwidth << " GB/s" << std::endl;
+    std::cout << "Speedup vs bitsandbytes: " << speedup_vs_bnb << "x"
+              << " (ref " << bnb_ref_ms << " ms)" << std::endl;
+    std::cout << "Bandwidth ratio vs bitsandbytes: " << bw_ratio_vs_bnb << "x"
+              << " (ref " << bnb_ref_bw << " GB/s)" << std::endl;
+    std::cout << "Output dtype: " << (output_type == OutputType::BF16 ? "bf16" : "fp16") << std::endl;
+
+    // 8. Write the output file.
+    std::ofstream outfile(output_file, std::ios::binary);
+    if (output_type == OutputType::BF16) {
+        outfile.write(reinterpret_cast<char*>(h_output_bf16.data()), num_elements * sizeof(__nv_bfloat16));
+    } else {
+        outfile.write(reinterpret_cast<char*>(h_output_fp16.data()), num_elements * sizeof(half));
+    }
+    outfile.close();
+    std::cout << "Output written to " << output_file << std::endl;
+
+    // Cleanup
+    cudaFree(d_packed);
+    cudaFree(d_absmax_q);
+    cudaFree(d_absmax2);
+    cudaFree(d_code2);
+    if (d_output_bf16) cudaFree(d_output_bf16);
+    if (d_output_fp16) cudaFree(d_output_fp16);
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+
+    return 0;
+}
\ No newline at end of file
diff --git a/03_nf4_dequant/ikko/mainla.maca b/03_nf4_dequant/ikko/mainla.maca
new file mode 100644
index 0000000..6acfc63
--- /dev/null
+++ b/03_nf4_dequant/ikko/mainla.maca
@@ -0,0 +1,224 @@
+#include <mcr/mc_runtime.h>
+#include <common/maca_fp16.h>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <cstring>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <chrono>
+#include <unistd.h>
+
+#if !defined(PLATFORM_METAX)
+#define PLATFORM_METAX
+#endif
+#include "../tester/utils.h"
+
+__constant__ float NF4_LUT[16] = {  // NF4 code -> value table; the decode kernel indexes it with each 4-bit code
+    -1.0f,  // code 0
+    -0.6961928009986877f,
+    -0.5250730514526367f,
+    -0.39491748809814453f,
+    -0.28444138169288635f,
+    -0.18477343022823334f,
+    -0.09105003625154495f,
+    0.0f,  // code 7: exact zero
+    0.07958029955625534f,
+    0.16093020141124725f,
+    0.24611230194568634f,
+    0.33791524171829224f,
+    0.44070982933044434f,
+    0.5626170039176941f,
+    0.7229568362236023f,
+    1.0f  // code 15
+};
+
+// Dequantizes NF4 double-quantized weights to fp16. Each packed byte holds two 4-bit codes and
+// produces two consecutive outputs (high nibble first).
+__global__ void nf4_decode_kernel_fp16(
+    const uint8_t* __restrict__ packed_weights,
+    const uint8_t* __restrict__ absmax_q,
+    const half* __restrict__ absmax2,
+    const half* __restrict__ code2,
+    float offset,
+    half* __restrict__ output,
+    int64_t num_elements,
+    int blocksize,
+    int group_size) {
+    const int bytes_per_block = blocksize / 2;  // two 4-bit codes per packed byte
+    const int64_t total_bytes = (num_elements + 1) / 2;
+    const int64_t step = static_cast<int64_t>(gridDim.x) * blockDim.x;
+    int64_t pos = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+
+    // Every thread starts at its global index and advances by the total thread count.
+    while (pos < total_bytes) {
+        const uint8_t byte = packed_weights[pos];
+        const int block_id = static_cast<int>(pos / bytes_per_block);
+        const int group_id = block_id / group_size;
+
+        // Per-block scale: absmax2[group] * code2[absmax_q[block]] + offset.
+        const float a2 = __half2float(absmax2[group_id]);
+        const float c2 = __half2float(code2[absmax_q[block_id]]);
+        const float real_absmax = a2 * c2 + offset;
+
+        // The bounds checks keep an odd element count from writing past the end of output.
+        const int64_t out_idx = pos * 2;
+        if (out_idx < num_elements) {
+            output[out_idx] = __float2half(NF4_LUT[(byte >> 4) & 0x0F] * real_absmax);
+        }
+        if (out_idx + 1 < num_elements) {
+            output[out_idx + 1] = __float2half(NF4_LUT[byte & 0x0F] * real_absmax);
+        }
+        pos += step;
+    }
+}
+
+// Loads an NF4 double-quantized weight file, dequantizes it to fp16 on the device, prints the average
+// kernel time and writes the raw fp16 result. Usage: <prog> [fp16] [output_file]; returns 0 or 1.
+int main(int argc, char** argv) {
+    std::string input_file = "03_nf4_dequant/ikko/data/weight_data.bin";
+    std::string output_file = "03_nf4_dequant/ikko/output_fp16_maca.bin";
+
+    // argv[1] only exists for CLI compatibility: anything other than "fp16" merely warns.
+    if (argc >= 2 && std::strcmp(argv[1], "fp16") != 0) {
+        std::cerr << "Warning: mainla.maca currently supports fp16 only, got '" << argv[1]
+                  << "'. Continue with fp16 output." << std::endl;
+    }
+    if (argc >= 3) output_file = argv[2];
+
+    std::ifstream infile(input_file, std::ios::binary);
+    if (!infile) {
+        char cwd[4096];
+        if (getcwd(cwd, sizeof(cwd)) != nullptr) {
+            std::cerr << "CWD: " << cwd << std::endl;
+        }
+        std::cerr << "Error: Cannot open input file: " << input_file << std::endl;
+        return 1;
+    }
+
+    // Header layout: int64 rows, int64 cols, int32 blocksize.
+    int64_t num_rows = 0;
+    int64_t num_cols = 0;
+    int32_t blocksize = 0;
+    infile.read(reinterpret_cast<char*>(&num_rows), sizeof(int64_t));
+    infile.read(reinterpret_cast<char*>(&num_cols), sizeof(int64_t));
+    infile.read(reinterpret_cast<char*>(&blocksize), sizeof(int32_t));
+    // Reject short/corrupt headers: blocksize <= 0 or odd would divide by zero or skew block mapping.
+    if (!infile || num_rows <= 0 || num_cols <= 0 || blocksize <= 0 || (blocksize % 2) != 0) {
+        std::cerr << "Error: Invalid header in input file: " << input_file << std::endl;
+        return 1;
+    }
+
+    std::streampos data_start = infile.tellg();
+    infile.seekg(0, std::ios::end);
+    int64_t file_size = static_cast<int64_t>(infile.tellg());
+    infile.seekg(data_start, std::ios::beg);
+    int64_t num_elements = num_rows * num_cols;
+    int64_t num_blocks = (num_elements + blocksize - 1) / blocksize;
+    size_t size_packed = static_cast<size_t>((num_elements + 1) / 2);
+    size_t size_absmax_q = static_cast<size_t>(num_blocks) * sizeof(uint8_t);
+    size_t size_code2 = 256 * sizeof(half);
+    size_t size_output = static_cast<size_t>(num_elements) * sizeof(half);
+
+    // The absmax2 entry count is not stored: infer it from the bytes left once every other
+    // section is accounted for, otherwise assume one entry per 256 quantization blocks.
+    int64_t num_groups = (num_blocks + 255) / 256;
+    int64_t header_size = static_cast<int64_t>(sizeof(int64_t) * 2 + sizeof(int32_t));
+    int64_t remaining = file_size - header_size -
+                        static_cast<int64_t>(size_packed + size_absmax_q + size_code2 + sizeof(float));
+    if (remaining > 0 && (remaining % static_cast<int64_t>(sizeof(half)) == 0)) {
+        num_groups = remaining / static_cast<int64_t>(sizeof(half));
+    }
+    size_t size_absmax2 = static_cast<size_t>(num_groups) * sizeof(half);
+
+    std::vector<uint8_t> h_packed(size_packed);
+    std::vector<uint8_t> h_absmax_q(static_cast<size_t>(num_blocks));
+    std::vector<half> h_absmax2(static_cast<size_t>(num_groups));
+    std::vector<half> h_code2(256);
+    float offset = 0.0f;
+
+    // Payload order: packed nibbles, absmax_q, absmax2, code2, offset.
+    infile.read(reinterpret_cast<char*>(h_packed.data()), size_packed);
+    infile.read(reinterpret_cast<char*>(h_absmax_q.data()), size_absmax_q);
+    infile.read(reinterpret_cast<char*>(h_absmax2.data()), size_absmax2);
+    infile.read(reinterpret_cast<char*>(h_code2.data()), size_code2);
+    infile.read(reinterpret_cast<char*>(&offset), sizeof(float));
+    if (!infile) {
+        // Once a read fails the stream stays failed, so this single check covers all five reads.
+        std::cerr << "Error: Input file is truncated: " << input_file << std::endl;
+        return 1;
+    }
+    infile.close();
+
+    uint8_t* d_packed = nullptr;
+    uint8_t* d_absmax_q = nullptr;
+    half* d_absmax2 = nullptr;
+    half* d_code2 = nullptr;
+    half* d_output = nullptr;
+    RUNTIME_CHECK(mcMalloc(&d_packed, size_packed));
+    RUNTIME_CHECK(mcMalloc(&d_absmax_q, size_absmax_q));
+    RUNTIME_CHECK(mcMalloc(&d_absmax2, size_absmax2));
+    RUNTIME_CHECK(mcMalloc(&d_code2, size_code2));
+    RUNTIME_CHECK(mcMalloc(&d_output, size_output));
+    RUNTIME_CHECK(mcMemcpy(d_packed, h_packed.data(), size_packed, mcMemcpyHostToDevice));
+    RUNTIME_CHECK(mcMemcpy(d_absmax_q, h_absmax_q.data(), size_absmax_q, mcMemcpyHostToDevice));
+    RUNTIME_CHECK(mcMemcpy(d_absmax2, h_absmax2.data(), size_absmax2, mcMemcpyHostToDevice));
+    RUNTIME_CHECK(mcMemcpy(d_code2, h_code2.data(), size_code2, mcMemcpyHostToDevice));
+
+    // At most 4096 blocks of 256 threads; the kernel loops to cover any bytes beyond that.
+    dim3 block_dim(256);
+    int64_t total_bytes = (num_elements + 1) / 2;
+    int64_t max_grid = (total_bytes + block_dim.x - 1) / block_dim.x;
+    int grid_x = (max_grid < 4096) ? static_cast<int>(max_grid) : 4096;
+    dim3 grid_dim(grid_x);
+    // Quantization blocks per absmax2 entry, rounded up so every block maps to a valid entry.
+    int group_size = static_cast<int>((num_blocks + num_groups - 1) / num_groups);
+
+    // Warm-up launch, checked on its own so launch errors surface before timing starts.
+    nf4_decode_kernel_fp16<<<grid_dim, block_dim>>>(
+        d_packed, d_absmax_q, d_absmax2, d_code2, offset, d_output, num_elements, blocksize, group_size);
+    RUNTIME_CHECK(mcGetLastError());
+    RUNTIME_CHECK(mcDeviceSynchronize());
+
+    // Timed section: host wall clock around `iters` back-to-back launches and one final sync.
+    const int iters = 100;
+    auto t0 = std::chrono::high_resolution_clock::now();
+    for (int i = 0; i < iters; ++i) {
+        nf4_decode_kernel_fp16<<<grid_dim, block_dim>>>(
+            d_packed, d_absmax_q, d_absmax2, d_code2, offset, d_output, num_elements, blocksize, group_size);
+    }
+    RUNTIME_CHECK(mcGetLastError());
+    RUNTIME_CHECK(mcDeviceSynchronize());
+    auto t1 = std::chrono::high_resolution_clock::now();
+    double milliseconds = std::chrono::duration<double, std::milli>(t1 - t0).count() / static_cast<double>(iters);
+
+    std::vector<half> h_output(static_cast<size_t>(num_elements));
+    RUNTIME_CHECK(mcMemcpy(h_output.data(), d_output, size_output, mcMemcpyDeviceToHost));
+
+    // Release device buffers now so the output-failure return below does not skip them.
+    RUNTIME_CHECK(mcFree(d_packed));
+    RUNTIME_CHECK(mcFree(d_absmax_q));
+    RUNTIME_CHECK(mcFree(d_absmax2));
+    RUNTIME_CHECK(mcFree(d_code2));
+    RUNTIME_CHECK(mcFree(d_output));
+
+    double total_io_bytes = static_cast<double>(size_packed + size_absmax_q + size_absmax2 + size_code2) +
+                            static_cast<double>(size_output);
+    double bandwidth = total_io_bytes / (milliseconds / 1000.0) / 1e9;
+    std::cout << "grid_x: " << grid_x << std::endl;
+    std::cout << "Kernel Time: " << milliseconds << " ms" << std::endl;
+    std::cout << "Effective Bandwidth (approx): " << bandwidth << " GB/s" << std::endl;
+    std::cout << "Output dtype: fp16" << std::endl;
+
+    std::ofstream outfile(output_file, std::ios::binary);
+    outfile.write(reinterpret_cast<char*>(h_output.data()), static_cast<std::streamsize>(size_output));
+    outfile.close();
+    if (!outfile) {
+        std::cerr << "Error: Cannot write output file: " << output_file << std::endl;
+        return 1;
+    }
+    std::cout << "Output written to " << output_file << std::endl;
+    return 0;
+}
diff --git a/03_nf4_dequant/ikko/mainla.mu b/03_nf4_dequant/ikko/mainla.mu
new file mode 100644
index 0000000..c1e7678
--- /dev/null
+++ b/03_nf4_dequant/ikko/mainla.mu
@@ -0,0 +1,240 @@
+#include <musa_runtime.h>
+#include <musa_fp16.h>
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <cstring>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <chrono>
+#include <unistd.h>
+
+__constant__ float NF4_LUT[16] = {  // NF4 code -> value table; the decode kernel copies it into shared memory
+    -1.0f,  // code 0
+    -0.6961928009986877f,
+    -0.5250730514526367f,
+    -0.39491748809814453f,
+    -0.28444138169288635f,
+    -0.18477343022823334f,
+    -0.09105003625154495f,
+    0.0f,  // code 7: exact zero
+    0.07958029955625534f,
+    0.16093020141124725f,
+    0.24611230194568634f,
+    0.33791524171829224f,
+    0.44070982933044434f,
+    0.5626170039176941f,
+    0.7229568362236023f,
+    1.0f  // code 15
+};
+
+void checkMusa(musaError_t result, const char* func, const char* file, int line) {
+    if (result != musaSuccess) {
+        std::cerr << "MUSA error at " << file << ":" << line << " code=" << result << " \"" << func << "\"\n";
+        std::cerr << "Error string: " << musaGetErrorString(result) << std::endl;
+        std::exit(99);
+    }
+}
+
+#define CHECK_MUSA(val) checkMusa((val), #val, __FILE__, __LINE__)
+
+__device__ __forceinline__ uint32_t pack_pair_to_u32(float v1, float v2) {  // raw bits of half2(v1, v2)
+    const half2 pair = __floats2half2_rn(v1, v2);
+    return *reinterpret_cast<const uint32_t*>(&pair);
+}
+
+// Dequantizes NF4 double-quantized weights to fp16, storing each byte's two results as one
+// 32-bit word; an odd trailing element is written separately by global thread 0.
+__global__ void nf4_decode_kernel_fp16(
+    const uint8_t* __restrict__ packed_weights,
+    const uint8_t* __restrict__ absmax_q,
+    const half* __restrict__ absmax2,
+    const half* __restrict__ code2,
+    float offset,
+    half* __restrict__ output,
+    int64_t num_elements,
+    int blocksize,
+    int group_size) {
+    // Stage the 16-entry NF4 table in shared memory once per thread block.
+    __shared__ float s_lut[16];
+    if (threadIdx.x < 16) {
+        s_lut[threadIdx.x] = NF4_LUT[threadIdx.x];
+    }
+    __syncthreads();
+
+    const int bytes_per_block = blocksize / 2;  // two 4-bit codes per packed byte
+    const int64_t full_pair_bytes = num_elements / 2;
+    const int64_t step = static_cast<int64_t>(gridDim.x) * blockDim.x;
+    const int64_t first = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+    uint32_t* const out_u32 = reinterpret_cast<uint32_t*>(output);
+
+    for (int64_t pos = first; pos < full_pair_bytes; pos += step) {
+        const uint8_t byte = packed_weights[pos];
+        const int block_id = static_cast<int>(pos / bytes_per_block);
+        const float real_absmax =
+            __half2float(absmax2[block_id / group_size]) * __half2float(code2[absmax_q[block_id]]) + offset;
+        out_u32[pos] = pack_pair_to_u32(s_lut[byte >> 4] * real_absmax, s_lut[byte & 0x0F] * real_absmax);
+    }
+
+    // Odd element count: the last byte carries a single value in its high nibble.
+    if (first == 0 && (num_elements & 1) != 0) {
+        const int64_t tail_byte = (num_elements + 1) / 2 - 1;
+        const uint8_t byte = packed_weights[tail_byte];
+        const int block_id = static_cast<int>(tail_byte / bytes_per_block);
+        const float real_absmax =
+            __half2float(absmax2[block_id / group_size]) * __half2float(code2[absmax_q[block_id]]) + offset;
+        output[num_elements - 1] = __float2half(s_lut[byte >> 4] * real_absmax);
+    }
+}
+
+// Loads an NF4 double-quantized weight file, dequantizes it to fp16 on the device, prints the average
+// kernel time and writes the raw fp16 result. Usage: <prog> [fp16] [output_file]; returns 0 or 1.
+int main(int argc, char** argv) {
+    std::string input_file = "03_nf4_dequant/ikko/data/weight_data.bin";
+    std::string output_file = "03_nf4_dequant/ikko/output_fp16_mu.bin";
+
+    // argv[1] only exists for CLI compatibility: anything other than "fp16" merely warns.
+    if (argc >= 2 && std::strcmp(argv[1], "fp16") != 0) {
+        std::cerr << "Warning: mainla.mu currently supports fp16 only, got '" << argv[1]
+                  << "'. Continue with fp16 output." << std::endl;
+    }
+    if (argc >= 3) output_file = argv[2];
+
+    std::ifstream infile(input_file, std::ios::binary);
+    if (!infile) {
+        char cwd[4096];
+        if (getcwd(cwd, sizeof(cwd)) != nullptr) {
+            std::cerr << "CWD: " << cwd << std::endl;
+        }
+        std::cerr << "Error: Cannot open input file: " << input_file << std::endl;
+        return 1;
+    }
+
+    // Header layout: int64 rows, int64 cols, int32 blocksize.
+    int64_t num_rows = 0;
+    int64_t num_cols = 0;
+    int32_t blocksize = 0;
+    infile.read(reinterpret_cast<char*>(&num_rows), sizeof(int64_t));
+    infile.read(reinterpret_cast<char*>(&num_cols), sizeof(int64_t));
+    infile.read(reinterpret_cast<char*>(&blocksize), sizeof(int32_t));
+    // Reject short/corrupt headers: blocksize <= 0 or odd would divide by zero or skew block mapping.
+    if (!infile || num_rows <= 0 || num_cols <= 0 || blocksize <= 0 || (blocksize % 2) != 0) {
+        std::cerr << "Error: Invalid header in input file: " << input_file << std::endl;
+        return 1;
+    }
+
+    std::streampos data_start = infile.tellg();
+    infile.seekg(0, std::ios::end);
+    int64_t file_size = static_cast<int64_t>(infile.tellg());
+    infile.seekg(data_start, std::ios::beg);
+    int64_t num_elements = num_rows * num_cols;
+    int64_t num_blocks = (num_elements + blocksize - 1) / blocksize;
+    size_t size_packed = static_cast<size_t>((num_elements + 1) / 2);
+    size_t size_packed_padded = (size_packed + 15) & ~static_cast<size_t>(15);  // round up to 16 bytes
+    size_t size_absmax_q = static_cast<size_t>(num_blocks) * sizeof(uint8_t);
+    size_t size_code2 = 256 * sizeof(half);
+    size_t size_output = static_cast<size_t>(num_elements) * sizeof(half);
+
+    // The absmax2 entry count is not stored: infer it from the bytes left once every other
+    // section is accounted for, otherwise assume one entry per 256 quantization blocks.
+    int64_t num_groups = (num_blocks + 255) / 256;
+    int64_t header_size = static_cast<int64_t>(sizeof(int64_t) * 2 + sizeof(int32_t));
+    int64_t remaining = file_size - header_size -
+                        static_cast<int64_t>(size_packed + size_absmax_q + size_code2 + sizeof(float));
+    if (remaining > 0 && (remaining % static_cast<int64_t>(sizeof(half)) == 0)) {
+        num_groups = remaining / static_cast<int64_t>(sizeof(half));
+    }
+    size_t size_absmax2 = static_cast<size_t>(num_groups) * sizeof(half);
+
+    std::vector<uint8_t> h_packed(size_packed_padded, 0);
+    std::vector<uint8_t> h_absmax_q(static_cast<size_t>(num_blocks));
+    std::vector<half> h_absmax2(static_cast<size_t>(num_groups));
+    std::vector<half> h_code2(256);
+    float offset = 0.0f;
+
+    // Payload order: packed nibbles, absmax_q, absmax2, code2, offset.
+    infile.read(reinterpret_cast<char*>(h_packed.data()), size_packed);
+    infile.read(reinterpret_cast<char*>(h_absmax_q.data()), size_absmax_q);
+    infile.read(reinterpret_cast<char*>(h_absmax2.data()), size_absmax2);
+    infile.read(reinterpret_cast<char*>(h_code2.data()), size_code2);
+    infile.read(reinterpret_cast<char*>(&offset), sizeof(float));
+    if (!infile) {
+        // Once a read fails the stream stays failed, so this single check covers all five reads.
+        std::cerr << "Error: Input file is truncated: " << input_file << std::endl;
+        return 1;
+    }
+    infile.close();
+
+    uint8_t* d_packed = nullptr;
+    uint8_t* d_absmax_q = nullptr;
+    half* d_absmax2 = nullptr;
+    half* d_code2 = nullptr;
+    half* d_output = nullptr;
+    CHECK_MUSA(musaMalloc(&d_packed, size_packed_padded));
+    CHECK_MUSA(musaMalloc(&d_absmax_q, size_absmax_q));
+    CHECK_MUSA(musaMalloc(&d_absmax2, size_absmax2));
+    CHECK_MUSA(musaMalloc(&d_code2, size_code2));
+    CHECK_MUSA(musaMalloc(&d_output, size_output));
+    CHECK_MUSA(musaMemcpy(d_packed, h_packed.data(), size_packed_padded, musaMemcpyHostToDevice));
+    CHECK_MUSA(musaMemcpy(d_absmax_q, h_absmax_q.data(), size_absmax_q, musaMemcpyHostToDevice));
+    CHECK_MUSA(musaMemcpy(d_absmax2, h_absmax2.data(), size_absmax2, musaMemcpyHostToDevice));
+    CHECK_MUSA(musaMemcpy(d_code2, h_code2.data(), size_code2, musaMemcpyHostToDevice));
+
+    // At most 4096 blocks of 256 threads; the kernel loops to cover any bytes beyond that.
+    dim3 block_dim(256);
+    int64_t total_bytes = (num_elements + 1) / 2;
+    int64_t max_grid = (total_bytes + block_dim.x - 1) / block_dim.x;
+    int grid_x = (max_grid < 4096) ? static_cast<int>(max_grid) : 4096;
+    dim3 grid_dim(grid_x);
+    // Quantization blocks per absmax2 entry, rounded up so every block maps to a valid entry.
+    int group_size = static_cast<int>((num_blocks + num_groups - 1) / num_groups);
+
+    // Warm-up launch, checked on its own so launch errors surface before timing starts.
+    nf4_decode_kernel_fp16<<<grid_dim, block_dim>>>(
+        d_packed, d_absmax_q, d_absmax2, d_code2, offset, d_output, num_elements, blocksize, group_size);
+    CHECK_MUSA(musaGetLastError());
+    CHECK_MUSA(musaDeviceSynchronize());
+
+    // Timed section: host wall clock around `iters` back-to-back launches and one final sync.
+    const int iters = 100;
+    auto t0 = std::chrono::high_resolution_clock::now();
+    for (int i = 0; i < iters; ++i) {
+        nf4_decode_kernel_fp16<<<grid_dim, block_dim>>>(
+            d_packed, d_absmax_q, d_absmax2, d_code2, offset, d_output, num_elements, blocksize, group_size);
+    }
+    CHECK_MUSA(musaGetLastError());
+    CHECK_MUSA(musaDeviceSynchronize());
+    auto t1 = std::chrono::high_resolution_clock::now();
+    double milliseconds = std::chrono::duration<double, std::milli>(t1 - t0).count() / static_cast<double>(iters);
+
+    std::vector<half> h_output(static_cast<size_t>(num_elements));
+    CHECK_MUSA(musaMemcpy(h_output.data(), d_output, size_output, musaMemcpyDeviceToHost));
+
+    // Release device buffers now so the output-failure return below does not skip them.
+    CHECK_MUSA(musaFree(d_packed));
+    CHECK_MUSA(musaFree(d_absmax_q));
+    CHECK_MUSA(musaFree(d_absmax2));
+    CHECK_MUSA(musaFree(d_code2));
+    CHECK_MUSA(musaFree(d_output));
+
+    double total_io_bytes = static_cast<double>(size_packed + size_absmax_q + size_absmax2 + size_code2) +
+                            static_cast<double>(size_output);
+    double bandwidth = total_io_bytes / (milliseconds / 1000.0) / 1e9;
+    std::cout << "grid_x: " << grid_x << std::endl;
+    std::cout << "Kernel Time: " << milliseconds << " ms" << std::endl;
+    std::cout << "Effective Bandwidth (approx): " << bandwidth << " GB/s" << std::endl;
+    std::cout << "Output dtype: fp16" << std::endl;
+
+    std::ofstream outfile(output_file, std::ios::binary);
+    outfile.write(reinterpret_cast<char*>(h_output.data()), static_cast<std::streamsize>(size_output));
+    outfile.close();
+    if (!outfile) {
+        std::cerr << "Error: Cannot write output file: " << output_file << std::endl;
+        return 1;
+    }
+    std::cout << "Output written to " << output_file << std::endl;
+    return 0;
+}
diff --git a/03_nf4_dequant/ikko/ncu_report.txt b/03_nf4_dequant/ikko/ncu_report.txt
new file mode 100644
index 0000000..e8b4f03
--- /dev/null
+++ b/03_nf4_dequant/ikko/ncu_report.txt
@@ -0,0 +1,5051 @@
+```shell
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:47, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.29
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,783,921
+    Memory [%]                                                                           %                          83.81
+    DRAM Throughput                                                                      %                          81.81
+    Duration                                                                       usecond                         794.30
+    L1/TEX Cache Throughput                                                              %                          41.80
+    L2 Cache Throughput                                                                  %                          83.81
+    SM Active Cycles                                                                 cycle                   1,707,265.78
+    Compute (SM) [%]                                                                     %                           9.20
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.95
+    Achieved Active Warps Per SM                                                      warp                          35.01
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.9%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:47, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.30
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,772,530
+    Memory [%]                                                                           %                          84.35
+    DRAM Throughput                                                                      %                          82.27
+    Duration                                                                       usecond                         789.44
+    L1/TEX Cache Throughput                                                              %                          42.15
+    L2 Cache Throughput                                                                  %                          84.35
+    SM Active Cycles                                                                 cycle                   1,699,883.62
+    Compute (SM) [%]                                                                     %                           9.28
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.93
+    Achieved Active Warps Per SM                                                      warp                          35.01
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.9%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:47, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.27
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,767,384
+    Memory [%]                                                                           %                          84.55
+    DRAM Throughput                                                                      %                          82.52
+    Duration                                                                       usecond                         789.34
+    L1/TEX Cache Throughput                                                              %                          42.24
+    L2 Cache Throughput                                                                  %                          84.55
+    SM Active Cycles                                                                 cycle                   1,709,933.30
+    Compute (SM) [%]                                                                     %                           9.30
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.14
+    Achieved Active Warps Per SM                                                      warp                          35.11
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.1%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:47, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.16
+    SM Frequency                                                             cycle/nsecond                           2.21
+    Elapsed Cycles                                                                   cycle                      1,760,429
+    Memory [%]                                                                           %                          84.92
+    DRAM Throughput                                                                      %                          82.83
+    Duration                                                                       usecond                         794.72
+    L1/TEX Cache Throughput                                                              %                          42.43
+    L2 Cache Throughput                                                                  %                          84.92
+    SM Active Cycles                                                                 cycle                   1,705,613.77
+    Compute (SM) [%]                                                                     %                           9.34
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.14
+    Achieved Active Warps Per SM                                                      warp                          35.11
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.1%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:47, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.27
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,786,082
+    Memory [%]                                                                           %                          83.68
+    DRAM Throughput                                                                      %                          81.64
+    Duration                                                                       usecond                         797.79
+    L1/TEX Cache Throughput                                                              %                          41.84
+    L2 Cache Throughput                                                                  %                          83.68
+    SM Active Cycles                                                                 cycle                   1,709,916.09
+    Compute (SM) [%]                                                                     %                           9.20
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.02
+    Achieved Active Warps Per SM                                                      warp                          35.05
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:47, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.28
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,782,712
+    Memory [%]                                                                           %                          83.86
+    DRAM Throughput                                                                      %                          81.75
+    Duration                                                                       usecond                         795.97
+    L1/TEX Cache Throughput                                                              %                          41.89
+    L2 Cache Throughput                                                                  %                          83.86
+    SM Active Cycles                                                                 cycle                   1,692,069.59
+    Compute (SM) [%]                                                                     %                           9.22
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.12
+    Achieved Active Warps Per SM                                                      warp                          35.10
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.1%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:47, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.25
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,784,529
+    Memory [%]                                                                           %                          83.78
+    DRAM Throughput                                                                      %                          81.78
+    Duration                                                                       usecond                         798.02
+    L1/TEX Cache Throughput                                                              %                          41.74
+    L2 Cache Throughput                                                                  %                          83.78
+    SM Active Cycles                                                                 cycle                   1,695,400.41
+    Compute (SM) [%]                                                                     %                           9.20
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.99
+    Achieved Active Warps Per SM                                                      warp                          35.03
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:47, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.28
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,783,199
+    Memory [%]                                                                           %                          83.85
+    DRAM Throughput                                                                      %                          81.84
+    Duration                                                                       usecond                         795.36
+    L1/TEX Cache Throughput                                                              %                          41.77
+    L2 Cache Throughput                                                                  %                          83.85
+    SM Active Cycles                                                                 cycle                   1,720,274.64
+    Compute (SM) [%]                                                                     %                           9.20
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.11
+    Achieved Active Warps Per SM                                                      warp                          35.09
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.1%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:48, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.25
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,777,917
+    Memory [%]                                                                           %                          84.16
+    DRAM Throughput                                                                      %                          82.10
+    Duration                                                                       usecond                         794.69
+    L1/TEX Cache Throughput                                                              %                          41.95
+    L2 Cache Throughput                                                                  %                          84.16
+    SM Active Cycles                                                                 cycle                   1,708,117.10
+    Compute (SM) [%]                                                                     %                           9.23
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.06
+    Achieved Active Warps Per SM                                                      warp                          35.07
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.1%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:48, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.26
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,777,689
+    Memory [%]                                                                           %                          84.21
+    DRAM Throughput                                                                      %                          82.10
+    Duration                                                                       usecond                         794.40
+    L1/TEX Cache Throughput                                                              %                          41.94
+    L2 Cache Throughput                                                                  %                          84.21
+    SM Active Cycles                                                                 cycle                   1,716,964.59
+    Compute (SM) [%]                                                                     %                           9.23
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.75
+    Achieved Active Warps Per SM                                                      warp                          34.92
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.8%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:48, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.29
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,771,893
+    Memory [%]                                                                           %                          84.42
+    DRAM Throughput                                                                      %                          82.39
+    Duration                                                                       usecond                         789.09
+    L1/TEX Cache Throughput                                                              %                          42.09
+    L2 Cache Throughput                                                                  %                          84.42
+    SM Active Cycles                                                                 cycle                   1,715,996.22
+    Compute (SM) [%]                                                                     %                           9.26
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.97
+    Achieved Active Warps Per SM                                                      warp                          35.02
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:48, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.15
+    SM Frequency                                                             cycle/nsecond                           2.21
+    Elapsed Cycles                                                                   cycle                      1,779,698
+    Memory [%]                                                                           %                          84.02
+    DRAM Throughput                                                                      %                          82.00
+    Duration                                                                       usecond                         803.46
+    L1/TEX Cache Throughput                                                              %                          41.86
+    L2 Cache Throughput                                                                  %                          84.02
+    SM Active Cycles                                                                 cycle                   1,700,522.24
+    Compute (SM) [%]                                                                     %                           9.22
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.87
+    Achieved Active Warps Per SM                                                      warp                          34.98
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.9%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:48, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.20
+    SM Frequency                                                             cycle/nsecond                           2.22
+    Elapsed Cycles                                                                   cycle                      1,777,023
+    Memory [%]                                                                           %                          84.16
+    DRAM Throughput                                                                      %                          82.12
+    Duration                                                                       usecond                         798.62
+    L1/TEX Cache Throughput                                                              %                          41.91
+    L2 Cache Throughput                                                                  %                          84.16
+    SM Active Cycles                                                                 cycle                   1,694,419.05
+    Compute (SM) [%]                                                                     %                           9.23
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.21
+    Achieved Active Warps Per SM                                                      warp                          35.14
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.2%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:48, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.26
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,778,816
+    Memory [%]                                                                           %                          84.15
+    DRAM Throughput                                                                      %                          82.02
+    Duration                                                                       usecond                         794.91
+    L1/TEX Cache Throughput                                                              %                          41.94
+    L2 Cache Throughput                                                                  %                          84.15
+    SM Active Cycles                                                                 cycle                   1,706,151.49
+    Compute (SM) [%]                                                                     %                           9.23
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.72
+    Achieved Active Warps Per SM                                                      warp                          34.91
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.7%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:48, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.12
+    SM Frequency                                                             cycle/nsecond                           2.21
+    Elapsed Cycles                                                                   cycle                      1,770,841
+    Memory [%]                                                                           %                          84.43
+    DRAM Throughput                                                                      %                          82.41
+    Duration                                                                       usecond                         802.11
+    L1/TEX Cache Throughput                                                              %                          42.09
+    L2 Cache Throughput                                                                  %                          84.43
+    SM Active Cycles                                                                 cycle                   1,708,276.97
+    Compute (SM) [%]                                                                     %                           9.27
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.83
+    Achieved Active Warps Per SM                                                      warp                          34.96
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.8%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:48, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.13
+    SM Frequency                                                             cycle/nsecond                           2.21
+    Elapsed Cycles                                                                   cycle                      1,754,791
+    Memory [%]                                                                           %                          85.21
+    DRAM Throughput                                                                      %                          83.03
+    Duration                                                                       usecond                         794.24
+    L1/TEX Cache Throughput                                                              %                          42.49
+    L2 Cache Throughput                                                                  %                          85.21
+    SM Active Cycles                                                                 cycle                   1,711,735.01
+    Compute (SM) [%]                                                                     %                           9.35
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.02
+    Achieved Active Warps Per SM                                                      warp                          35.05
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:48, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.15
+    SM Frequency                                                             cycle/nsecond                           2.21
+    Elapsed Cycles                                                                   cycle                      1,777,220
+    Memory [%]                                                                           %                          84.20
+    DRAM Throughput                                                                      %                          82.12
+    Duration                                                                       usecond                         802.21
+    L1/TEX Cache Throughput                                                              %                          41.91
+    L2 Cache Throughput                                                                  %                          84.20
+    SM Active Cycles                                                                 cycle                   1,704,338.13
+    Compute (SM) [%]                                                                     %                           9.23
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.88
+    Achieved Active Warps Per SM                                                      warp                          34.98
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.9%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:48, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.33
+    SM Frequency                                                             cycle/nsecond                           2.25
+    Elapsed Cycles                                                                   cycle                      1,782,247
+    Memory [%]                                                                           %                          83.99
+    DRAM Throughput                                                                      %                          81.90
+    Duration                                                                       usecond                         790.78
+    L1/TEX Cache Throughput                                                              %                          41.79
+    L2 Cache Throughput                                                                  %                          83.99
+    SM Active Cycles                                                                 cycle                   1,697,375.76
+    Compute (SM) [%]                                                                     %                           9.21
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.88
+    Achieved Active Warps Per SM                                                      warp                          34.98
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.9%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:48, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.27
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,774,262
+    Memory [%]                                                                           %                          84.24
+    DRAM Throughput                                                                      %                          82.28
+    Duration                                                                       usecond                         791.65
+    L1/TEX Cache Throughput                                                              %                          42.00
+    L2 Cache Throughput                                                                  %                          84.24
+    SM Active Cycles                                                                 cycle                   1,713,035.62
+    Compute (SM) [%]                                                                     %                           9.25
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.91
+    Achieved Active Warps Per SM                                                      warp                          35.00
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.9%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:48, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.08
+    SM Frequency                                                             cycle/nsecond                           2.20
+    Elapsed Cycles                                                                   cycle                      1,759,671
+    Memory [%]                                                                           %                          84.97
+    DRAM Throughput                                                                      %                          82.92
+    Duration                                                                       usecond                         799.97
+    L1/TEX Cache Throughput                                                              %                          42.33
+    L2 Cache Throughput                                                                  %                          84.97
+    SM Active Cycles                                                                 cycle                   1,722,472.24
+    Compute (SM) [%]                                                                     %                           9.33
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.00
+    Achieved Active Warps Per SM                                                      warp                          35.04
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:48, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.26
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,770,599
+    Memory [%]                                                                           %                          84.53
+    DRAM Throughput                                                                      %                          82.36
+    Duration                                                                       usecond                         790.88
+    L1/TEX Cache Throughput                                                              %                          42.08
+    L2 Cache Throughput                                                                  %                          84.53
+    SM Active Cycles                                                                 cycle                   1,733,918.75
+    Compute (SM) [%]                                                                     %                           9.27
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.54
+    Achieved Active Warps Per SM                                                      warp                          34.82
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.5%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:48, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.29
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,784,071
+    Memory [%]                                                                           %                          83.77
+    DRAM Throughput                                                                      %                          81.81
+    Duration                                                                       usecond                         795.04
+    L1/TEX Cache Throughput                                                              %                          41.75
+    L2 Cache Throughput                                                                  %                          83.77
+    SM Active Cycles                                                                 cycle                   1,708,494.95
+    Compute (SM) [%]                                                                     %                           9.20
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.98
+    Achieved Active Warps Per SM                                                      warp                          35.03
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:48, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.29
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,774,534
+    Memory [%]                                                                           %                          84.24
+    DRAM Throughput                                                                      %                          82.27
+    Duration                                                                       usecond                         790.30
+    L1/TEX Cache Throughput                                                              %                          42.00
+    L2 Cache Throughput                                                                  %                          84.24
+    SM Active Cycles                                                                 cycle                   1,701,005.85
+    Compute (SM) [%]                                                                     %                           9.25
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.15
+    Achieved Active Warps Per SM                                                      warp                          35.11
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.1%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:48, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.11
+    SM Frequency                                                             cycle/nsecond                           2.21
+    Elapsed Cycles                                                                   cycle                      1,764,482
+    Memory [%]                                                                           %                          84.71
+    DRAM Throughput                                                                      %                          82.68
+    Duration                                                                       usecond                         799.65
+    L1/TEX Cache Throughput                                                              %                          42.29
+    L2 Cache Throughput                                                                  %                          84.71
+    SM Active Cycles                                                                 cycle                   1,703,234.04
+    Compute (SM) [%]                                                                     %                           9.30
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.03
+    Achieved Active Warps Per SM                                                      warp                          35.05
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:48, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.09
+    SM Frequency                                                             cycle/nsecond                           2.20
+    Elapsed Cycles                                                                   cycle                      1,763,753
+    Memory [%]                                                                           %                          84.85
+    DRAM Throughput                                                                      %                          82.75
+    Duration                                                                       usecond                         800.99
+    L1/TEX Cache Throughput                                                              %                          42.25
+    L2 Cache Throughput                                                                  %                          84.85
+    SM Active Cycles                                                                 cycle                   1,700,330.09
+    Compute (SM) [%]                                                                     %                           9.30
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.94
+    Achieved Active Warps Per SM                                                      warp                          35.01
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.9%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:49, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.08
+    SM Frequency                                                             cycle/nsecond                           2.20
+    Elapsed Cycles                                                                   cycle                      1,756,891
+    Memory [%]                                                                           %                          85.16
+    DRAM Throughput                                                                      %                          83.05
+    Duration                                                                       usecond                         798.75
+    L1/TEX Cache Throughput                                                              %                          42.42
+    L2 Cache Throughput                                                                  %                          85.16
+    SM Active Cycles                                                                 cycle                   1,703,631.23
+    Compute (SM) [%]                                                                     %                           9.34
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.07
+    Achieved Active Warps Per SM                                                      warp                          35.07
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.1%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:49, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.21
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,769,343
+    Memory [%]                                                                           %                          84.57
+    DRAM Throughput                                                                      %                          82.49
+    Duration                                                                       usecond                         793.92
+    L1/TEX Cache Throughput                                                              %                          42.11
+    L2 Cache Throughput                                                                  %                          84.57
+    SM Active Cycles                                                                 cycle                   1,703,819.69
+    Compute (SM) [%]                                                                     %                           9.28
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.73
+    Achieved Active Warps Per SM                                                      warp                          34.91
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.7%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:49, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.29
+    SM Frequency                                                             cycle/nsecond                           2.22
+    Elapsed Cycles                                                                   cycle                      1,783,887
+    Memory [%]                                                                           %                          83.60
+    DRAM Throughput                                                                      %                          81.33
+    Duration                                                                       usecond                         799.68
+    L1/TEX Cache Throughput                                                              %                          41.90
+    L2 Cache Throughput                                                                  %                          83.60
+    SM Active Cycles                                                                 cycle                   1,689,866.03
+    Compute (SM) [%]                                                                     %                           9.23
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.24
+    Achieved Active Warps Per SM                                                      warp                          35.16
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.2%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:49, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.19
+    SM Frequency                                                             cycle/nsecond                           2.20
+    Elapsed Cycles                                                                   cycle                      1,765,145
+    Memory [%]                                                                           %                          84.51
+    DRAM Throughput                                                                      %                          82.22
+    Duration                                                                       usecond                         797.95
+    L1/TEX Cache Throughput                                                              %                          42.35
+    L2 Cache Throughput                                                                  %                          84.51
+    SM Active Cycles                                                                 cycle                   1,685,987.81
+    Compute (SM) [%]                                                                     %                           9.32
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.00
+    Achieved Active Warps Per SM                                                      warp                          35.04
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:49, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.14
+    SM Frequency                                                             cycle/nsecond                           2.20
+    Elapsed Cycles                                                                   cycle                      1,766,425
+    Memory [%]                                                                           %                          84.65
+    DRAM Throughput                                                                      %                          82.51
+    Duration                                                                       usecond                         799.55
+    L1/TEX Cache Throughput                                                              %                          42.29
+    L2 Cache Throughput                                                                  %                          84.65
+    SM Active Cycles                                                                 cycle                   1,699,142.89
+    Compute (SM) [%]                                                                     %                           9.31
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.99
+    Achieved Active Warps Per SM                                                      warp                          35.03
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:49, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.25
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,773,542
+    Memory [%]                                                                           %                          84.23
+    DRAM Throughput                                                                      %                          82.17
+    Duration                                                                       usecond                         793.82
+    L1/TEX Cache Throughput                                                              %                          42.09
+    L2 Cache Throughput                                                                  %                          84.23
+    SM Active Cycles                                                                 cycle                   1,698,140.84
+    Compute (SM) [%]                                                                     %                           9.27
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.03
+    Achieved Active Warps Per SM                                                      warp                          35.06
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:49, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.41
+    SM Frequency                                                             cycle/nsecond                           2.26
+    Elapsed Cycles                                                                   cycle                      1,799,017
+    Memory [%]                                                                           %                          83.10
+    DRAM Throughput                                                                      %                          81.05
+    Duration                                                                       usecond                         793.15
+    L1/TEX Cache Throughput                                                              %                          41.52
+    L2 Cache Throughput                                                                  %                          83.10
+    SM Active Cycles                                                                 cycle                   1,704,204.10
+    Compute (SM) [%]                                                                     %                           9.14
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.92
+    Achieved Active Warps Per SM                                                      warp                          35.00
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.9%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:49, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.11
+    SM Frequency                                                             cycle/nsecond                           2.21
+    Elapsed Cycles                                                                   cycle                      1,761,121
+    Memory [%]                                                                           %                          84.91
+    DRAM Throughput                                                                      %                          82.89
+    Duration                                                                       usecond                         798.24
+    L1/TEX Cache Throughput                                                              %                          42.30
+    L2 Cache Throughput                                                                  %                          84.91
+    SM Active Cycles                                                                 cycle                   1,713,465.43
+    Compute (SM) [%]                                                                     %                           9.32
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.76
+    Achieved Active Warps Per SM                                                      warp                          34.93
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.8%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:49, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.21
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,780,830
+    Memory [%]                                                                           %                          84.02
+    DRAM Throughput                                                                      %                          81.93
+    Duration                                                                       usecond                         799.68
+    L1/TEX Cache Throughput                                                              %                          41.86
+    L2 Cache Throughput                                                                  %                          84.02
+    SM Active Cycles                                                                 cycle                   1,705,064.41
+    Compute (SM) [%]                                                                     %                           9.21
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.09
+    Achieved Active Warps Per SM                                                      warp                          35.08
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.1%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:49, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.21
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,774,889
+    Memory [%]                                                                           %                          84.33
+    DRAM Throughput                                                                      %                          82.19
+    Duration                                                                       usecond                         796.96
+    L1/TEX Cache Throughput                                                              %                          42.00
+    L2 Cache Throughput                                                                  %                          84.33
+    SM Active Cycles                                                                 cycle                   1,702,487.20
+    Compute (SM) [%]                                                                     %                           9.24
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.82
+    Achieved Active Warps Per SM                                                      warp                          34.95
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.8%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:49, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.17
+    SM Frequency                                                             cycle/nsecond                           2.22
+    Elapsed Cycles                                                                   cycle                      1,767,706
+    Memory [%]                                                                           %                          84.61
+    DRAM Throughput                                                                      %                          82.50
+    Duration                                                                       usecond                         796.54
+    L1/TEX Cache Throughput                                                              %                          42.15
+    L2 Cache Throughput                                                                  %                          84.61
+    SM Active Cycles                                                                 cycle                   1,717,969.28
+    Compute (SM) [%]                                                                     %                           9.28
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.84
+    Achieved Active Warps Per SM                                                      warp                          34.96
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.8%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:49, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.15
+    SM Frequency                                                             cycle/nsecond                           2.21
+    Elapsed Cycles                                                                   cycle                      1,771,567
+    Memory [%]                                                                           %                          84.45
+    DRAM Throughput                                                                      %                          82.40
+    Duration                                                                       usecond                         800.16
+    L1/TEX Cache Throughput                                                              %                          42.11
+    L2 Cache Throughput                                                                  %                          84.45
+    SM Active Cycles                                                                 cycle                   1,699,681.70
+    Compute (SM) [%]                                                                     %                           9.26
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.04
+    Achieved Active Warps Per SM                                                      warp                          35.06
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:49, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.29
+    SM Frequency                                                             cycle/nsecond                           2.25
+    Elapsed Cycles                                                                   cycle                      1,782,897
+    Memory [%]                                                                           %                          83.85
+    DRAM Throughput                                                                      %                          81.87
+    Duration                                                                       usecond                         793.95
+    L1/TEX Cache Throughput                                                              %                          41.80
+    L2 Cache Throughput                                                                  %                          83.85
+    SM Active Cycles                                                                 cycle                   1,709,269.09
+    Compute (SM) [%]                                                                     %                           9.20
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.93
+    Achieved Active Warps Per SM                                                      warp                          35.01
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.9%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:49, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.26
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,773,328
+    Memory [%]                                                                           %                          84.32
+    DRAM Throughput                                                                      %                          82.30
+    Duration                                                                       usecond                         791.87
+    L1/TEX Cache Throughput                                                              %                          42.02
+    L2 Cache Throughput                                                                  %                          84.32
+    SM Active Cycles                                                                 cycle                   1,699,452.32
+    Compute (SM) [%]                                                                     %                           9.25
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.12
+    Achieved Active Warps Per SM                                                      warp                          35.10
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.1%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:49, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.18
+    SM Frequency                                                             cycle/nsecond                           2.22
+    Elapsed Cycles                                                                   cycle                      1,763,737
+    Memory [%]                                                                           %                          84.86
+    DRAM Throughput                                                                      %                          82.67
+    Duration                                                                       usecond                         793.79
+    L1/TEX Cache Throughput                                                              %                          42.25
+    L2 Cache Throughput                                                                  %                          84.86
+    SM Active Cycles                                                                 cycle                   1,705,623.02
+    Compute (SM) [%]                                                                     %                           9.30
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.99
+    Achieved Active Warps Per SM                                                      warp                          35.03
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:49, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.25
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,776,957
+    Memory [%]                                                                           %                          84.17
+    DRAM Throughput                                                                      %                          81.93
+    Duration                                                                       usecond                         794.66
+    L1/TEX Cache Throughput                                                              %                          41.96
+    L2 Cache Throughput                                                                  %                          84.17
+    SM Active Cycles                                                                 cycle                   1,725,193.72
+    Compute (SM) [%]                                                                     %                           9.24
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.62
+    Achieved Active Warps Per SM                                                      warp                          34.86
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.6%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:49, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.20
+    SM Frequency                                                             cycle/nsecond                           2.22
+    Elapsed Cycles                                                                   cycle                      1,780,655
+    Memory [%]                                                                           %                          84.00
+    DRAM Throughput                                                                      %                          81.97
+    Duration                                                                       usecond                         800.48
+    L1/TEX Cache Throughput                                                              %                          41.84
+    L2 Cache Throughput                                                                  %                          84.00
+    SM Active Cycles                                                                 cycle                   1,714,244.70
+    Compute (SM) [%]                                                                     %                           9.22
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.99
+    Achieved Active Warps Per SM                                                      warp                          35.04
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:49, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.25
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,766,287
+    Memory [%]                                                                           %                          84.69
+    DRAM Throughput                                                                      %                          82.49
+    Duration                                                                       usecond                         789.63
+    L1/TEX Cache Throughput                                                              %                          42.18
+    L2 Cache Throughput                                                                  %                          84.69
+    SM Active Cycles                                                                 cycle                   1,710,521.86
+    Compute (SM) [%]                                                                     %                           9.29
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.80
+    Achieved Active Warps Per SM                                                      warp                          34.94
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.8%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:50, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.17
+    SM Frequency                                                             cycle/nsecond                           2.22
+    Elapsed Cycles                                                                   cycle                      1,784,939
+    Memory [%]                                                                           %                          83.73
+    DRAM Throughput                                                                      %                          81.77
+    Duration                                                                       usecond                         804.61
+    L1/TEX Cache Throughput                                                              %                          41.74
+    L2 Cache Throughput                                                                  %                          83.73
+    SM Active Cycles                                                                 cycle                   1,722,849.51
+    Compute (SM) [%]                                                                     %                           9.19
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.07
+    Achieved Active Warps Per SM                                                      warp                          35.07
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.1%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:50, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.23
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,772,852
+    Memory [%]                                                                           %                          84.32
+    DRAM Throughput                                                                      %                          82.28
+    Duration                                                                       usecond                         794.34
+    L1/TEX Cache Throughput                                                              %                          42.04
+    L2 Cache Throughput                                                                  %                          84.32
+    SM Active Cycles                                                                 cycle                   1,700,840.83
+    Compute (SM) [%]                                                                     %                           9.26
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.17
+    Achieved Active Warps Per SM                                                      warp                          35.12
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.2%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:50, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.31
+    SM Frequency                                                             cycle/nsecond                           2.25
+    Elapsed Cycles                                                                   cycle                      1,778,877
+    Memory [%]                                                                           %                          84.04
+    DRAM Throughput                                                                      %                          82.06
+    Duration                                                                       usecond                         790.82
+    L1/TEX Cache Throughput                                                              %                          41.89
+    L2 Cache Throughput                                                                  %                          84.04
+    SM Active Cycles                                                                 cycle                   1,708,369.62
+    Compute (SM) [%]                                                                     %                           9.22
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.13
+    Achieved Active Warps Per SM                                                      warp                          35.10
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.1%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:50, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.25
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,770,828
+    Memory [%]                                                                           %                          84.53
+    DRAM Throughput                                                                      %                          82.40
+    Duration                                                                       usecond                         791.90
+    L1/TEX Cache Throughput                                                              %                          42.08
+    L2 Cache Throughput                                                                  %                          84.53
+    SM Active Cycles                                                                 cycle                   1,711,479.29
+    Compute (SM) [%]                                                                     %                           9.27
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.04
+    Achieved Active Warps Per SM                                                      warp                          35.06
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:50, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.16
+    SM Frequency                                                             cycle/nsecond                           2.22
+    Elapsed Cycles                                                                   cycle                      1,782,681
+    Memory [%]                                                                           %                          83.96
+    DRAM Throughput                                                                      %                          81.87
+    Duration                                                                       usecond                         803.74
+    L1/TEX Cache Throughput                                                              %                          41.81
+    L2 Cache Throughput                                                                  %                          83.96
+    SM Active Cycles                                                                 cycle                   1,705,371.77
+    Compute (SM) [%]                                                                     %                           9.21
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.97
+    Achieved Active Warps Per SM                                                      warp                          35.03
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:50, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.22
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,770,715
+    Memory [%]                                                                           %                          84.45
+    DRAM Throughput                                                                      %                          82.41
+    Duration                                                                       usecond                         793.92
+    L1/TEX Cache Throughput                                                              %                          42.12
+    L2 Cache Throughput                                                                  %                          84.45
+    SM Active Cycles                                                                 cycle                   1,711,417.78
+    Compute (SM) [%]                                                                     %                           9.27
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.13
+    Achieved Active Warps Per SM                                                      warp                          35.10
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.1%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:50, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.30
+    SM Frequency                                                             cycle/nsecond                           2.25
+    Elapsed Cycles                                                                   cycle                      1,785,654
+    Memory [%]                                                                           %                          83.71
+    DRAM Throughput                                                                      %                          81.77
+    Duration                                                                       usecond                         794.27
+    L1/TEX Cache Throughput                                                              %                          41.75
+    L2 Cache Throughput                                                                  %                          83.71
+    SM Active Cycles                                                                 cycle                   1,722,500.21
+    Compute (SM) [%]                                                                     %                           9.19
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.12
+    Achieved Active Warps Per SM                                                      warp                          35.10
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.1%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:50, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.15
+    SM Frequency                                                             cycle/nsecond                           2.21
+    Elapsed Cycles                                                                   cycle                      1,752,615
+    Memory [%]                                                                           %                          85.32
+    DRAM Throughput                                                                      %                          83.20
+    Duration                                                                       usecond                         791.23
+    L1/TEX Cache Throughput                                                              %                          42.51
+    L2 Cache Throughput                                                                  %                          85.32
+    SM Active Cycles                                                                 cycle                   1,701,769.79
+    Compute (SM) [%]                                                                     %                           9.36
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.80
+    Achieved Active Warps Per SM                                                      warp                          34.94
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.8%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:50, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.31
+    SM Frequency                                                             cycle/nsecond                           2.25
+    Elapsed Cycles                                                                   cycle                      1,777,620
+    Memory [%]                                                                           %                          84.13
+    DRAM Throughput                                                                      %                          82.11
+    Duration                                                                       usecond                         790.46
+    L1/TEX Cache Throughput                                                              %                          41.97
+    L2 Cache Throughput                                                                  %                          84.13
+    SM Active Cycles                                                                 cycle                   1,706,438.66
+    Compute (SM) [%]                                                                     %                           9.23
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.06
+    Achieved Active Warps Per SM                                                      warp                          35.07
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.1%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:50, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.23
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,775,317
+    Memory [%]                                                                           %                          84.24
+    DRAM Throughput                                                                      %                          82.21
+    Duration                                                                       usecond                         795.23
+    L1/TEX Cache Throughput                                                              %                          41.97
+    L2 Cache Throughput                                                                  %                          84.24
+    SM Active Cycles                                                                 cycle                   1,703,559.45
+    Compute (SM) [%]                                                                     %                           9.24
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.21
+    Achieved Active Warps Per SM                                                      warp                          35.14
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.2%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:50, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.20
+    SM Frequency                                                             cycle/nsecond                           2.22
+    Elapsed Cycles                                                                   cycle                      1,775,640
+    Memory [%]                                                                           %                          84.25
+    DRAM Throughput                                                                      %                          82.19
+    Duration                                                                       usecond                         798.21
+    L1/TEX Cache Throughput                                                              %                          41.98
+    L2 Cache Throughput                                                                  %                          84.25
+    SM Active Cycles                                                                 cycle                   1,697,484.91
+    Compute (SM) [%]                                                                     %                           9.24
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.15
+    Achieved Active Warps Per SM                                                      warp                          35.11
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.2%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:50, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.25
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,767,237
+    Memory [%]                                                                           %                          84.64
+    DRAM Throughput                                                                      %                          82.57
+    Duration                                                                       usecond                         790.14
+    L1/TEX Cache Throughput                                                              %                          42.17
+    L2 Cache Throughput                                                                  %                          84.64
+    SM Active Cycles                                                                 cycle                   1,727,770.23
+    Compute (SM) [%]                                                                     %                           9.29
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.89
+    Achieved Active Warps Per SM                                                      warp                          34.99
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.9%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:50, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.27
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,777,519
+    Memory [%]                                                                           %                          84.18
+    DRAM Throughput                                                                      %                          82.11
+    Duration                                                                       usecond                         793.22
+    L1/TEX Cache Throughput                                                              %                          41.94
+    L2 Cache Throughput                                                                  %                          84.18
+    SM Active Cycles                                                                 cycle                   1,695,667.48
+    Compute (SM) [%]                                                                     %                           9.23
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.26
+    Achieved Active Warps Per SM                                                      warp                          35.16
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.3%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:50, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.22
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,778,392
+    Memory [%]                                                                           %                          84.05
+    DRAM Throughput                                                                      %                          82.06
+    Duration                                                                       usecond                         797.98
+    L1/TEX Cache Throughput                                                              %                          41.95
+    L2 Cache Throughput                                                                  %                          84.05
+    SM Active Cycles                                                                 cycle                   1,711,383.60
+    Compute (SM) [%]                                                                     %                           9.23
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.92
+    Achieved Active Warps Per SM                                                      warp                          35.00
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.9%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:50, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.20
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,773,335
+    Memory [%]                                                                           %                          84.35
+    DRAM Throughput                                                                      %                          82.22
+    Duration                                                                       usecond                         796.54
+    L1/TEX Cache Throughput                                                              %                          42.03
+    L2 Cache Throughput                                                                  %                          84.35
+    SM Active Cycles                                                                 cycle                   1,699,665.90
+    Compute (SM) [%]                                                                     %                           9.25
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.89
+    Achieved Active Warps Per SM                                                      warp                          34.99
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.9%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:50, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.22
+    SM Frequency                                                             cycle/nsecond                           2.21
+    Elapsed Cycles                                                                   cycle                      1,774,833
+    Memory [%]                                                                           %                          84.10
+    DRAM Throughput                                                                      %                          81.80
+    Duration                                                                       usecond                         800.38
+    L1/TEX Cache Throughput                                                              %                          42.11
+    L2 Cache Throughput                                                                  %                          84.10
+    SM Active Cycles                                                                 cycle                   1,684,571.45
+    Compute (SM) [%]                                                                     %                           9.28
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.88
+    Achieved Active Warps Per SM                                                      warp                          34.98
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.9%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:50, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.18
+    SM Frequency                                                             cycle/nsecond                           2.20
+    Elapsed Cycles                                                                   cycle                      1,757,012
+    Memory [%]                                                                           %                          84.84
+    DRAM Throughput                                                                      %                          82.59
+    Duration                                                                       usecond                         795.10
+    L1/TEX Cache Throughput                                                              %                          42.61
+    L2 Cache Throughput                                                                  %                          84.84
+    SM Active Cycles                                                                 cycle                   1,690,701.17
+    Compute (SM) [%]                                                                     %                           9.37
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.93
+    Achieved Active Warps Per SM                                                      warp                          35.01
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.9%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:50, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.28
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,779,817
+    Memory [%]                                                                           %                          83.94
+    DRAM Throughput                                                                      %                          81.91
+    Duration                                                                       usecond                         794.82
+    L1/TEX Cache Throughput                                                              %                          41.95
+    L2 Cache Throughput                                                                  %                          83.94
+    SM Active Cycles                                                                 cycle                   1,698,823.51
+    Compute (SM) [%]                                                                     %                           9.24
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.78
+    Achieved Active Warps Per SM                                                      warp                          34.94
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.8%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:50, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.28
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,777,105
+    Memory [%]                                                                           %                          84.14
+    DRAM Throughput                                                                      %                          82.12
+    Duration                                                                       usecond                         792.96
+    L1/TEX Cache Throughput                                                              %                          42.04
+    L2 Cache Throughput                                                                  %                          84.14
+    SM Active Cycles                                                                 cycle                   1,693,662.16
+    Compute (SM) [%]                                                                     %                           9.25
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.38
+    Achieved Active Warps Per SM                                                      warp                          35.22
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.4%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:51, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.32
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,780,194
+    Memory [%]                                                                           %                          83.88
+    DRAM Throughput                                                                      %                          81.86
+    Duration                                                                       usecond                         791.36
+    L1/TEX Cache Throughput                                                              %                          41.98
+    L2 Cache Throughput                                                                  %                          83.88
+    SM Active Cycles                                                                 cycle                   1,698,912.73
+    Compute (SM) [%]                                                                     %                           9.24
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.04
+    Achieved Active Warps Per SM                                                      warp                          35.06
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:51, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.30
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,780,258
+    Memory [%]                                                                           %                          83.98
+    DRAM Throughput                                                                      %                          81.91
+    Duration                                                                       usecond                         793.31
+    L1/TEX Cache Throughput                                                              %                          41.95
+    L2 Cache Throughput                                                                  %                          83.98
+    SM Active Cycles                                                                 cycle                   1,718,252.30
+    Compute (SM) [%]                                                                     %                           9.24
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.01
+    Achieved Active Warps Per SM                                                      warp                          35.05
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:51, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.32
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,782,033
+    Memory [%]                                                                           %                          83.91
+    DRAM Throughput                                                                      %                          81.82
+    Duration                                                                       usecond                         792.51
+    L1/TEX Cache Throughput                                                              %                          41.89
+    L2 Cache Throughput                                                                  %                          83.91
+    SM Active Cycles                                                                 cycle                   1,694,315.37
+    Compute (SM) [%]                                                                     %                           9.23
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.97
+    Achieved Active Warps Per SM                                                      warp                          35.02
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:51, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.16
+    SM Frequency                                                             cycle/nsecond                           2.22
+    Elapsed Cycles                                                                   cycle                      1,766,427
+    Memory [%]                                                                           %                          84.61
+    DRAM Throughput                                                                      %                          82.63
+    Duration                                                                       usecond                         796.99
+    L1/TEX Cache Throughput                                                              %                          42.18
+    L2 Cache Throughput                                                                  %                          84.61
+    SM Active Cycles                                                                 cycle                   1,682,914.27
+    Compute (SM) [%]                                                                     %                           9.29
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.16
+    Achieved Active Warps Per SM                                                      warp                          35.12
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.2%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:51, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.19
+    SM Frequency                                                             cycle/nsecond                           2.22
+    Elapsed Cycles                                                                   cycle                      1,767,233
+    Memory [%]                                                                           %                          84.65
+    DRAM Throughput                                                                      %                          82.59
+    Duration                                                                       usecond                         794.91
+    L1/TEX Cache Throughput                                                              %                          42.20
+    L2 Cache Throughput                                                                  %                          84.65
+    SM Active Cycles                                                                 cycle                   1,717,799.90
+    Compute (SM) [%]                                                                     %                           9.29
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.81
+    Achieved Active Warps Per SM                                                      warp                          34.95
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.8%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:51, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.27
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,781,387
+    Memory [%]                                                                           %                          83.98
+    DRAM Throughput                                                                      %                          81.94
+    Duration                                                                       usecond                         794.82
+    L1/TEX Cache Throughput                                                              %                          41.86
+    L2 Cache Throughput                                                                  %                          83.98
+    SM Active Cycles                                                                 cycle                   1,709,151.03
+    Compute (SM) [%]                                                                     %                           9.21
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.82
+    Achieved Active Warps Per SM                                                      warp                          34.96
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.8%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:51, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.18
+    SM Frequency                                                             cycle/nsecond                           2.22
+    Elapsed Cycles                                                                   cycle                      1,773,419
+    Memory [%]                                                                           %                          84.36
+    DRAM Throughput                                                                      %                          82.28
+    Duration                                                                       usecond                         798.59
+    L1/TEX Cache Throughput                                                              %                          42.03
+    L2 Cache Throughput                                                                  %                          84.36
+    SM Active Cycles                                                                 cycle                   1,703,497.02
+    Compute (SM) [%]                                                                     %                           9.25
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.92
+    Achieved Active Warps Per SM                                                      warp                          35.00
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.9%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:51, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.26
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,786,644
+    Memory [%]                                                                           %                          83.67
+    DRAM Throughput                                                                      %                          81.70
+    Duration                                                                       usecond                         798.27
+    L1/TEX Cache Throughput                                                              %                          41.71
+    L2 Cache Throughput                                                                  %                          83.67
+    SM Active Cycles                                                                 cycle                   1,696,600.74
+    Compute (SM) [%]                                                                     %                           9.19
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.06
+    Achieved Active Warps Per SM                                                      warp                          35.07
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.1%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:51, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.17
+    SM Frequency                                                             cycle/nsecond                           2.22
+    Elapsed Cycles                                                                   cycle                      1,767,842
+    Memory [%]                                                                           %                          84.64
+    DRAM Throughput                                                                      %                          82.42
+    Duration                                                                       usecond                         796.86
+    L1/TEX Cache Throughput                                                              %                          42.15
+    L2 Cache Throughput                                                                  %                          84.64
+    SM Active Cycles                                                                 cycle                   1,707,926.27
+    Compute (SM) [%]                                                                     %                           9.28
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.02
+    Achieved Active Warps Per SM                                                      warp                          35.05
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:51, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.31
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,774,485
+    Memory [%]                                                                           %                          84.06
+    DRAM Throughput                                                                      %                          81.82
+    Duration                                                                       usecond                         792.80
+    L1/TEX Cache Throughput                                                              %                          42.14
+    L2 Cache Throughput                                                                  %                          84.06
+    SM Active Cycles                                                                 cycle                   1,691,359.18
+    Compute (SM) [%]                                                                     %                           9.28
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.83
+    Achieved Active Warps Per SM                                                      warp                          34.96
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.8%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:51, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.30
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,774,864
+    Memory [%]                                                                           %                          84.07
+    DRAM Throughput                                                                      %                          81.84
+    Duration                                                                       usecond                         793.09
+    L1/TEX Cache Throughput                                                              %                          42.19
+    L2 Cache Throughput                                                                  %                          84.07
+    SM Active Cycles                                                                 cycle                   1,688,350.28
+    Compute (SM) [%]                                                                     %                           9.29
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.86
+    Achieved Active Warps Per SM                                                      warp                          34.97
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.9%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:51, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.16
+    SM Frequency                                                             cycle/nsecond                           2.22
+    Elapsed Cycles                                                                   cycle                      1,772,425
+    Memory [%]                                                                           %                          84.42
+    DRAM Throughput                                                                      %                          82.24
+    Duration                                                                       usecond                         799.36
+    L1/TEX Cache Throughput                                                              %                          42.03
+    L2 Cache Throughput                                                                  %                          84.42
+    SM Active Cycles                                                                 cycle                   1,695,587.32
+    Compute (SM) [%]                                                                     %                           9.26
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.98
+    Achieved Active Warps Per SM                                                      warp                          35.03
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:51, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.28
+    SM Frequency                                                             cycle/nsecond                           2.22
+    Elapsed Cycles                                                                   cycle                      1,784,447
+    Memory [%]                                                                           %                          83.49
+    DRAM Throughput                                                                      %                          81.32
+    Duration                                                                       usecond                         800.19
+    L1/TEX Cache Throughput                                                              %                          41.92
+    L2 Cache Throughput                                                                  %                          83.49
+    SM Active Cycles                                                                 cycle                   1,706,873.40
+    Compute (SM) [%]                                                                     %                           9.23
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.03
+    Achieved Active Warps Per SM                                                      warp                          35.06
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:51, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.10
+    SM Frequency                                                             cycle/nsecond                           2.18
+    Elapsed Cycles                                                                   cycle                      1,756,646
+    Memory [%]                                                                           %                          84.91
+    DRAM Throughput                                                                      %                          82.62
+    Duration                                                                       usecond                         801.54
+    L1/TEX Cache Throughput                                                              %                          42.61
+    L2 Cache Throughput                                                                  %                          84.91
+    SM Active Cycles                                                                 cycle                   1,689,021.83
+    Compute (SM) [%]                                                                     %                           9.38
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.82
+    Achieved Active Warps Per SM                                                      warp                          34.95
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.8%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:51, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.02
+    SM Frequency                                                             cycle/nsecond                           2.19
+    Elapsed Cycles                                                                   cycle                      1,757,450
+    Memory [%]                                                                           %                          85.07
+    DRAM Throughput                                                                      %                          82.98
+    Duration                                                                       usecond                         803.68
+    L1/TEX Cache Throughput                                                              %                          42.41
+    L2 Cache Throughput                                                                  %                          85.07
+    SM Active Cycles                                                                 cycle                   1,714,182.38
+    Compute (SM) [%]                                                                     %                           9.34
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.69
+    Achieved Active Warps Per SM                                                      warp                          34.89
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.7%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:51, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.16
+    SM Frequency                                                             cycle/nsecond                           2.22
+    Elapsed Cycles                                                                   cycle                      1,780,359
+    Memory [%]                                                                           %                          84.00
+    DRAM Throughput                                                                      %                          81.89
+    Duration                                                                       usecond                         803.14
+    L1/TEX Cache Throughput                                                              %                          41.91
+    L2 Cache Throughput                                                                  %                          84.00
+    SM Active Cycles                                                                 cycle                   1,689,127.79
+    Compute (SM) [%]                                                                     %                           9.22
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.98
+    Achieved Active Warps Per SM                                                      warp                          35.03
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:51, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.25
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,778,013
+    Memory [%]                                                                           %                          84.15
+    DRAM Throughput                                                                      %                          82.09
+    Duration                                                                       usecond                         795.10
+    L1/TEX Cache Throughput                                                              %                          41.93
+    L2 Cache Throughput                                                                  %                          84.15
+    SM Active Cycles                                                                 cycle                   1,710,770.48
+    Compute (SM) [%]                                                                     %                           9.23
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.95
+    Achieved Active Warps Per SM                                                      warp                          35.02
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:51, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.26
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,777,156
+    Memory [%]                                                                           %                          84.19
+    DRAM Throughput                                                                      %                          82.08
+    Duration                                                                       usecond                         793.73
+    L1/TEX Cache Throughput                                                              %                          41.92
+    L2 Cache Throughput                                                                  %                          84.19
+    SM Active Cycles                                                                 cycle                   1,708,096.22
+    Compute (SM) [%]                                                                     %                           9.23
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.09
+    Achieved Active Warps Per SM                                                      warp                          35.09
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.1%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:52, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.23
+    SM Frequency                                                             cycle/nsecond                           2.21
+    Elapsed Cycles                                                                   cycle                      1,772,348
+    Memory [%]                                                                           %                          84.26
+    DRAM Throughput                                                                      %                          81.92
+    Duration                                                                       usecond                         798.21
+    L1/TEX Cache Throughput                                                              %                          42.20
+    L2 Cache Throughput                                                                  %                          84.26
+    SM Active Cycles                                                                 cycle                   1,705,450.02
+    Compute (SM) [%]                                                                     %                           9.29
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.10
+    Achieved Active Warps Per SM                                                      warp                          35.09
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.1%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:52, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.06
+    SM Frequency                                                             cycle/nsecond                           2.18
+    Elapsed Cycles                                                                   cycle                      1,760,523
+    Memory [%]                                                                           %                          84.63
+    DRAM Throughput                                                                      %                          82.44
+    Duration                                                                       usecond                         806.53
+    L1/TEX Cache Throughput                                                              %                          42.47
+    L2 Cache Throughput                                                                  %                          84.63
+    SM Active Cycles                                                                 cycle                   1,708,124.71
+    Compute (SM) [%]                                                                     %                           9.35
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.77
+    Achieved Active Warps Per SM                                                      warp                          34.93
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.8%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:52, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.21
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,765,081
+    Memory [%]                                                                           %                          84.70
+    DRAM Throughput                                                                      %                          82.69
+    Duration                                                                       usecond                            792
+    L1/TEX Cache Throughput                                                              %                          42.21
+    L2 Cache Throughput                                                                  %                          84.70
+    SM Active Cycles                                                                 cycle                   1,714,362.12
+    Compute (SM) [%]                                                                     %                           9.30
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.92
+    Achieved Active Warps Per SM                                                      warp                          35.00
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.9%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:52, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.12
+    SM Frequency                                                             cycle/nsecond                           2.21
+    Elapsed Cycles                                                                   cycle                      1,771,305
+    Memory [%]                                                                           %                          84.49
+    DRAM Throughput                                                                      %                          82.42
+    Duration                                                                       usecond                         801.98
+    L1/TEX Cache Throughput                                                              %                          42.09
+    L2 Cache Throughput                                                                  %                          84.49
+    SM Active Cycles                                                                 cycle                   1,712,171.16
+    Compute (SM) [%]                                                                     %                           9.27
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.82
+    Achieved Active Warps Per SM                                                      warp                          34.95
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.8%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:52, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.30
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,782,550
+    Memory [%]                                                                           %                          83.71
+    DRAM Throughput                                                                      %                          81.52
+    Duration                                                                       usecond                         796.70
+    L1/TEX Cache Throughput                                                              %                          41.95
+    L2 Cache Throughput                                                                  %                          83.71
+    SM Active Cycles                                                                 cycle                   1,680,424.63
+    Compute (SM) [%]                                                                     %                           9.24
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.19
+    Achieved Active Warps Per SM                                                      warp                          35.13
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.2%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:52, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.23
+    SM Frequency                                                             cycle/nsecond                           2.21
+    Elapsed Cycles                                                                   cycle                      1,766,189
+    Memory [%]                                                                           %                          84.44
+    DRAM Throughput                                                                      %                          82.22
+    Duration                                                                       usecond                         794.82
+    L1/TEX Cache Throughput                                                              %                          42.35
+    L2 Cache Throughput                                                                  %                          84.44
+    SM Active Cycles                                                                 cycle                   1,697,133.87
+    Compute (SM) [%]                                                                     %                           9.32
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.90
+    Achieved Active Warps Per SM                                                      warp                          34.99
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.9%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:52, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.19
+    SM Frequency                                                             cycle/nsecond                           2.20
+    Elapsed Cycles                                                                   cycle                      1,771,601
+    Memory [%]                                                                           %                          84.24
+    DRAM Throughput                                                                      %                          81.97
+    Duration                                                                       usecond                         800.58
+    L1/TEX Cache Throughput                                                              %                          42.21
+    L2 Cache Throughput                                                                  %                          84.24
+    SM Active Cycles                                                                 cycle                   1,715,335.30
+    Compute (SM) [%]                                                                     %                           9.30
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.48
+    Achieved Active Warps Per SM                                                      warp                          34.79
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.5%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:52, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.21
+    SM Frequency                                                             cycle/nsecond                           2.22
+    Elapsed Cycles                                                                   cycle                      1,773,845
+    Memory [%]                                                                           %                          84.26
+    DRAM Throughput                                                                      %                          82.21
+    Duration                                                                       usecond                         796.77
+    L1/TEX Cache Throughput                                                              %                          42.09
+    L2 Cache Throughput                                                                  %                          84.26
+    SM Active Cycles                                                                 cycle                   1,714,437.01
+    Compute (SM) [%]                                                                     %                           9.27
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.98
+    Achieved Active Warps Per SM                                                      warp                          35.03
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:52, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.27
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,780,810
+    Memory [%]                                                                           %                          83.98
+    DRAM Throughput                                                                      %                          81.89
+    Duration                                                                       usecond                         795.36
+    L1/TEX Cache Throughput                                                              %                          41.91
+    L2 Cache Throughput                                                                  %                          83.98
+    SM Active Cycles                                                                 cycle                   1,699,327.82
+    Compute (SM) [%]                                                                     %                           9.23
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.06
+    Achieved Active Warps Per SM                                                      warp                          35.07
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.1%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:52, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.17
+    SM Frequency                                                             cycle/nsecond                           2.21
+    Elapsed Cycles                                                                   cycle                      1,765,811
+    Memory [%]                                                                           %                          84.68
+    DRAM Throughput                                                                      %                          82.59
+    Duration                                                                       usecond                         796.83
+    L1/TEX Cache Throughput                                                              %                          42.32
+    L2 Cache Throughput                                                                  %                          84.68
+    SM Active Cycles                                                                 cycle                   1,711,670.90
+    Compute (SM) [%]                                                                     %                           9.32
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.96
+    Achieved Active Warps Per SM                                                      warp                          35.02
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:52, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.26
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,773,499
+    Memory [%]                                                                           %                          84.25
+    DRAM Throughput                                                                      %                          82.22
+    Duration                                                                       usecond                         793.09
+    L1/TEX Cache Throughput                                                              %                          42.07
+    L2 Cache Throughput                                                                  %                          84.25
+    SM Active Cycles                                                                 cycle                   1,685,840.59
+    Compute (SM) [%]                                                                     %                           9.27
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.06
+    Achieved Active Warps Per SM                                                      warp                          35.07
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.1%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:52, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.36
+    SM Frequency                                                             cycle/nsecond                           2.26
+    Elapsed Cycles                                                                   cycle                      1,792,890
+    Memory [%]                                                                           %                          83.44
+    DRAM Throughput                                                                      %                          81.41
+    Duration                                                                       usecond                         793.34
+    L1/TEX Cache Throughput                                                              %                          41.58
+    L2 Cache Throughput                                                                  %                          83.44
+    SM Active Cycles                                                                 cycle                   1,701,241.52
+    Compute (SM) [%]                                                                     %                           9.16
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.01
+    Achieved Active Warps Per SM                                                      warp                          35.05
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:52, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.25
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,780,267
+    Memory [%]                                                                           %                          84.05
+    DRAM Throughput                                                                      %                          81.96
+    Duration                                                                       usecond                         796.26
+    L1/TEX Cache Throughput                                                              %                          41.87
+    L2 Cache Throughput                                                                  %                          84.05
+    SM Active Cycles                                                                 cycle                   1,720,124.72
+    Compute (SM) [%]                                                                     %                           9.22
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          72.80
+    Achieved Active Warps Per SM                                                      warp                          34.95
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (72.8%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:52, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.22
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,779,850
+    Memory [%]                                                                           %                          84.00
+    DRAM Throughput                                                                      %                          81.98
+    Duration                                                                       usecond                         798.18
+    L1/TEX Cache Throughput                                                              %                          41.87
+    L2 Cache Throughput                                                                  %                          84.00
+    SM Active Cycles                                                                 cycle                   1,698,645.58
+    Compute (SM) [%]                                                                     %                           9.22
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.24
+    Achieved Active Warps Per SM                                                      warp                          35.16
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.2%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:52, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.25
+    SM Frequency                                                             cycle/nsecond                           2.24
+    Elapsed Cycles                                                                   cycle                      1,781,004
+    Memory [%]                                                                           %                          84.03
+    DRAM Throughput                                                                      %                          81.93
+    Duration                                                                       usecond                         796.26
+    L1/TEX Cache Throughput                                                              %                          41.85
+    L2 Cache Throughput                                                                  %                          84.03
+    SM Active Cycles                                                                 cycle                   1,699,534.09
+    Compute (SM) [%]                                                                     %                           9.22
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.13
+    Achieved Active Warps Per SM                                                      warp                          35.10
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.1%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:52, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.19
+    SM Frequency                                                             cycle/nsecond                           2.22
+    Elapsed Cycles                                                                   cycle                      1,760,056
+    Memory [%]                                                                           %                          84.93
+    DRAM Throughput                                                                      %                          82.91
+    Duration                                                                       usecond                         791.55
+    L1/TEX Cache Throughput                                                              %                          42.34
+    L2 Cache Throughput                                                                  %                          84.93
+    SM Active Cycles                                                                 cycle                   1,692,582.16
+    Compute (SM) [%]                                                                     %                           9.32
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.04
+    Achieved Active Warps Per SM                                                      warp                          35.06
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:52, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.22
+    SM Frequency                                                             cycle/nsecond                           2.21
+    Elapsed Cycles                                                                   cycle                      1,759,875
+    Memory [%]                                                                           %                          84.76
+    DRAM Throughput                                                                      %                          82.60
+    Duration                                                                       usecond                         792.54
+    L1/TEX Cache Throughput                                                              %                          42.47
+    L2 Cache Throughput                                                                  %                          84.76
+    SM Active Cycles                                                                 cycle                   1,690,070.65
+    Compute (SM) [%]                                                                     %                           9.35
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.26
+    Achieved Active Warps Per SM                                                      warp                          35.16
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.3%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:52, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.32
+    SM Frequency                                                             cycle/nsecond                           2.23
+    Elapsed Cycles                                                                   cycle                      1,784,241
+    Memory [%]                                                                           %                          83.62
+    DRAM Throughput                                                                      %                          81.39
+    Duration                                                                       usecond                         795.84
+    L1/TEX Cache Throughput                                                              %                          41.93
+    L2 Cache Throughput                                                                  %                          83.62
+    SM Active Cycles                                                                 cycle                   1,682,712.12
+    Compute (SM) [%]                                                                     %                           9.23
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.13
+    Achieved Active Warps Per SM                                                      warp                          35.10
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.1%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:52, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.17
+    SM Frequency                                                             cycle/nsecond                           2.22
+    Elapsed Cycles                                                                   cycle                      1,763,701
+    Memory [%]                                                                           %                          84.78
+    DRAM Throughput                                                                      %                          82.78
+    Duration                                                                       usecond                         795.14
+    L1/TEX Cache Throughput                                                              %                          42.27
+    L2 Cache Throughput                                                                  %                          84.78
+    SM Active Cycles                                                                 cycle                   1,693,308.48
+    Compute (SM) [%]                                                                     %                           9.31
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.04
+    Achieved Active Warps Per SM                                                      warp                          35.06
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:53, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.20
+    SM Frequency                                                             cycle/nsecond                           2.22
+    Elapsed Cycles                                                                   cycle                      1,769,207
+    Memory [%]                                                                           %                          84.49
+    DRAM Throughput                                                                      %                          82.51
+    Duration                                                                       usecond                         795.30
+    L1/TEX Cache Throughput                                                              %                          42.11
+    L2 Cache Throughput                                                                  %                          84.49
+    SM Active Cycles                                                                 cycle                   1,704,195.24
+    Compute (SM) [%]                                                                     %                           9.28
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.03
+    Achieved Active Warps Per SM                                                      warp                          35.06
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.0%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.                                                                                         
+
+  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 10:50:53, Context 1, Stream 7
+    Section: GPU Speed Of Light Throughput
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    DRAM Frequency                                                           cycle/nsecond                          10.27
+    SM Frequency                                                             cycle/nsecond                           2.22
+    Elapsed Cycles                                                                   cycle                      1,773,060
+    Memory [%]                                                                           %                          84.15
+    DRAM Throughput                                                                      %                          81.88
+    Duration                                                                       usecond                         795.39
+    L1/TEX Cache Throughput                                                              %                          42.17
+    L2 Cache Throughput                                                                  %                          84.15
+    SM Active Cycles                                                                 cycle                   1,699,194.80
+    Compute (SM) [%]                                                                     %                           9.29
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To   
+          further improve performance, work will likely need to be shifted from the most utilized to another unit.      
+          Start by analyzing L2 in the Memory Workload Analysis section.                                                
+
+    Section: Launch Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Size                                                                                                        256
+    Function Cache Configuration                                                                  cudaFuncCachePreferNone
+    Grid Size                                                                                                         768
+    Registers Per Thread                                                   register/thread                             32
+    Shared Memory Configuration Size                                                 Kbyte                          16.38
+    Driver Shared Memory Per Block                                             Kbyte/block                           1.02
+    Dynamic Shared Memory Per Block                                             byte/block                              0
+    Static Shared Memory Per Block                                              byte/block                             64
+    Threads                                                                         thread                        196,608
+    Waves Per SM                                                                                                        1
+    ---------------------------------------------------------------------- --------------- ------------------------------
+
+    Section: Occupancy
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Block Limit SM                                                                   block                             24
+    Block Limit Registers                                                            block                              8
+    Block Limit Shared Mem                                                           block                             14
+    Block Limit Warps                                                                block                              6
+    Theoretical Active Warps per SM                                                   warp                             48
+    Theoretical Occupancy                                                                %                            100
+    Achieved Occupancy                                                                   %                          73.24
+    Achieved Active Warps Per SM                                                      warp                          35.15
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   This kernel's theoretical occupancy is not impacted by any block limit. The difference between calculated     
+          theoretical (100.0%) and measured achieved occupancy (73.2%) can be the result of warp scheduling overheads   
+          or workload imbalances during the kernel execution. Load imbalances can occur between warps within a block    
+          as well as across blocks of the same kernel. See the CUDA Best Practices Guide                                
+          (https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#occupancy) for more details on           
+          optimizing occupancy.
+```
diff --git a/03_nf4_dequant/ikko/readme.md b/03_nf4_dequant/ikko/readme.md
new file mode 100644
index 0000000..6c4677d
--- /dev/null
+++ b/03_nf4_dequant/ikko/readme.md
@@ -0,0 +1,914 @@
+## 测试
+在 `~/Learning-CUDA` 目录下执行：
+
+```bash
+# 默认 bf16
+make nf4
+
+# 显式指定 dtype
+make nf4 DTYPE=fp16
+make nf4 DTYPE=bf16
+
+# 简写（等价于上面的 DTYPE）
+make nf4 fp16
+make nf4 bf16
+
+# 指定 GPU
+make nf4 CUDA_DEVICE=7
+make nf4 fp16 CUDA_DEVICE=7
+
+# 查看用法
+make help
+```
+
+说明：
+- `make nf4 fp16` 中的 `fp16` 是 Make 目标简写，Makefile 已兼容并自动映射为 `DTYPE=fp16`。
+- 运行阶段会传参给 `mainla`，输出路径随 dtype 自动切换到 `data/output_fp16.bin` 或默认 bf16 输出文件。
+## NF4反量化
+权重 = NF4表值 × (一级scale × 二级scale + offset)，注意 offset 加在 absmax 上，而不是加在最终权重上
+absmax = code2[ absmax_q[block] ] × absmax2[group] + offset
+w =
+  NF4_TABLE[q]
+× ( code2[ absmax_q[block] ]
+    × absmax2[group]
+    + offset )
+关联关系：
+Group (16384 weights)
+ └── 256 Blocks
+      └── 64 weights
+           └── 2 per byte
+## bitsandbytes参考实现
+```shell
+source /data/shared/miniconda3/etc/profile.d/conda.sh && conda activate cuda && python - <<'PY'
+import torch
+import bitsandbytes as bnb
+
+rows, cols, blocksize = 16384, 16384, 64
+
+# Generate data
+x = torch.randn(rows, cols, device='cuda', dtype=torch.bfloat16)
+packed, qstate = bnb.functional.quantize_4bit(
+    x,
+    blocksize=blocksize,
+    quant_type='nf4',
+    compress_statistics=True
+)
+
+# Warmup
+for _ in range(5):
+    y = bnb.functional.dequantize_4bit(packed, qstate, quant_type='nf4', blocksize=blocksize)
+
+torch.cuda.synchronize()
+
+# Timing
+iters = 100
+start = torch.cuda.Event(enable_timing=True)
+stop = torch.cuda.Event(enable_timing=True)
+start.record()
+for _ in range(iters):
+    y = bnb.functional.dequantize_4bit(packed, qstate, quant_type='nf4', blocksize=blocksize)
+stop.record()
+stop.synchronize()
+
+ms = start.elapsed_time(stop) / iters
+
+# Approx bandwidth
+num_elements = rows * cols
+size_packed = num_elements // 2
+num_blocks = (num_elements + blocksize - 1) // blocksize
+num_groups = (num_blocks + 255) // 256
+size_absmax_q = num_blocks
+size_absmax2 = num_groups * 2
+size_code2 = 256 * 2
+size_out = num_elements * 2
+
+total_bytes = size_packed + size_absmax_q + size_absmax2 + size_code2 + size_out
+bandwidth = total_bytes / (ms / 1000.0) / 1e9
+
+print(f"bitsandbytes dequantize_4bit: {ms:.6f} ms")
+print(f"Approx bandwidth: {bandwidth:.2f} GB/s")
+PY
+```
+bitsandbytes dequantize_4bit: 1.241162 ms
+Approx bandwidth: 544.10 GB/s
+## 构造标准测试集
+
+
+
+
+
+## 编写main.cu
+### 反量化查表
+来自github bitsandbytes/csrc/kernels.cu
+```cpp
+__device__ static float fp4_dequantization_lut[8] = {
+    0.0f,            // 0b000
+    0.005208333333f, // 0b001
+    0.66666667f,     // 0b010
+    1.0f,            // 0b011
+    0.33333333f,     // 0b100
+    0.5f,            // 0b101
+    0.16666667f,     // 0b110
+    0.25f            // 0b111
+};
+
+__device__ static float nf4_dequantization_lut[16] = {
+    -1.0f,                 // 0b0000
+    -0.6961928009986877f,  // 0b0001
+    -0.5250730514526367f,  // 0b0010
+    -0.39491748809814453f, // 0b0011
+    -0.28444138169288635f, // 0b0100
+    -0.18477343022823334f, // 0b0101
+    -0.09105003625154495f, // 0b0110
+    0.0f,                  // 0b0111
+    0.07958029955625534f,  // 0b1000
+    0.16093020141124725f,  // 0b1001
+    0.24611230194568634f,  // 0b1010
+    0.33791524171829224f,  // 0b1011
+    0.44070982933044434f,  // 0b1100
+    0.5626170039176941f,   // 0b1101
+    0.7229568362236023f,   // 0b1110
+    1.0f                   // 0b1111
+};
+```
+bitsandbytes 使用 `__device__`，考虑是否改用 `__constant__`
+| 维度      | `__device__` static | `__constant__`      |
+| ------- | ----------------- | ----------------- |
+| 存储位置    | Global Memory     | Constant Memory   |
+| 缓存      | L2 / L1           | 专用 Constant Cache |
+| Warp 广播 | 无              |  有（warp 内线程读同一地址时） |
+| 访问延迟    | 高                 | 低（读同一地址时；地址不同会被串行化） |
+| 适合      | 普通全局数据            | 查表 / 常量           |
+先使用 `__constant__`。
+### host逻辑
+1. 输入解析，读取二进制文件
+2. 内存规划
+3. 数据加载，分配显存
+4. 启动 kernel
+5. 记录性能，写入数据
+### device逻辑
+1. 全局一维线程索引
+2. 读取这 1 个字节，并解包成两个 4-bit 索引
+3. 计算当前字节属于哪一个量化 Block 和 Group
+4. 暴力从全局内存 (Global Memory) 读取双重量化参数
+5. 结合 NF4 查表，计算真实的浮点权重
+6. 最朴素的分别写回 (没有使用 Union 向量化合并写入)
+
+## naive kernel 问题解决
+初版误差（MAE）远超阈值，原因如下：
+### 解码索引覆盖不足
+最初 grid 维度按 num_groups 计算，实际应按 packed byte 数计算，导致大部分元素没写。
+```
+Kernel Time: 0.121476 ms
+Effective Bandwidth (approx): 347.457 GB/s
+Output written to nf4/data/output.bin
+```
+
+### gendata，offset 处理错误
+offset 并非恒为 0，而且应加在 absmax 上，不应加在最终权重上。
+```
+Kernel Time: 0.141412 ms
+Effective Bandwidth (approx): 298.473 GB/s
+Output written to nf4/data/output.bin
+MAE (Mean Absolute Error): 0.000024
+Max Error:                 0.031250
+------------------------------
+✅ PASS: MAE (0.000024) is within threshold (0.01)
+```
+### 为了可拓展性，使用 Grid-Stride Loops
+```cpp
+int blockSize = 256;
+// 我们根据硬件 SM 数量来决定 grid，或者简单给一个足够大的数
+int numSMs;
+cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0);
+int gridSize = 32 * numSMs; // 保证每个 SM 都有活干
+```
+修改代码
+```cpp
+int sm_count = 0;
+    CHECK_CUDA(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, 0));
+    int grid_x = sm_count * 4;
+    int64_t max_grid = (total_bytes + blockDim.x - 1) / blockDim.x;
+    if (grid_x > max_grid) {
+        grid_x = static_cast<int>(max_grid);
+    }
+    if (grid_x < 1) {
+        grid_x = 1;
+    }
+    dim3 gridDim(grid_x);
+```
+此时性能
+```
+Kernel Time: 0.121476 ms
+Effective Bandwidth (approx): 347.457 GB/s
+```
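+换成通用 API 计算：每个 SM 的最大活跃 block 数改由 occupancy API（应为 `cudaOccupancyMaxActiveBlocksPerMultiprocessor`，待确认）查询，grid_x = SM 数 × 该值（见下方输出：108 × 8 = 864）；注意下面的代码片段与上一版完全相同，仍是 `sm_count * 4` 的写法，尚未同步更新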
+```cpp
+int sm_count = 0;
+    CHECK_CUDA(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, 0));
+    int grid_x = sm_count * 4;
+    int64_t max_grid = (total_bytes + blockDim.x - 1) / blockDim.x;
+    if (grid_x > max_grid) {
+        grid_x = static_cast<int>(max_grid);
+    }
+    if (grid_x < 1) {
+        grid_x = 1;
+    }
+    dim3 gridDim(grid_x);
+```
+此时性能
+```
+SM count: 108, max active blocks/SM: 8, grid_x: 864
+Kernel Time: 0.124518 ms
+Effective Bandwidth (approx): 338.969 GB/s
+Output written to nf4/data/output.bin
+```
+## 优化大纲
+1. 向量化访存优化
+```cpp
+output[tid * 2]     = __float2bfloat16(w1_fp32);
+    output[tid * 2 + 1] = __float2bfloat16(w2_fp32);
+```
+这里发起了两次独立的 16-bit 写入；而 GPU 的一条全局存储指令可以一次写 32-bit、64-bit 甚至 128-bit 的数据，因此应把相邻的输出合并成一次写回。
+
+2. 冗余计算与重复访存
+blocksize = 64 时，每个线程处理 1 个 byte（2 个权重），32 个线程（正好是一个 Warp）处理的 64 个权重其实都属于同一个量化 block，因此同一个 absmax_q/code2/absmax2 被这 32 个线程重复读取、重复计算。
+3. 整数除法（位运算优化）
+4. code2 码表的读取延迟
+## 实际优化
+### 向量化访存
+向量化加载 packed_weights
+```cpp
+uint8_t packed = packed_weights[byte_idx];
+```
+单字节 load，效率差。GPU 更适合 4B / 8B 对齐访问。
+需注意，主机端对 packed buffer 做 4 字节对齐补零，避免最后一个 uint32 读取越界。
+```cpp
+for (int b = 0; b < 4; ++b) {
+            int64_t byte_idx = word_idx * 4 + b;
+            if (byte_idx >= total_bytes) {
+                continue;
+            }
+
+            uint8_t packed = static_cast<uint8_t>((packed_word >> (8 * b)) & 0xFF);
+            uint8_t idx1 = (packed >> 4) & 0x0F; // 高 4 位对应 output[byte_idx * 2]
+            uint8_t idx2 = packed & 0x0F;        // 低 4 位对应 output[byte_idx * 2 + 1]
+
+            int bytes_per_block = blocksize / 2;
+            int block_id = static_cast<int>(byte_idx / bytes_per_block);
+            int group_id = block_id / group_size;
+
+            float a2 = __half2float(absmax2[group_id]);
+            uint8_t qa = absmax_q[block_id];
+            float c2 = __half2float(code2[qa]);
+            float real_absmax = c2 * a2 + offset;
+
+            float w1_fp32 = NF4_LUT[idx1] * real_absmax;
+            float w2_fp32 = NF4_LUT[idx2] * real_absmax;
+
+            int64_t out_idx = byte_idx * 2;
+            if (out_idx < num_elements) {
+                output[out_idx] = __float2bfloat16(w1_fp32);
+            }
+            if (out_idx + 1 < num_elements) {
+                output[out_idx + 1] = __float2bfloat16(w2_fp32);
+            }
+        }
+// Kernel Time: 0.172722 ms
+// Effective Bandwidth (approx): 244.368 GB/s
+```
+#### 读取方式改为 16B 向量化
+
+- kernel 参数从 `const uint32_t*` 改为 `const uint4*`
+- 每线程处理 1 个 16B word（32 个权重），total_words = ceil(total_bytes / 16)
+- 读取 16B 后用 `#pragma unroll` 在寄存器里拆 16 个 byte，再拆成 32 个 4-bit 索引
+
+**参数只读一次**
+- 对应这 32 个权重只计算一次 block_id/group_id
+- 只读一次 absmax2/absmax_q/code2/offset，得到 real_absmax，后面 32 个权重复用
+
+**写回向量化**
+- 每个 byte 生成 2 个 bf16，打包成 1 个 uint32
+- 16 个 uint32 组成 4 个 float4，每个 float4 一次写回 8 个 bf16
+- 尾部不足 32 权重时走标量写回，避免越界
+
+**主机侧对齐与网格调整**
+- packed buffer padding 改为 16B 对齐
+- cudaMalloc/cudaMemcpy 使用对齐后的大小
+- grid 上限按 total_words 而不是 total_bytes 计算
+#### 此外，精简写回：跳过 float4 强转
+```cpp
+out_f4[0] = *reinterpret_cast<float4*>(&v0);
+```
+既然已经拼接好了 uint4，直接用 uint4 类型的指针写回即可。GPU 并不在乎你存入的是 float 还是 uint，只要它是 128 位的。当告诉 GPU 要执行一个存储（Store）操作时，硬件只需要知道两件事：
+起始地址（Starting Address）： 数据要写到哪？
+位宽（Bit Width）： 这一趟搬多大的数据（32位、64位还是 128位）？
+当定义一个 uint4 变量并写回时，CUDA 编译器（NVCC）会生成一条类似 STG.E.128 (Store Global 128-bit) 的汇编指令。这条指令不管这 128 位里装的是 8 个 bfloat16、4 个 float，还是 16 个 char。它只负责把寄存器里的 128 个比特流，原封不动地拍到对应的显存地址上。
+### 在 naive 版本上实现合并写回（无向量化访存）
+```cpp
+if (out_idx + 1 < num_elements) {
+            Bf16Bits lo; lo.bf = __float2bfloat16(w1_fp32);
+            Bf16Bits hi; hi.bf = __float2bfloat16(w2_fp32);
+            out_u32[out_idx / 2] = static_cast<uint32_t>(lo.u) | (static_cast<uint32_t>(hi.u) << 16);
+        } else if (out_idx < num_elements) {
+            output[out_idx] = __float2bfloat16(w1_fp32);
+        }
+Kernel Time: 0.102705 ms
+Effective Bandwidth (approx): 410.962 GB/s
+```
+## 查表从 constant 改为 shared memory：constant memory 依赖广播，warp 内索引不同时访问会被串行化
+```cpp
+__shared__ float s_LUT[16];
+    if (threadIdx.x < 16) {
+        s_LUT[threadIdx.x] = NF4_LUT[threadIdx.x];
+    }
+    __syncthreads();
+```
+即把 NF4 表搬到 SM（Streaming Multiprocessor，流式多处理器）片上的 Shared Memory（共享内存）中：容量很小，但访问速度极快。
+### 更改Shape: 16384x16384, Blocksize: 64
+SM count: 108, max active blocks/SM: 8, grid_x: 864
+Kernel Time: 0.968552 ms
+Effective Bandwidth (approx): 697.243 GB/s
+------------------------------
+MAE (Mean Absolute Error): 0.000017
+Max Error:                 0.031250
+------------------------------
+✅ PASS: MAE (0.000017) is within threshold (0.01)
+## 使用inline(编译器优化)
+```cpp
+__device__ __forceinline__ float nf4_lut_value(uint8_t idx) {
+    switch (idx & 0x0F) {
+        case 0x0: return -1.0f;
+        case 0x1: return -0.6961928009986877f;
+        case 0x2: return -0.5250730514526367f;
+        case 0x3: return -0.39491748809814453f;
+        case 0x4: return -0.28444138169288635f;
+        case 0x5: return -0.18477343022823334f;
+        case 0x6: return -0.09105003625154495f;
+        case 0x7: return 0.0f;
+        case 0x8: return 0.07958029955625534f;
+        case 0x9: return 0.16093020141124725f;
+        case 0xA: return 0.24611230194568634f;
+        case 0xB: return 0.33791524171829224f;
+        case 0xC: return 0.44070982933044434f;
+        case 0xD: return 0.5626170039176941f;
+        case 0xE: return 0.7229568362236023f;
+        default: return 1.0f;
+    }
+}
+float v1 = nf4_lut_value(p >> 4) * real_absmax;
+                float v2 = nf4_lut_value(p & 0x0F) * real_absmax;
+Kernel Time: 2.58223 ms
+Effective Bandwidth (approx): 261.524 GB/s
+```
+switch 版本耗时 2.58 ms，远慢于 shared memory 查表版本的约 0.97 ms，遂撤回，改为寄存器缓存表值
+
+
+## 输出适配fp16
+```cpp
+template <typename T>
+__device__ __forceinline__ T float_to_out(float v);
+
+template <>
+__device__ __forceinline__ __nv_bfloat16 float_to_out<__nv_bfloat16>(float v) {
+    return __float2bfloat16(v);
+}
+
+template <>
+__device__ __forceinline__ half float_to_out<half>(float v) {
+    return __float2half(v);
+}
+
+template <typename T>
+__device__ __forceinline__ uint32_t pack_pair_to_u32(float v1, float v2);
+
+template <>
+__device__ __forceinline__ uint32_t pack_pair_to_u32<__nv_bfloat16>(float v1, float v2) {
+    __nv_bfloat162 packed = __floats2bfloat162_rn(v1, v2);
+    return *reinterpret_cast<uint32_t*>(&packed);
+}
+
+template <>
+__device__ __forceinline__ uint32_t pack_pair_to_u32<half>(float v1, float v2) {
+    half2 packed = __floats2half2_rn(v1, v2);
+    return *reinterpret_cast<uint32_t*>(&packed);
+}
+uint32_t u32_val = pack_pair_to_u32<OutT>(v1, v2);
+```
+## 新增加速比，并修改makefile
+输出
+```cpp
+(cuda) ikko@dsw-607126-85f54bdf75-5lzlx:~/Learning-CUDA$ make nf4
+=== [NF4] Compiling nf4/mainla.cu ===
+TMPDIR=/home/ikko/Learning-CUDA/.tmp /usr/local/cuda/bin/nvcc -O3 -std=c++17 -arch=sm_80 nf4/mainla.cu -o nf4/mainla
+=== [NF4] Running nf4/mainla ===
+=== CUDA_VISIBLE_DEVICES=7 ===
+CUDA_VISIBLE_DEVICES=7 ./nf4/mainla
+SM count: 108, max active blocks/SM: 8, grid_x: 864
+Kernel Time: 0.968471 ms
+Effective Bandwidth (approx): 697.301 GB/s
+Speedup vs bitsandbytes: 1.28384x (ref 1.24336 ms)
+Bandwidth ratio vs bitsandbytes: 1.28383x (ref 543.14 GB/s)
+Output dtype: bf16
+Output written to nf4/data/output.bin
+=== [NF4] Verifying MAE ===
+CUDA_VISIBLE_DEVICES=7 python nf4/verify_mae.py
+=== Starting Verification ===
+Shape: 16384x16384, Blocksize: 64
+------------------------------
+MAE (Mean Absolute Error): 0.000017
+Max Error:                 0.031250
+------------------------------
+✅ PASS: MAE (0.000017) is within threshold (0.01)
+```
+## 按题目要求回退线程粒度
+
+题目要求的是 Packed Store：每个线程一次只处理两个 4-bit 索引，也就是读取 1 个 packed byte，算出 2 个 bf16 后，打包成 1 个 uint32_t，一次性写回全局内存。
+而我前一版做的是：kernel 输入改成 const uint4*，每个线程先读 1 个 uint4，也就是 16 个 byte
+16 个 byte 对应 32 个 4-bit 索引
+线程内部虽然也调用了 pack_pair_to_u32<OutT>(v1, v2)
+但那只是线程内部的中间步骤，最终是把 16 个 uint32 再组成 4 个 uint4 写回
+也就是说，之前那版本质上是：
+每线程处理 16 个 packed byte
+每线程解码 32 个 4-bit 索引
+每线程最终写 4 个 uint4
+这不等于题目要求的：
+每线程处理 1 个 packed byte，每线程解码 2 个 4-bit 索引，每线程最终写 1 个 uint32_t。所以这里重新改回严格按题目要求实现。
+```cpp
+template <typename OutT>
+__global__ void nf4_decode_kernel(
+    const uint8_t* __restrict__ packed_weights,
+    const uint8_t* __restrict__ absmax_q,
+    const half* __restrict__ absmax2,
+    const half* __restrict__ code2,
+    const float offset,
+    OutT* __restrict__ output,
+    int64_t num_elements,
+    int blocksize,
+    int group_size
+) {
+    int64_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t stride = gridDim.x * blockDim.x;
+    int64_t total_bytes = (num_elements + 1) / 2;
+    int64_t full_pair_bytes = num_elements / 2;
+
+    __shared__ float s_LUT[16];
+    if (threadIdx.x < 16) {
+        s_LUT[threadIdx.x] = NF4_LUT[threadIdx.x];
+    }
+    __syncthreads();
+    uint32_t* out_u32 = reinterpret_cast<uint32_t*>(output);
+
+    for (int64_t byte_idx = tid; byte_idx < full_pair_bytes; byte_idx += stride) {
+        uint8_t packed = packed_weights[byte_idx];
+        int block_id = static_cast<int>(byte_idx / (blocksize / 2));
+        int group_id = block_id / group_size;
+        float real_absmax = (__half2float(absmax2[group_id]) * __half2float(code2[absmax_q[block_id]])) + offset;
+        float v1 = s_LUT[packed >> 4] * real_absmax;
+        float v2 = s_LUT[packed & 0x0F] * real_absmax;
+        out_u32[byte_idx] = pack_pair_to_u32<OutT>(v1, v2);
+    }
+
+    if ((num_elements & 1) != 0) {
+        int64_t tail_byte = total_bytes - 1;
+        if (tid == 0) {
+            uint8_t packed = packed_weights[tail_byte];
+            int block_id = static_cast<int>(tail_byte / (blocksize / 2));
+            int group_id = block_id / group_size;
+            float real_absmax = (__half2float(absmax2[group_id]) * __half2float(code2[absmax_q[block_id]])) + offset;
+            output[num_elements - 1] = float_to_out<OutT>(s_LUT[packed >> 4] * real_absmax);
+        }
+    }
+}
+```
+
+同时 host 端的 grid 计算也要跟着改回按 total_bytes 来算，而不是按 total_words 算。因为现在一个线程对应一个 packed byte，不再是一个线程对应一个 uint4。
+
+这版的好处是题意完全对齐，线程粒度、处理粒度、写回粒度三者一致。缺点也很明显，性能会比之前那个“每线程吞 16 个 byte”的版本低一些。
+
+输出
+```cpp
+(cuda) ikko@dsw-607126-85f54bdf75-5lzlx:~/Learning-CUDA$ make nf4
+=== [NF4] Compiling nf4/mainla.cu ===
+TMPDIR=/home/ikko/Learning-CUDA/.tmp /usr/local/cuda/bin/nvcc -O3 -std=c++17 -arch=sm_80 nf4/mainla.cu -o nf4/mainla
+=== [NF4] Running nf4/mainla ===
+=== CUDA_VISIBLE_DEVICES=7 ===
+CUDA_VISIBLE_DEVICES=7 ./nf4/mainla
+SM count: 108, max active blocks/SM: 8, grid_x: 864
+Kernel Time: 1.95401 ms
+Effective Bandwidth (approx): 345.605 GB/s
+Speedup vs bitsandbytes: 0.636311x (ref 1.24336 ms)
+Bandwidth ratio vs bitsandbytes: 0.636309x (ref 543.14 GB/s)
+Output dtype: bf16
+Output written to nf4/data/output.bin
+=== [NF4] Verifying MAE ===
+CUDA_VISIBLE_DEVICES=7 python nf4/verify_mae.py
+=== Starting Verification ===
+Shape: 16384x16384, Blocksize: 64
+------------------------------
+MAE (Mean Absolute Error): 0.000017
+Max Error:                 0.031250
+------------------------------
+✅ PASS: MAE (0.000017) is within threshold (0.01)
+```
+
+
+## nsys分析
+```shell
+nsys profile --stats=true --force-overwrite=true -o nf4_profile ./nf4/main
+```
+输出
+```
+Collecting data...
+SM count: 108, max active blocks/SM: 8, grid_x: 864
+Kernel Time: 1.85433 ms
+Effective Bandwidth (approx): 364.183 GB/s
+Speedup vs bitsandbytes: 0.670517x (ref 1.24336 ms)
+Bandwidth ratio vs bitsandbytes: 0.670515x (ref 543.14 GB/s)
+Output dtype: bf16
+Output written to nf4/data/output_bf16.bin
+Generating '/tmp/nsys-report-aac2.qdstrm'
+[1/8] [========================100%] nsys_mainla_bf16.nsys-rep
+[2/8] [========================100%] nsys_mainla_bf16.sqlite
+[3/8] Executing 'nvtx_sum' stats report
+SKIPPED: /home/ikko/Learning-CUDA/nf4/nsys_mainla_bf16.sqlite does not contain NV Tools Extension (NVTX) data.
+[4/8] Executing 'osrt_sum' stats report
+
+ Time (%)  Total Time (ns)  Num Calls   Avg (ns)     Med (ns)    Min (ns)   Max (ns)   StdDev (ns)           Name         
+ --------  ---------------  ---------  -----------  -----------  ---------  ---------  -----------  ----------------------
+     58.9       1684692553         24   70195523.0   73322110.5     320037  381847608   79085855.6  poll                  
+     18.6        532343606       1622     328202.0      64362.5       1125   15994793     862809.4  ioctl                 
+     11.3        322412024         31   10400387.9       1856.0       1022  322343688   57894226.3  fclose                
+      8.1        232039676          1  232039676.0  232039676.0  232039676  232039676          0.0  writev                
+      2.0         57202734        112     510738.7       2468.0       1002   56702430    5357453.0  fopen                 
+      1.0         28063349         11    2551213.5       1931.0       1007   27199930    8178930.8  read                  
+      0.1          1904289         43      44285.8      10845.0       6238    1058775     159640.8  mmap64                
+      0.0           673049         10      67304.9      57184.0      21078     112731      29916.6  sem_timedwait         
+      0.0           623082        118       5280.4       4386.0       1603      16509       3208.3  open64                
+      0.0           306282          2     153141.0     153141.0     139640     166642      19093.3  pthread_create        
+      0.0           201836         16      12614.8       5748.5       1002      88787      20920.5  mmap                  
+      0.0            82243          1      82243.0      82243.0      82243      82243          0.0  pthread_cond_wait     
+      0.0            77095         11       7008.6       7257.0       4709       9864       1769.3  write                 
+      0.0            47414          8       5926.8       4498.0       2641      12043       3485.8  munmap                
+      0.0            32173          3      10724.3      12344.0       6113      13716       4052.0  putc                  
+      0.0            29046          1      29046.0      29046.0      29046      29046          0.0  fgets                 
+      0.0            24664          5       4932.8       4357.0       2220       7422       2186.0  open                  
+      0.0            20110          4       5027.5       4305.5       1200      10299       4042.1  fwrite                
+      0.0            13228          3       4409.3       3261.0       1894       8073       3245.6  pipe2                 
+      0.0            11141          2       5570.5       5570.5       5464       5677        150.6  socket                
+      0.0             9397          2       4698.5       4698.5       1691       7706       4253.2  pthread_cond_broadcast
+      0.0             7259          1       7259.0       7259.0       7259       7259          0.0  connect               
+      0.0             6258          5       1251.6       1244.0       1080       1351        109.6  fcntl                 
+      0.0             4160          1       4160.0       4160.0       4160       4160          0.0  fread                 
+      0.0             2407          1       2407.0       2407.0       2407       2407          0.0  bind                  
+
+[5/8] Executing 'cuda_api_sum' stats report
+
+ Time (%)  Total Time (ns)  Num Calls   Avg (ns)     Med (ns)    Min (ns)   Max (ns)   StdDev (ns)           Name         
+ --------  ---------------  ---------  -----------  -----------  ---------  ---------  -----------  ----------------------
+     51.2        277706536          5   55541307.2    1658159.0       5925  272606419  121345975.1  cudaMalloc            
+     34.5        187258524          1  187258524.0  187258524.0  187258524  187258524          0.0  cudaEventSynchronize  
+     12.5         67776654          5   13555330.8    1488128.0      63639   51690255   22091691.8  cudaMemcpy            
+      1.2          6327994          5    1265598.8    1131945.0      20003    2503567     933614.4  cudaFree              
+      0.6          3056503          1    3056503.0    3056503.0    3056503    3056503          0.0  cudaDeviceSynchronize 
+      0.1           578595        101       5728.7       4944.0       4228      36734       3821.5  cudaLaunchKernel      
+      0.0            22548          2      11274.0      11274.0       5889      16659       7615.5  cudaEventRecord       
+      0.0            17520          2       8760.0       8760.0        762      16758      11310.9  cudaEventDestroy      
+      0.0             6073          2       3036.5       3036.5        780       5293       3191.2  cudaEventCreate       
+      0.0             1196          1       1196.0       1196.0       1196       1196          0.0  cuModuleGetLoadingMode
+
+[6/8] Executing 'cuda_gpu_kern_sum' stats report
+
+ Time (%)  Total Time (ns)  Instances  Avg (ns)   Med (ns)  Min (ns)  Max (ns)  StdDev (ns)                                                  Name                                                
+ --------  ---------------  ---------  ---------  --------  --------  --------  -----------  ----------------------------------------------------------------------------------------------------
+    100.0        186128837        101  1842859.8  834756.0    812388   3455505    1258741.2  void nf4_decode_kernel<__nv_bfloat16>(const uint4 *, const unsigned char *, const __half *, const _…
+
+[7/8] Executing 'cuda_gpu_mem_time_sum' stats report
+
+ Time (%)  Total Time (ns)  Count   Avg (ns)    Med (ns)   Min (ns)  Max (ns)  StdDev (ns)           Operation          
+ --------  ---------------  -----  ----------  ----------  --------  --------  -----------  ----------------------------
+     78.2         51319384      1  51319384.0  51319384.0  51319384  51319384          0.0  [CUDA memcpy Device-to-Host]
+     21.8         14291557      4   3572889.3    194241.0      3008  13900067    6887079.2  [CUDA memcpy Host-to-Device]
+
+[8/8] Executing 'cuda_gpu_mem_size_sum' stats report
+
+ Total (MB)  Count  Avg (MB)  Med (MB)  Min (MB)  Max (MB)  StdDev (MB)           Operation          
+ ----------  -----  --------  --------  --------  --------  -----------  ----------------------------
+    536.871      1   536.871   536.871   536.871   536.871        0.000  [CUDA memcpy Device-to-Host]
+    138.445      4    34.611     2.114     0.001   134.218       66.433  [CUDA memcpy Host-to-Device]
+```
+
+## maca
+cd /data/Learning-CUDA && mxcc -O3 -std=c++17 nf4/mainla.maca -o nf4/mainla_maca
+```cpp
+cd /data/Learning-CUDA && mxcc -O3 -std=c++17 nf4/mainla.maca -o nf4/mainla_maca
+nf4/mainla.maca:158:19: error: use of undeclared identifier 'macaMalloc'; did you mean 'mcMalloc'?
+    RUNTIME_CHECK(macaMalloc(&d_packed, size_packed));
+                  ^~~~~~~~~~
+                  mcMalloc
+nf4/../tester/utils.h:29:28: note: expanded from macro 'RUNTIME_CHECK'
+    RUNTIME_ERR_TYPE err = call;                                               \
+                           ^
+/opt/maca/include/mcr/mc_runtime_api_template_wrapper.h:6:44: note: 'mcMalloc' declared here
+template <class T> static inline mcError_t mcMalloc(T **devPtr, size_t size)
+                                           ^
+nf4/mainla.maca:159:19: error: use of undeclared identifier 'macaMalloc'; did you mean 'mcMalloc'?
+    RUNTIME_CHECK(macaMalloc(&d_absmax_q, size_absmax_q));
+                  ^~~~~~~~~~
+                  mcMalloc
+nf4/../tester/utils.h:29:28: note: expanded from macro 'RUNTIME_CHECK'
+    RUNTIME_ERR_TYPE err = call;                                               \
+                           ^
+/opt/maca/include/mcr/mc_runtime_api_template_wrapper.h:6:44: note: 'mcMalloc' declared here
+template <class T> static inline mcError_t mcMalloc(T **devPtr, size_t size)
+                                           ^
+nf4/mainla.maca:160:19: error: use of undeclared identifier 'macaMalloc'; did you mean 'mcMalloc'?
+    RUNTIME_CHECK(macaMalloc(&d_absmax2, size_absmax2));
+                  ^~~~~~~~~~
+                  mcMalloc
+nf4/../tester/utils.h:29:28: note: expanded from macro 'RUNTIME_CHECK'
+    RUNTIME_ERR_TYPE err = call;                                               \
+                           ^
+/opt/maca/include/mcr/mc_runtime_api_template_wrapper.h:6:44: note: 'mcMalloc' declared here
+template <class T> static inline mcError_t mcMalloc(T **devPtr, size_t size)
+                                           ^
+nf4/mainla.maca:161:19: error: use of undeclared identifier 'macaMalloc'; did you mean 'mcMalloc'?
+    RUNTIME_CHECK(macaMalloc(&d_code2, size_code2));
+                  ^~~~~~~~~~
+                  mcMalloc
+nf4/../tester/utils.h:29:28: note: expanded from macro 'RUNTIME_CHECK'
+    RUNTIME_ERR_TYPE err = call;                                               \
+                           ^
+/opt/maca/include/mcr/mc_runtime_api_template_wrapper.h:6:44: note: 'mcMalloc' declared here
+template <class T> static inline mcError_t mcMalloc(T **devPtr, size_t size)
+                                           ^
+nf4/mainla.maca:162:19: error: use of undeclared identifier 'macaMalloc'; did you mean 'mcMalloc'?
+    RUNTIME_CHECK(macaMalloc(&d_output, static_cast<size_t>(num_elements) * sizeof(half)));
+                  ^~~~~~~~~~
+                  mcMalloc
+nf4/../tester/utils.h:29:28: note: expanded from macro 'RUNTIME_CHECK'
+    RUNTIME_ERR_TYPE err = call;                                               \
+                           ^
+/opt/maca/include/mcr/mc_runtime_api_template_wrapper.h:6:44: note: 'mcMalloc' declared here
+template <class T> static inline mcError_t mcMalloc(T **devPtr, size_t size)
+                                           ^
+nf4/mainla.maca:164:70: error: use of undeclared identifier 'macaMemcpyHostToDevice'
+    RUNTIME_CHECK(macaMemcpy(d_packed, h_packed.data(), size_packed, macaMemcpyHostToDevice));
+                                                                     ^
+nf4/mainla.maca:165:76: error: use of undeclared identifier 'macaMemcpyHostToDevice'
+    RUNTIME_CHECK(macaMemcpy(d_absmax_q, h_absmax_q.data(), size_absmax_q, macaMemcpyHostToDevice));
+                                                                           ^
+nf4/mainla.maca:166:73: error: use of undeclared identifier 'macaMemcpyHostToDevice'
+    RUNTIME_CHECK(macaMemcpy(d_absmax2, h_absmax2.data(), size_absmax2, macaMemcpyHostToDevice));
+                                                                        ^
+nf4/mainla.maca:167:67: error: use of undeclared identifier 'macaMemcpyHostToDevice'
+    RUNTIME_CHECK(macaMemcpy(d_code2, h_code2.data(), size_code2, macaMemcpyHostToDevice));
+                                                                  ^
+nf4/mainla.maca:185:19: error: use of undeclared identifier 'macaGetLastError'; did you mean 'mcGetLastError'?
+    RUNTIME_CHECK(macaGetLastError());
+                  ^~~~~~~~~~~~~~~~
+                  mcGetLastError
+nf4/../tester/utils.h:29:28: note: expanded from macro 'RUNTIME_CHECK'
+    RUNTIME_ERR_TYPE err = call;                                               \
+                           ^
+/opt/maca/include/mcr/mc_runtime_api.h:2311:11: note: 'mcGetLastError' declared here
+mcError_t mcGetLastError(void);
+          ^
+nf4/mainla.maca:186:19: error: use of undeclared identifier 'macaDeviceSynchronize'; did you mean 'mcDeviceSynchronize'?
+    RUNTIME_CHECK(macaDeviceSynchronize());
+                  ^~~~~~~~~~~~~~~~~~~~~
+                  mcDeviceSynchronize
+nf4/../tester/utils.h:29:28: note: expanded from macro 'RUNTIME_CHECK'
+    RUNTIME_ERR_TYPE err = call;                                               \
+                           ^
+/opt/maca/include/mcr/mc_runtime_api.h:570:11: note: 'mcDeviceSynchronize' declared here
+mcError_t mcDeviceSynchronize(void);
+          ^
+nf4/mainla.maca:194:19: error: use of undeclared identifier 'macaGetLastError'; did you mean 'mcGetLastError'?
+    RUNTIME_CHECK(macaGetLastError());
+                  ^~~~~~~~~~~~~~~~
+                  mcGetLastError
+nf4/../tester/utils.h:29:28: note: expanded from macro 'RUNTIME_CHECK'
+    RUNTIME_ERR_TYPE err = call;                                               \
+                           ^
+/opt/maca/include/mcr/mc_runtime_api.h:2311:11: note: 'mcGetLastError' declared here
+mcError_t mcGetLastError(void);
+          ^
+nf4/mainla.maca:195:19: error: use of undeclared identifier 'macaDeviceSynchronize'; did you mean 'mcDeviceSynchronize'?
+    RUNTIME_CHECK(macaDeviceSynchronize());
+                  ^~~~~~~~~~~~~~~~~~~~~
+                  mcDeviceSynchronize
+nf4/../tester/utils.h:29:28: note: expanded from macro 'RUNTIME_CHECK'
+    RUNTIME_ERR_TYPE err = call;                                               \
+                           ^
+/opt/maca/include/mcr/mc_runtime_api.h:570:11: note: 'mcDeviceSynchronize' declared here
+mcError_t mcDeviceSynchronize(void);
+          ^
+nf4/mainla.maca:201:107: error: use of undeclared identifier 'macaMemcpyDeviceToHost'
+    RUNTIME_CHECK(macaMemcpy(h_output.data(), d_output, static_cast<size_t>(num_elements) * sizeof(half), macaMemcpyDeviceToHost));
+                                                                                                          ^
+nf4/mainla.maca:217:19: error: use of undeclared identifier 'macaFree'; did you mean 'mcFree'?
+    RUNTIME_CHECK(macaFree(d_packed));
+                  ^~~~~~~~
+                  mcFree
+nf4/../tester/utils.h:29:28: note: expanded from macro 'RUNTIME_CHECK'
+    RUNTIME_ERR_TYPE err = call;                                               \
+                           ^
+/opt/maca/include/mcr/mc_runtime_api.h:2609:11: note: 'mcFree' declared here
+mcError_t mcFree(void *ptr);
+          ^
+nf4/mainla.maca:218:19: error: use of undeclared identifier 'macaFree'; did you mean 'mcFree'?
+    RUNTIME_CHECK(macaFree(d_absmax_q));
+                  ^~~~~~~~
+                  mcFree
+nf4/../tester/utils.h:29:28: note: expanded from macro 'RUNTIME_CHECK'
+    RUNTIME_ERR_TYPE err = call;                                               \
+                           ^
+/opt/maca/include/mcr/mc_runtime_api.h:2609:11: note: 'mcFree' declared here
+mcError_t mcFree(void *ptr);
+          ^
+nf4/mainla.maca:219:19: error: use of undeclared identifier 'macaFree'; did you mean 'mcFree'?
+    RUNTIME_CHECK(macaFree(d_absmax2));
+                  ^~~~~~~~
+                  mcFree
+nf4/../tester/utils.h:29:28: note: expanded from macro 'RUNTIME_CHECK'
+    RUNTIME_ERR_TYPE err = call;                                               \
+                           ^
+/opt/maca/include/mcr/mc_runtime_api.h:2609:11: note: 'mcFree' declared here
+mcError_t mcFree(void *ptr);
+          ^
+nf4/mainla.maca:220:19: error: use of undeclared identifier 'macaFree'; did you mean 'mcFree'?
+    RUNTIME_CHECK(macaFree(d_code2));
+                  ^~~~~~~~
+                  mcFree
+nf4/../tester/utils.h:29:28: note: expanded from macro 'RUNTIME_CHECK'
+    RUNTIME_ERR_TYPE err = call;                                               \
+                           ^
+/opt/maca/include/mcr/mc_runtime_api.h:2609:11: note: 'mcFree' declared here
+mcError_t mcFree(void *ptr);
+          ^
+nf4/mainla.maca:221:19: error: use of undeclared identifier 'macaFree'; did you mean 'mcFree'?
+    RUNTIME_CHECK(macaFree(d_output));
+                  ^~~~~~~~
+                  mcFree
+nf4/../tester/utils.h:29:28: note: expanded from macro 'RUNTIME_CHECK'
+    RUNTIME_ERR_TYPE err = call;                                               \
+                           ^
+/opt/maca/include/mcr/mc_runtime_api.h:2609:11: note: 'mcFree' declared here
+mcError_t mcFree(void *ptr);
+          ^
+19 errors generated when compiling for host.
+```
+原来沐曦（MetaX）运行时 API 的前缀是 `mc`（如 `mcMalloc`、`mcFree`、`mcGetLastError`、`mcDeviceSynchronize`），而不是 `maca`。
+
+
+## ncu
+
+4090上ncu的结果
+```shell
+sudo ncu \
+  --section SpeedOfLight \
+  --section MemoryWorkloadAnalysis \
+  --launch-skip 1 \
+  --launch-count 1 \
+  ./nf4/mainla bf16
+```
+拿到的关键信息大致是：
+	•	DRAM Throughput ≈ 83%
+	•	Memory ≈ 83%
+	•	Compute (SM) ≈ 38% ~ 42%
+	•	Memory Throughput ≈ 816 ~ 824 GB/s
+	•	L2 Hit Rate ≈ 79.84%
+说明：(1) 这个 kernel 已经明显是 memory-bound；
+(2) 不是算力没吃满，而是 DRAM 带宽已经被压得比较高；
+(3) 继续优化如果不动访存主路径，很难有大提升。
+
+```shell
+sudo ncu \
+  --section SchedulerStats \
+  --section WarpStateStats \
+  --launch-skip 1 \
+  --launch-count 1 \
+  ./nf4/mainla bf16
+==PROF== Connected to process 810279 (/home/xjy/Learning-CUDA/nf4/mainla)
+SM count: 128, max active blocks/SM: 6, grid_x: 768
+==PROF== Profiling "nf4_decode_kernel" - 0 (1/1): 0%....50%....100% - 8 passes
+Kernel Time: 2.01215 ms
+Effective Bandwidth (approx): 335.62 GB/s
+Speedup vs bitsandbytes: 0.617927x (ref 1.24336 ms)
+Bandwidth ratio vs bitsandbytes: 0.617925x (ref 543.14 GB/s)
+Output dtype: bf16
+Output written to nf4/data/output_bf16.bin
+==PROF== Disconnected from process 810279
+[810279] mainla@127.0.0.1
+  void nf4_decode_kernel<__nv_bfloat16>(const unsigned char *, const unsigned char *, const __half *, const __half *, float, T1 *, long, int, int), 2026-Mar-10 14:16:43, Context 1, Stream 7
+    Section: Scheduler Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    One or More Eligible                                                                 %                          42.23
+    Issued Warp Per Scheduler                                                                                        0.42
+    No Eligible                                                                          %                          57.77
+    Active Warps Per Scheduler                                                        warp                          12.08
+    Eligible Warps Per Scheduler                                                      warp                           1.11
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   Every scheduler is capable of issuing one instruction per cycle, but for this kernel each scheduler only      
+          issues an instruction every 2.4 cycles. This might leave hardware resources underutilized and may lead to     
+          less optimal performance. Out of the maximum of 12 warps per scheduler, this kernel allocates an average of   
+          12.08 active warps per scheduler, but only an average of 1.11 warps were eligible per cycle. Eligible warps   
+          are the subset of active warps that are ready to issue their next instruction. Every cycle with no eligible   
+          warp results in no instruction being issued and the issue slot remains unused. To increase the number of      
+          eligible warps, avoid possible load imbalances due to highly different execution durations per warp.          
+          Reducing stalls indicated on the Warp State Statistics and Source Counters sections can help, too.            
+
+    Section: Warp State Statistics
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    Warp Cycles Per Issued Instruction                                               cycle                          28.61
+    Warp Cycles Per Executed Instruction                                             cycle                          28.61
+    Avg. Active Threads Per Warp                                                                                    19.67
+    Avg. Not Predicated Off Threads Per Warp                                                                        18.16
+    ---------------------------------------------------------------------- --------------- ------------------------------
+    WRN   On average, each warp of this kernel spends 19.0 cycles being stalled waiting for a scoreboard dependency on  
+          a L1TEX (local, global, surface, texture, rtcore) operation. This represents about 66.5% of the total         
+          average of 28.6 cycles between issuing two instructions. To reduce the number of cycles waiting on L1TEX      
+          data accesses verify the memory access patterns are optimal for the target architecture, attempt to increase  
+          cache hit rates by increasing data locality or by changing the cache configuration, and consider moving       
+          frequently used data to registers and to shared memory.                                                       
+    ----- --------------------------------------------------------------------------------------------------------------
+    INF   Check the Source Counters section for the top stall locations in your source based on sampling data. The      
+          Kernel Profiling Guide (https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#sampling) provides   
+          more details on each stall reason.                                                                            
+    ----- --------------------------------------------------------------------------------------------------------------
+    WRN   Instructions are executed in warps, which are groups of 32 threads. Optimal instruction throughput is         
+          achieved if all 32 threads of a warp execute the same instruction. The chosen launch configuration, early     
+          thread completion, and divergent flow control can significantly lower the number of active threads in a warp  
+          per cycle. This kernel achieves an average of 19.7 threads being active per cycle. This is further reduced    
+          to 18.2 threads per warp due to predication. The compiler may use predication to avoid an actual branch.      
+          Instead, all instructions are scheduled, but a per-thread condition code or predicate controls which threads  
+          execute the instructions. Try to avoid different execution paths within a warp when possible. In addition,    
+          ensure your kernel makes use of Independent Thread Scheduling, which allows a warp to reconverge after a      
+          data-dependent conditional block by explicitly calling __syncwarp().
+```
+也就是说，问题不只是笼统的 “memory-bound”：大量 warp 在等待 L1TEX 路径上的 load 返回；调度器上虽然挂着很多 active warp，但每个周期真正可以发射的 warp 很少（平均约 1.11 个），所以 issue slot 经常空着。
+怀疑：(1) 不只是主数据流 packed_weights 在拖慢；
+(2) absmax_q / absmax2 / code2 这些访问也可能形成依赖链；
+(3) 另外，热路径里的整数计算（特别是除法）可能也在消耗性能。
+### blocksize,group_size
+通过 `printf("group_size: %d,block_size: %d",group_size,blocksize);` 打印这两个参数，
+得到 block_size = 64，group_size = 256。
+```cpp
+float real_absmax = 0.0f;
+        if (lane == 0) {
+            uint8_t qa = absmax_q[block_id];
+            real_absmax = (__half2float(absmax2[group_id]) * __half2float(code2[qa])) + offset;
+        }
+        real_absmax = __shfl_sync(warp_mask, real_absmax, 0);
+```
+改为由 lane 0 读取并计算 real_absmax、再用 `__shfl_sync` 广播给整个 warp 之后，耗时基本没有变化。
+可能的原因：code2 / absmax2 这类小数据本来就大概率命中 cache；而新加的 `if (lane == 0)` 分支和 `__shfl_sync` 本身也有额外开销。
+
+综上
+综合几轮 NCU，我目前对这个 kernel 的判断是：
+
+已经确认的事实
+	1.	它是明显的 memory-bound kernel；
+	2.	DRAM 带宽已经吃到比较高；
+	3.	warp 大量 stall 在 L1TEX scoreboard dependency 上；
+	4.	shared LUT 在我的测试里更快，应该保留；
+	5.	real_absmax 的 warp 广播不是主收益点；
+	6.	热路径里的运行时除法是一个更值得优先处理的问题。
+
+热路径的主要成本大概来自：
+主成本
+	•	packed_weights[byte_idx] 的 load
+	•	out_u32[byte_idx] 的 store
+次成本
+	•	absmax_q[block_id]
+	•	absmax2[group_id]
+	•	code2[qa]
+可能被低估的成本
+	•	byte_idx / (blocksize / 2)
+	•	block_id / group_size
+
+## 最终测试（4090）
+由于 A100 的显存被占用，最终在 4090 上测试。
+结果：rows=16384, cols=16384, blocksize=64
+```shell
+./nf4/mainla 
+SM count: 128, max active blocks/SM: 6, grid_x: 768
+Kernel Time: 0.790252 ms
+Effective Bandwidth (approx): 854.559 GB/s
+Speedup vs bitsandbytes: 1.57337x (ref 1.24336 ms)
+Bandwidth ratio vs bitsandbytes: 1.57337x (ref 543.14 GB/s)
+Output dtype: bf16
+Output written to nf4/data/output.bin
+```
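+rows=24576, cols=24576, blocksize=64（注意：输出里的参考时间仍是 1.24336 ms，与上面 16384×16384 的结果相同，看起来没有随矩阵规模更新，因此这里的 Speedup 0.70x 不具可比性，应以带宽比 1.58x 为准）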
+```shell
+./nf4/mainla 
+SM count: 128, max active blocks/SM: 6, grid_x: 768
+Kernel Time: 1.77564 ms
+Effective Bandwidth (approx): 855.727 GB/s
+Speedup vs bitsandbytes: 0.700233x (ref 1.24336 ms)
+Bandwidth ratio vs bitsandbytes: 1.57552x (ref 543.14 GB/s)
+Output dtype: bf16
+Output written to nf4/data/output.bin
+```
\ No newline at end of file
diff --git a/03_nf4_dequant/ikko/testbit.py b/03_nf4_dequant/ikko/testbit.py
new file mode 100644
index 0000000..eeb86bd
--- /dev/null
+++ b/03_nf4_dequant/ikko/testbit.py
@@ -0,0 +1,80 @@
+import torch
+import bitsandbytes.functional as F
+
+
+def main():
+    """Benchmark bitsandbytes NF4 dequantization and print the effective bandwidth.
+
+    Quantizes a random fp16 matrix to NF4 with double-quantized statistics,
+    times ``repeat`` back-to-back ``dequantize_4bit`` calls with CUDA events,
+    and reports total time, average time per call and an approximate
+    memory bandwidth derived from the tensors actually read and written.
+
+    Raises:
+        RuntimeError: if no CUDA device is available.
+    """
+    # Raise instead of assert so the check survives ``python -O``.
+    if not torch.cuda.is_available():
+        raise RuntimeError("CUDA is not available")
+
+    device = "cuda"
+
+    # Problem size.
+    rows = 16384
+    cols = 16384
+    blocksize = 64
+    repeat = 10
+
+    print("=== 4bit Dequant Bandwidth Test ===")
+
+    # Random fp16 weights to quantize.
+    x = torch.randn(rows, cols, device=device, dtype=torch.float16)
+
+    # quant_type is spelled out because the library default is "fp4",
+    # while this benchmark is meant to time the NF4 path.
+    q_weight, state = F.quantize_4bit(
+        x,
+        blocksize=blocksize,
+        compress_statistics=True,
+        quant_type="nf4",
+    )
+
+    # Warm-up so one-time initialization is not timed.
+    for _ in range(3):
+        F.dequantize_4bit(q_weight, state)
+    torch.cuda.synchronize()
+
+    # Time the dequantization with CUDA events.
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
+
+    start.record()
+    for _ in range(repeat):
+        y = F.dequantize_4bit(q_weight, state)
+    end.record()
+    torch.cuda.synchronize()
+
+    elapsed_ms = start.elapsed_time(end)
+
+    # Bandwidth estimate from real tensor sizes. q_weight holds packed
+    # bytes (two 4-bit values each), so its numel() is NOT the weight count.
+    # The small nested statistics of the double quantization are ignored.
+    bytes_read = (
+        q_weight.numel() * q_weight.element_size()
+        + state.absmax.numel() * state.absmax.element_size()
+    )
+    bytes_write = y.numel() * y.element_size()
+
+    total_bytes = (bytes_read + bytes_write) * repeat
+    seconds = elapsed_ms / 1000.0
+    gbps = total_bytes / seconds / 1e9
+
+    print(f"Matrix: {rows} x {cols}")
+    print(f"Repeat: {repeat}")
+    print(f"Time: {elapsed_ms:.3f} ms")
+    print(f"Time per call: {elapsed_ms / repeat:.3f} ms")
+    print(f"Bandwidth: {gbps:.2f} GB/s")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/03_nf4_dequant/ikko/verify_mae.py b/03_nf4_dequant/ikko/verify_mae.py
new file mode 100644
index 0000000..4d87ebd
--- /dev/null
+++ b/03_nf4_dequant/ikko/verify_mae.py
@@ -0,0 +1,135 @@
+import argparse
+import os
+import struct
+import sys
+
+import numpy as np
+import torch
+
+# Supported storage formats; both use 2 bytes per element.
+_TORCH_DTYPES = {"bf16": torch.bfloat16, "fp16": torch.float16}
+# Header of weight_data.bin: rows (8 bytes), cols (8 bytes), blocksize (4 bytes).
+_HEADER_FORMAT = "qqi"
+
+
+def _read_matrix(path, rows, cols, torch_dtype, label):
+    """Load a raw rows x cols matrix of 2-byte floats as a float32 tensor.
+
+    Prints an error and returns None when the file size is not rows * cols * 2.
+    """
+    with open(path, "rb") as f:
+        raw = f.read()
+
+    expected_size = rows * cols * 2
+    if len(raw) != expected_size:
+        print(f"Error: {label} size mismatch! Expected {expected_size}, got {len(raw)}")
+        return None
+
+    # numpy has no bfloat16, so read raw 16-bit words and let torch reinterpret
+    # them. frombuffer() yields a read-only view, hence the copy.
+    words = np.frombuffer(raw, dtype=np.int16).copy().reshape(rows, cols)
+    return torch.from_numpy(words).view(torch_dtype).float()
+
+
+def check_mae(output_dir="03_nf4_dequant/ikko/data",
+              cuda_output_file="03_nf4_dequant/ikko/data/output.bin",
+              dtype="bf16"):
+    """Compare the kernel output with the reference and report the MAE.
+
+    Args:
+        output_dir: directory holding weight_data.bin and ground_truth.bin.
+        cuda_output_file: kernel output; a few fallback locations are also tried.
+        dtype: "bf16" or "fp16", the element format of both binary files.
+            NOTE(review): assumes ground_truth.bin was written in the same
+            format as the kernel output -- confirm against gen_data.py.
+
+    Returns:
+        True if the MAE is below the threshold, False if it is not or if an
+        input file is missing or has an unexpected size.
+
+    Raises:
+        ValueError: if dtype is not "bf16" or "fp16".
+    """
+    torch_dtype = _TORCH_DTYPES.get(str(dtype).lower())
+    if torch_dtype is None:
+        raise ValueError("dtype must be 'bf16' or 'fp16'")
+
+    truth_file = os.path.join(output_dir, "ground_truth.bin")
+    input_file = os.path.join(output_dir, "weight_data.bin")
+
+    print("=== Starting Verification ===")
+
+    # 1. Read the matrix shape from the header of the input file.
+    header_size = struct.calcsize(_HEADER_FORMAT)
+    with open(input_file, "rb") as f:
+        header = f.read(header_size)
+    if len(header) != header_size:
+        print(f"Error: {input_file} is shorter than the {header_size}-byte header")
+        return False
+    rows, cols, blocksize = struct.unpack(_HEADER_FORMAT, header)
+
+    print(f"Shape: {rows}x{cols}, Blocksize: {blocksize}")
+
+    # 2. Load the reference result.
+    truth_tensor = _read_matrix(truth_file, rows, cols, torch_dtype, "Ground truth")
+    if truth_tensor is None:
+        return False
+
+    # 3. Locate and load the kernel output (first existing candidate wins).
+    candidates = [
+        cuda_output_file,
+        os.path.join(output_dir, cuda_output_file),
+        os.path.join(output_dir, "output.bin"),
+        "output.bin",
+    ]
+    unique_candidates = list(dict.fromkeys(candidates))
+    cuda_path = next((p for p in unique_candidates if os.path.exists(p)), None)
+    if cuda_path is None:
+        print(f"Error: CUDA output file not found. Tried: {unique_candidates}")
+        return False
+
+    cuda_tensor = _read_matrix(cuda_path, rows, cols, torch_dtype, "Output")
+    if cuda_tensor is None:
+        return False
+
+    # 4. Mean and maximum absolute error, computed in float32.
+    diff = torch.abs(truth_tensor - cuda_tensor)
+    mae = torch.mean(diff).item()
+    max_diff = torch.max(diff).item()
+
+    print("-" * 30)
+    print(f"MAE (Mean Absolute Error): {mae:.6f}")
+    print(f"Max Error:                 {max_diff:.6f}")
+    print("-" * 30)
+
+    # 5. Pass/fail decision.
+    threshold = 1e-2
+    passed = mae < threshold
+    if passed:
+        print(f"✅ PASS: MAE ({mae:.6f}) is within threshold ({threshold})")
+    else:
+        print(f"❌ FAIL: MAE ({mae:.6f}) exceeds threshold ({threshold})")
+    return passed
+
+
+def main(argv=None):
+    """Command-line entry point; returns the process exit status."""
+    parser = argparse.ArgumentParser(
+        description="Verify NF4 dequantization output against the reference."
+    )
+    parser.add_argument("cuda_output_file", nargs="?", default=None,
+                        help="kernel output file (default: check_mae's default)")
+    parser.add_argument("dtype", nargs="?", default="bf16",
+                        choices=sorted(_TORCH_DTYPES),
+                        help="element format of the binary files")
+    args = parser.parse_args(argv)
+
+    kwargs = {"dtype": args.dtype}
+    if args.cuda_output_file is not None:
+        kwargs["cuda_output_file"] = args.cuda_output_file
+    # Non-zero exit status lets callers such as make detect a failed check.
+    return 0 if check_mae(**kwargs) else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..9b99db5
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,128 @@
+# Build, run and verify the NF4 dequantization kernel for the supported platforms.
+CUDA_DEVICE ?= 0
+PYTHON ?= python
+DTYPE ?= bf16
+PLATFORM ?= nvidia
+
+# Usage:
+#   make nf4 PLATFORM=nvidia
+#   make nf4 PLATFORM=nvidia DTYPE=fp16
+#   make nf4 PLATFORM=metax
+#   make nf4 PLATFORM=moore
+#   make nf4 PLATFORM=metax bf16
+#   make nf4 PLATFORM=moore fp16
+
+NF4_DIR := 03_nf4_dequant/ikko
+NF4_VERIFY := $(NF4_DIR)/verify_mae.py
+
+# A bare "fp16" or "bf16" goal on the command line selects the data type.
+ifneq ($(filter fp16,$(MAKECMDGOALS)),)
+DTYPE := fp16
+endif
+ifneq ($(filter bf16,$(MAKECMDGOALS)),)
+DTYPE := bf16
+endif
+
+ifneq ($(filter $(DTYPE),bf16 fp16),$(DTYPE))
+$(error Invalid DTYPE='$(DTYPE)'. Use bf16 or fp16)
+endif
+
+# =========================================================
+# Platform selection
+# =========================================================
+ifeq ($(PLATFORM),nvidia)
+    CC := /usr/local/cuda/bin/nvcc
+    SRC_SUFFIX := cu
+    CFLAGS := -O3 -std=c++17 -arch=sm_80
+    EXTRA_LIBS :=
+    NF4_BIN := $(NF4_DIR)/mainla
+    RUN_ENV := CUDA_VISIBLE_DEVICES=$(CUDA_DEVICE)
+
+else ifeq ($(PLATFORM),metax)
+    CC := mxcc
+    SRC_SUFFIX := maca
+    CFLAGS := -O3 -std=c++17
+    EXTRA_LIBS :=
+    NF4_BIN := $(NF4_DIR)/mainla_maca
+    RUN_ENV :=
+
+else ifeq ($(PLATFORM),moore)
+    CC := mcc
+    SRC_SUFFIX := mu
+    CFLAGS := -O3 -std=c++11
+    EXTRA_LIBS := -I/usr/local/musa/include \
+                  -L/usr/lib/gcc/x86_64-linux-gnu/11/ \
+                  -L/usr/local/musa/lib \
+                  -lmusart
+    NF4_BIN := $(NF4_DIR)/mainla_mu
+    RUN_ENV :=
+
+else ifeq ($(PLATFORM),iluvatar)
+    CC := clang++
+    SRC_SUFFIX := cu
+    CFLAGS := -O3 -std=c++17
+    EXTRA_LIBS := -lcudart -I/usr/local/corex/include -L/usr/local/corex/lib64 -fPIC
+    NF4_BIN := $(NF4_DIR)/mainla_iluvatar
+    RUN_ENV := CUDA_VISIBLE_DEVICES=$(CUDA_DEVICE)
+
+else
+    $(error Unsupported PLATFORM '$(PLATFORM)' (expected: nvidia, metax, moore, iluvatar))
+endif
+
+NF4_SRC := $(NF4_DIR)/mainla.$(SRC_SUFFIX)
+
+# Output path passed to the binary; fp16 runs get a separate file name.
+ifeq ($(DTYPE),fp16)
+NF4_OUTPUT := $(NF4_DIR)/data/output_fp16.bin
+else
+NF4_OUTPUT := $(NF4_DIR)/data/output.bin
+endif
+
+.PHONY: nf4 run verify clean help bf16 fp16
+
+help:
+	@echo "Usage:"
+	@echo "  make nf4 PLATFORM=nvidia"
+	@echo "  make nf4 PLATFORM=nvidia DTYPE=fp16"
+	@echo "  make nf4 PLATFORM=metax"
+	@echo "  make nf4 PLATFORM=moore"
+	@echo "  make nf4 PLATFORM=iluvatar"
+	@echo "  make run PLATFORM=metax"
+	@echo "  make verify"
+	@echo "  make clean"
+
+# Build if needed, run the kernel, then check the result.
+# NOTE(review): the verifier is started without arguments, so it uses its own
+# default output path even when DTYPE=fp16 selects output_fp16.bin -- confirm
+# that this is intended.
+nf4: $(NF4_BIN)
+	@echo "=== [NF4] Running $(NF4_BIN) ==="
+	@echo "=== PLATFORM=$(PLATFORM), DTYPE=$(DTYPE) ==="
+	$(RUN_ENV) ./$(NF4_BIN) $(DTYPE) $(NF4_OUTPUT)
+	@echo "=== [NF4] Verifying MAE ==="
+	$(PYTHON) $(NF4_VERIFY)
+
+# TMPDIR is set to a local scratch directory for the compiler invocation.
+$(NF4_BIN): $(NF4_SRC)
+	@echo "=== [NF4] Compiling $(NF4_SRC) with $(CC) ==="
+	@mkdir -p .tmp
+	TMPDIR=$(CURDIR)/.tmp $(CC) $(CFLAGS) $(NF4_SRC) -o $(NF4_BIN) $(EXTRA_LIBS)
+
+# Run an already-built binary without verifying.
+run:
+	@echo "=== [NF4] Running $(NF4_BIN) ==="
+	@echo "=== PLATFORM=$(PLATFORM), DTYPE=$(DTYPE) ==="
+	$(RUN_ENV) ./$(NF4_BIN) $(DTYPE) $(NF4_OUTPUT)
+
+verify:
+	@echo "=== [NF4] Verifying MAE ==="
+	$(PYTHON) $(NF4_VERIFY)
+
+# Remove every platform's binary and the scratch directory created by the build.
+clean:
+	rm -f $(NF4_DIR)/mainla $(NF4_DIR)/mainla_maca $(NF4_DIR)/mainla_mu $(NF4_DIR)/mainla_iluvatar
+	rm -rf .tmp
+
+# No-op targets so that the bare dtype goals are accepted by make.
+bf16 fp16:
+	@:
\ No newline at end of file