From 5ce9829fe1e3f6f6ce64e5b0a15a3e983e9baadc Mon Sep 17 00:00:00 2001
From: scbz4learning <learning@mail.scbz.cn.eu.org>
Date: Tue, 17 Mar 2026 03:30:03 +0000
Subject: [PATCH 1/2] issue/1031 merge T2-1-1

---
 include/infiniop.h                            |   1 +
 include/infiniop/ops/dequantize_gptq.h        |  30 ++
 scripts/python_test.py                        |   3 +-
 .../iluvatar/dequantize_w42f16_iluvatar.cu    |   8 +-
 .../iluvatar/dequantize_w42f16_kernel.cuh     |   2 +-
 .../moore/dequantize_w42f16_kernel.h          |   2 +-
 .../moore/dequantize_w42f16_moore.mu          |   8 +-
 .../nvidia/dequantize_w42f16_kernel.cuh       |   2 +-
 .../nvidia/dequantize_w42f16_nvidia.cu        |  71 ++---
 .../ops/dequantize_gptq/dequantize_gptq.h     |  55 ++++
 .../iluvatar/dequantize_w42f16_iluvatar.cu    | 125 ++++++++
 .../iluvatar/dequantize_w42f16_iluvatar.cuh   |   8 +
 .../iluvatar/dequantize_w42f16_kernel.cuh     |  41 +++
 src/infiniop/ops/dequantize_gptq/info.h       |  51 ++++
 .../moore/dequantize_w42f16_kernel.h          |  41 +++
 .../moore/dequantize_w42f16_moore.h           |   8 +
 .../moore/dequantize_w42f16_moore.mu          | 125 ++++++++
 .../nvidia/dequantize_w42f16_kernel.cuh       | 124 ++++++++
 .../nvidia/dequantize_w42f16_nvidia.cu        | 126 ++++++++
 .../nvidia/dequantize_w42f16_nvidia.cuh       |   8 +
 src/infiniop/ops/dequantize_gptq/operator.cc  | 143 +++++++++
 test/infiniop/dequantize_gptq.py              | 276 ++++++++++++++++++
 test/infiniop/libinfiniop/op_register.py      |  36 +++
 23 files changed, 1249 insertions(+), 45 deletions(-)
 create mode 100644 include/infiniop/ops/dequantize_gptq.h
 create mode 100644 src/infiniop/ops/dequantize_gptq/dequantize_gptq.h
 create mode 100644 src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_iluvatar.cu
 create mode 100644 src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_iluvatar.cuh
 create mode 100644 src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_kernel.cuh
 create mode 100644 src/infiniop/ops/dequantize_gptq/info.h
 create mode 100644 src/infiniop/ops/dequantize_gptq/moore/dequantize_w42f16_kernel.h
 create mode 100644 src/infiniop/ops/dequantize_gptq/moore/dequantize_w42f16_moore.h
 create mode 100644 src/infiniop/ops/dequantize_gptq/moore/dequantize_w42f16_moore.mu
 create mode 100644 src/infiniop/ops/dequantize_gptq/nvidia/dequantize_w42f16_kernel.cuh
 create mode 100644 src/infiniop/ops/dequantize_gptq/nvidia/dequantize_w42f16_nvidia.cu
 create mode 100644 src/infiniop/ops/dequantize_gptq/nvidia/dequantize_w42f16_nvidia.cuh
 create mode 100644 src/infiniop/ops/dequantize_gptq/operator.cc
 create mode 100644 test/infiniop/dequantize_gptq.py

diff --git a/include/infiniop.h b/include/infiniop.h
index 29505093a..de0a51930 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -22,6 +22,7 @@
 #include "infiniop/ops/equal.h"
 #include "infiniop/ops/flash_attention.h"
 #include "infiniop/ops/fmod.h"
+#include "infiniop/ops/dequantize_gptq.h"
 #include "infiniop/ops/gelu.h"
 #include "infiniop/ops/gemm.h"
 #include "infiniop/ops/hardswish.h"
diff --git a/include/infiniop/ops/dequantize_gptq.h b/include/infiniop/ops/dequantize_gptq.h
new file mode 100644
index 000000000..5b0a83351
--- /dev/null
+++ b/include/infiniop/ops/dequantize_gptq.h
@@ -0,0 +1,30 @@
+#ifndef __INFINIOP_DEQUANTIZE_GPTQ_API_H__
+#define __INFINIOP_DEQUANTIZE_GPTQ_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopDequantizeGPTQDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateDequantizeGPTQDescriptor(infiniopHandle_t handle,
+                                                                  infiniopDequantizeGPTQDescriptor_t *desc_ptr,
+                                                                  infiniopTensorDescriptor_t out_desc,
+                                                                  infiniopTensorDescriptor_t qweight_desc,
+                                                                  infiniopTensorDescriptor_t scales_desc,
+                                                                  infiniopTensorDescriptor_t zeros_desc,
+                                                                  infiniopTensorDescriptor_t g_idx_desc); // add g_idx
+
+__C __export infiniStatus_t infiniopGetDequantizeGPTQWorkspaceSize(infiniopDequantizeGPTQDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopDequantizeGPTQ(infiniopDequantizeGPTQDescriptor_t desc,
+                                                  void *workspace,
+                                                  size_t workspace_size,
+                                                  void *out,
+                                                  const void *qweight,
+                                                  const void *scales,
+                                                  const void *zeros,
+                                                  const void *g_idx,  // add g_idx
+                                                  void *stream);
+
+__C __export infiniStatus_t infiniopDestroyDequantizeGPTQDescriptor(infiniopDequantizeGPTQDescriptor_t desc);
+
+#endif
diff --git a/scripts/python_test.py b/scripts/python_test.py
index 13b69a013..d0d46df4d 100644
--- a/scripts/python_test.py
+++ b/scripts/python_test.py
@@ -17,7 +17,8 @@ def run_tests(args):
         "causal_softmax.py",
         "clip.py",
         "conv.py",
-        # "dequantize_awq.py",
+        "dequantize_awq.py",
+        "dequantize_gptq.py",
         "gelu.py",
         "gemm.py",
         # "layer_norm.py",
diff --git a/src/infiniop/ops/dequantize_awq/iluvatar/dequantize_w42f16_iluvatar.cu b/src/infiniop/ops/dequantize_awq/iluvatar/dequantize_w42f16_iluvatar.cu
index d873ca49a..d0f7bc73a 100644
--- a/src/infiniop/ops/dequantize_awq/iluvatar/dequantize_w42f16_iluvatar.cu
+++ b/src/infiniop/ops/dequantize_awq/iluvatar/dequantize_w42f16_iluvatar.cu
@@ -7,7 +7,7 @@
 #include <cuda_fp16.h>
 
 __global__ void __launch_bounds__(64)
-    dequantize_weights(int *__restrict__ B, half *__restrict__ scaling_factors,
+    dequantize_weights_awq(int *__restrict__ B, half *__restrict__ scaling_factors,
                        int *__restrict__ zeros, half *__restrict__ C, int G) {
     // static constexpr uint32_t ZERO = 0x0;
     half B_shared[32 * (128 + 8)];
@@ -29,11 +29,11 @@ __global__ void __launch_bounds__(64)
     half *scaling_factors_ptr2 = scaling_factors + index4;
 
     uint32_t zeros_loaded = *(uint32_t *)(zeros_ptr2);
-    uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded);
+    uint4 B_loaded_zero = dequantize_s4_to_fp16x2_awq(zeros_loaded);
     uint4 B_loaded_scale = *(uint4 *)(scaling_factors_ptr2);
 
     uint32_t B_loaded = *(uint32_t *)B_ptr2;
-    uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded);
+    uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2_awq(B_loaded);
 
     // Reinterpret uint4 components as __half2
     __half2 *B_loaded_fp16_h2 = reinterpret_cast<__half2 *>(&B_loaded_fp16);
@@ -119,7 +119,7 @@ Descriptor::calculate(
     half *scales_ = const_cast<half *>(reinterpret_cast<const half *>(scales));
     int *zeros_ = const_cast<int *>(reinterpret_cast<const int *>(zeros));
 
-    dequantize_weights<<<num_blocks, threads_per_block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
+    dequantize_weights_awq<<<num_blocks, threads_per_block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
         qweight_, scales_, zeros_, out_, group_size);
     return INFINI_STATUS_SUCCESS;
 }
diff --git a/src/infiniop/ops/dequantize_awq/iluvatar/dequantize_w42f16_kernel.cuh b/src/infiniop/ops/dequantize_awq/iluvatar/dequantize_w42f16_kernel.cuh
index 7ef31c5f9..7acdfe122 100644
--- a/src/infiniop/ops/dequantize_awq/iluvatar/dequantize_w42f16_kernel.cuh
+++ b/src/infiniop/ops/dequantize_awq/iluvatar/dequantize_w42f16_kernel.cuh
@@ -11,7 +11,7 @@
  * @param source 输入的32位无符号整数，它打包了8个4-bit的数据。
  * @return 一个 uint4 变量，其中包含8个反量化后的 half 值。
  */
-__device__ __forceinline__ uint4 dequantize_s4_to_fp16x2(uint32_t const &source) {
+__device__ __forceinline__ uint4 dequantize_s4_to_fp16x2_awq(uint32_t const &source) {
     // 步骤 1: 从一个 32-bit 源数据中解包出 8 个 4-bit 无符号整数。
     // 源数据的内存布局被假定为 [v7, v6, v5, v4, v3, v2, v1, v0]，
     // 其中每个 'v' 都是一个 4-bit 的半字节 (nibble)。
diff --git a/src/infiniop/ops/dequantize_awq/moore/dequantize_w42f16_kernel.h b/src/infiniop/ops/dequantize_awq/moore/dequantize_w42f16_kernel.h
index 3bc6c2d6c..5b926e892 100644
--- a/src/infiniop/ops/dequantize_awq/moore/dequantize_w42f16_kernel.h
+++ b/src/infiniop/ops/dequantize_awq/moore/dequantize_w42f16_kernel.h
@@ -11,7 +11,7 @@
  * @param source 输入的32位无符号整数，它打包了8个4-bit的数据。
  * @return 一个 uint4 变量，其中包含8个反量化后的 half 值。
  */
-__device__ __forceinline__ uint4 dequantize_s4_to_fp16x2(uint32_t const &source) {
+__device__ __forceinline__ uint4 dequantize_s4_to_fp16x2_awq(uint32_t const &source) {
     // 步骤 1: 从一个 32-bit 源数据中解包出 8 个 4-bit 无符号整数。
     // 源数据的内存布局被假定为 [v7, v6, v5, v4, v3, v2, v1, v0]，
     // 其中每个 'v' 都是一个 4-bit 的半字节 (nibble)。
diff --git a/src/infiniop/ops/dequantize_awq/moore/dequantize_w42f16_moore.mu b/src/infiniop/ops/dequantize_awq/moore/dequantize_w42f16_moore.mu
index e1b1f4fc3..970a58cd4 100644
--- a/src/infiniop/ops/dequantize_awq/moore/dequantize_w42f16_moore.mu
+++ b/src/infiniop/ops/dequantize_awq/moore/dequantize_w42f16_moore.mu
@@ -7,7 +7,7 @@
 #include <musa_fp16.h>
 
 __global__ void __launch_bounds__(64)
-    dequantize_weights(int *__restrict__ B, half *__restrict__ scaling_factors,
+    dequantize_weights_awq(int *__restrict__ B, half *__restrict__ scaling_factors,
                        int *__restrict__ zeros, half *__restrict__ C, int G) {
     // static constexpr uint32_t ZERO = 0x0;
     half B_shared[32 * (128 + 8)];
@@ -29,11 +29,11 @@ __global__ void __launch_bounds__(64)
     half *scaling_factors_ptr2 = scaling_factors + index4;
 
     uint32_t zeros_loaded = *(uint32_t *)(zeros_ptr2);
-    uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded);
+    uint4 B_loaded_zero = dequantize_s4_to_fp16x2_awq(zeros_loaded);
     uint4 B_loaded_scale = *(uint4 *)(scaling_factors_ptr2);
 
     uint32_t B_loaded = *(uint32_t *)B_ptr2;
-    uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded);
+    uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2_awq(B_loaded);
 
     // Reinterpret uint4 components as __half2
     __half2 *B_loaded_fp16_h2 = reinterpret_cast<__half2 *>(&B_loaded_fp16);
@@ -119,7 +119,7 @@ Descriptor::calculate(
     half *scales_ = const_cast<half *>(reinterpret_cast<const half *>(scales));
     int *zeros_ = const_cast<int *>(reinterpret_cast<const int *>(zeros));
 
-    dequantize_weights<<<num_blocks, threads_per_block, 0, reinterpret_cast<musaStream_t>(stream)>>>(
+    dequantize_weights_awq<<<num_blocks, threads_per_block, 0, reinterpret_cast<musaStream_t>(stream)>>>(
         qweight_, scales_, zeros_, out_, group_size);
     return INFINI_STATUS_SUCCESS;
 }
diff --git a/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_kernel.cuh b/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_kernel.cuh
index d1dcc0f44..133e0dcb5 100644
--- a/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_kernel.cuh
+++ b/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_kernel.cuh
@@ -1,6 +1,6 @@
 #pragma once
 
-__device__ uint4 dequantize_s4_to_fp16x2(uint32_t const &source) {
+__device__ uint4 dequantize_s4_to_fp16x2_awq(uint32_t const &source) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
     // 步骤 1: 从一个 32-bit 源数据中解包出 8 个 4-bit 无符号整数。
     // 源数据的内存布局被假定为 [v7, v6, v5, v4, v3, v2, v1, v0]，
diff --git a/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu b/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu
index 61e5b42fc..152d56398 100644
--- a/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu
+++ b/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu
@@ -10,33 +10,36 @@
 
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 750)
 __global__ void __launch_bounds__(64)
-    dequantize_weights(int *__restrict__ B, half *__restrict__ scaling_factors,
-                       int *__restrict__ zeros, half *__restrict__ C, int G) {
+    dequantize_weights_awq(int *__restrict__ B, half *__restrict__ scaling_factors,
+                       int *__restrict__ zeros, half *__restrict__ C, int G,
+                       int out_features, int in_features) {
     // static constexpr uint32_t ZERO = 0x0;
-    half B_shared[32 * (128 + 8)];
 
-    half *B_shared_ptr2 = B_shared;
-
-    int N = blockDim.x * gridDim.x; // 2
     int col = (blockIdx.x * blockDim.x + threadIdx.x);
     int row = (blockIdx.y * blockDim.y + threadIdx.y);
-    int index1 = 8 * col + 8 * row * N;
+
+    // 边界检查，防止越界访问
+    if (col >= out_features || row >= in_features) return;
+
+    // 每个元素在输出中的起始地址：行主序，连续 8 个 half
+    int index1 = 8 * col + 8 * row * out_features;
     half *C_ptr2 = C + index1;
 
-    int index2 = col + row * N;
+    int index2 = col + row * out_features;
     int *B_ptr2 = B + index2;
 
-    int index3 = col + (int)(row / G) * N;
+    int index3 = col + (int)(row / G) * out_features;
     int *zeros_ptr2 = zeros + index3;
-    int index4 = 8 * col + (int)(row / G) * N * 8;
+
+    int index4 = 8 * col + (int)(row / G) * out_features * 8;
     half *scaling_factors_ptr2 = scaling_factors + index4;
 
     uint32_t zeros_loaded = *(uint32_t *)(zeros_ptr2);
-    uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded);
+    uint4 B_loaded_zero = dequantize_s4_to_fp16x2_awq(zeros_loaded);
     uint4 B_loaded_scale = *(uint4 *)(scaling_factors_ptr2);
 
     uint32_t B_loaded = *(uint32_t *)B_ptr2;
-    uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded);
+    uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2_awq(B_loaded);
 
     // Reinterpret uint4 components as __half2
     __half2 *B_loaded_fp16_h2 = reinterpret_cast<__half2 *>(&B_loaded_fp16);
@@ -55,42 +58,43 @@ __global__ void __launch_bounds__(64)
     B_loaded_fp16_h2[2] = __hfma2(B_loaded_fp16_h2[2], B_loaded_scale_h2[2], __float2half2_rn(0.0f));
     B_loaded_fp16_h2[3] = __hfma2(B_loaded_fp16_h2[3], B_loaded_scale_h2[3], __float2half2_rn(0.0f));
 
-    // Store back to shared memory
-    *(uint4 *)B_shared_ptr2 = B_loaded_fp16;
-
+    // 直接写回全局内存输出
+    half *out_vec = reinterpret_cast<half *>(&B_loaded_fp16);
+    #pragma unroll
     for (int i = 0; i < 8; ++i) {
-        *(C_ptr2 + i) = B_shared[i];
+        C_ptr2[i] = out_vec[i];
     }
 }
 #else
 __global__ void __launch_bounds__(64)
-    dequantize_weights(int *__restrict__ B, half *__restrict__ scaling_factors,
-                       int *__restrict__ zeros, half *__restrict__ C, int group_size) {
+    dequantize_weights_awq(int *__restrict__ B, half *__restrict__ scaling_factors,
+                       int *__restrict__ zeros, half *__restrict__ C, int group_size,
+                       int out_features, int in_features) {
     static constexpr uint32_t ZERO = 0x0;
-    half B_shared[32 * (128 + 8)];
 
-    half *B_shared_ptr2 = B_shared;
-
-    int N = blockDim.x * gridDim.x; // 2
     int col = (blockIdx.x * blockDim.x + threadIdx.x);
     int row = blockIdx.y * blockDim.y + threadIdx.y;
-    int index1 = 8 * col + 8 * row * N;
+
+    // 边界检查，防止越界访问
+    if (col >= out_features || row >= in_features) return;
+
+    int index1 = 8 * col + 8 * row * out_features;
     half *C_ptr2 = C + index1;
 
-    int index2 = col + row * N;
+    int index2 = col + row * out_features;
     int *B_ptr2 = B + index2;
 
-    int index3 = col + (int)(row / group_size) * N;
+    int index3 = col + (int)(row / group_size) * out_features;
     int *zeros_ptr2 = zeros + index3;
-    int index4 = 8 * col + (int)(row / group_size) * N * 8;
+    int index4 = 8 * col + (int)(row / group_size) * out_features * 8;
     half *scaling_factors_ptr2 = scaling_factors + index4;
 
     uint32_t zeros_loaded = *(uint32_t *)(zeros_ptr2);
-    uint4 B_loaded_zero = dequantize_s4_to_fp16x2(zeros_loaded);
+    uint4 B_loaded_zero = dequantize_s4_to_fp16x2_awq(zeros_loaded);
     uint4 B_loaded_scale = *(uint4 *)(scaling_factors_ptr2);
 
     uint32_t B_loaded = *(uint32_t *)B_ptr2;
-    uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2(B_loaded);
+    uint4 B_loaded_fp16 = dequantize_s4_to_fp16x2_awq(B_loaded);
     asm volatile("sub.f16x2 %0, %1, %2;\n"
                  : "=r"(B_loaded_fp16.x)
                  : "r"(B_loaded_fp16.x), "r"(B_loaded_zero.x));
@@ -116,10 +120,11 @@ __global__ void __launch_bounds__(64)
                  : "=r"(B_loaded_fp16.w)
                  : "r"(B_loaded_fp16.w), "r"(B_loaded_scale.w), "r"(ZERO));
 
-    *(uint4 *)B_shared_ptr2 = B_loaded_fp16;
-
+    // 直接写回全局内存输出
+    half *out_vec = reinterpret_cast<half *>(&B_loaded_fp16);
+    #pragma unroll
     for (int i = 0; i < 8; ++i) {
-        *(C_ptr2 + i) = B_shared[i];
+        C_ptr2[i] = out_vec[i];
     }
 }
 #endif
@@ -183,8 +188,8 @@ Descriptor::calculate(
     half *scales_ = const_cast<half *>(reinterpret_cast<const half *>(scales));
     int *zeros_ = const_cast<int *>(reinterpret_cast<const int *>(zeros));
 
-    dequantize_weights<<<num_blocks, threads_per_block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
-        qweight_, scales_, zeros_, out_, group_size);
+    dequantize_weights_awq<<<num_blocks, threads_per_block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
+        qweight_, scales_, zeros_, out_, group_size, out_features, in_features);
 
     return INFINI_STATUS_SUCCESS;
 }
diff --git a/src/infiniop/ops/dequantize_gptq/dequantize_gptq.h b/src/infiniop/ops/dequantize_gptq/dequantize_gptq.h
new file mode 100644
index 000000000..82898fbaf
--- /dev/null
+++ b/src/infiniop/ops/dequantize_gptq/dequantize_gptq.h
@@ -0,0 +1,55 @@
+#ifndef __DEQUANTIZE_GPTQ_H__
+#define __DEQUANTIZE_GPTQ_H__
+
+#include "../../../utils.h"
+#include "../../operator.h"
+#include "../../tensor.h"
+#include "info.h"
+
+#define DESCRIPTOR(NAMESPACE)                                    \
+                                                                 \
+    namespace op::dequantize_gptq::NAMESPACE {                    \
+    class Descriptor final : public InfiniopDescriptor {         \
+        struct Opaque;                                           \
+        Opaque *_opaque;                                         \
+        DequantizeGPTQInfo _info;                                 \
+        size_t _workspace_size;                                  \
+                                                                 \
+        Descriptor(                                              \
+            size_t workspace_size_,                              \
+            Opaque *opaque,                                      \
+            DequantizeGPTQInfo info,                              \
+            infiniDevice_t device_type,                          \
+            int device_id)                                       \
+            : InfiniopDescriptor{device_type, device_id},        \
+              _opaque(opaque),                                   \
+              _info(info),                                       \
+              _workspace_size(workspace_size_) {}                \
+                                                                 \
+    public:                                                      \
+        ~Descriptor();                                           \
+                                                                 \
+        size_t workspaceSize() const { return _workspace_size; } \
+                                                                 \
+        static infiniStatus_t create(                            \
+            infiniopHandle_t handle,                             \
+            Descriptor **desc_ptr,                               \
+            infiniopTensorDescriptor_t out_desc,                 \
+            infiniopTensorDescriptor_t qweight_desc,             \
+            infiniopTensorDescriptor_t scales_desc,              \
+            infiniopTensorDescriptor_t zeros_desc,               \
+            infiniopTensorDescriptor_t g_idx_desc);              \
+                                                                 \
+        infiniStatus_t calculate(                                \
+            void *workspace,                                     \
+            size_t workspace_size,                               \
+            void *out,                                           \
+            const void *qweight,                                 \
+            const void *scales,                                  \
+            const void *zeros,                                   \
+            const void *g_idx,                                  \
+            void *stream) const;                                 \
+    };                                                           \
+    }
+
+#endif //__DEQUANTIZE_GPTQ_H__
diff --git a/src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_iluvatar.cu b/src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_iluvatar.cu
new file mode 100644
index 000000000..a0b79d0ca
--- /dev/null
+++ b/src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_iluvatar.cu
@@ -0,0 +1,125 @@
+#include "../../../devices/nvidia/nvidia_handle.cuh"
+#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
+#include "dequantize_w42f16_iluvatar.cuh"
+#include "dequantize_w42f16_kernel.cuh"
+
+#include "../dequantize_gptq.h"
+#include <cuda_fp16.h>
+#include <cstdint>
+
+namespace op::dequantize_gptq::iluvatar {
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::nvidia::Handle::Internal> internal;
+};
+
+Descriptor::~Descriptor() { delete _opaque; }
+
+// 对齐 nvidia 版：支持 g_idx
+// qweight: [in_packed, out_features] packing 8 input channels per word
+// zeros:   [num_groups, out_packed] packing 8 output channels per word
+// scales:  [num_groups, out_features], g_idx: [in_features]
+__global__ void __launch_bounds__(128)
+dequantize_weights_gptq(const uint32_t *__restrict__ qweight,
+                        const half     *__restrict__ scales,
+                        const uint32_t *__restrict__ zeros,
+                        const int      *__restrict__ g_idx,
+                        half           *__restrict__ out,
+                        int in_features,
+                        int out_features,
+                        int out_packed,   // ceil(out_features / 8)
+                        int num_groups) {
+    const int col_pack = blockIdx.x * blockDim.x + threadIdx.x; // packed output column
+    const int row      = blockIdx.y * blockDim.y + threadIdx.y; // real input row
+    if (col_pack >= out_packed || row >= in_features) return;
+
+    const int gid_raw = g_idx ? g_idx[row] : 0;
+    const int gid = ((gid_raw % num_groups) + num_groups) % num_groups;
+
+    const int pack_row = row >> 3;          // packed input row (8 rows per pack)
+    const int q_shift  = (row & 7) * 4;     // nibble shift within uint32
+
+    const uint32_t zeros_loaded = zeros[gid * out_packed + col_pack];
+
+    const int col_base   = col_pack << 3;  // 8 real cols per pack
+    const int scale_base = gid * out_features + col_base;
+
+    #pragma unroll
+    for (int j = 0; j < 8; ++j) {
+        const int col = col_base + j;
+        if (col >= out_features) break;
+
+        const uint32_t q_loaded = qweight[pack_row * out_features + col];
+        const int q_nib = (q_loaded >> q_shift) & 0xF;
+
+        const int z_nib = (zeros_loaded >> (j * 4)) & 0xF;
+        const half scale = scales[scale_base + j];
+
+        // aligned with nvidia: (q - (z + 1)) * s
+        const float v = float(q_nib - (z_nib + 1)) * __half2float(scale);
+        out[row * out_features + col] = __float2half(v);
+    }
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t qweight_desc,
+    infiniopTensorDescriptor_t scales_desc,
+    infiniopTensorDescriptor_t zeros_desc,
+    infiniopTensorDescriptor_t g_idx_desc) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto result = DequantizeGPTQInfo::create(out_desc, qweight_desc, scales_desc, zeros_desc, g_idx_desc);
+
+    *desc_ptr = new Descriptor(
+        0,
+        new Opaque{handle->internal()},
+        result.take(),
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *out,
+    const void *qweight,
+    const void *scales,
+    const void *zeros,
+    const void *g_idx,
+    void *stream) const {
+
+    (void)workspace;
+    (void)workspace_size;
+
+    const int in_features  = _info.in_features();
+    const int out_features = _info.out_features();
+    const int out_packed   = _info.out_packed();
+    const int in_packed    = _info.in_packed();
+    const int num_groups   = _info.num_groups();
+
+    if (num_groups <= 0 || in_features <= 0 || out_features <= 0 || out_packed <= 0 || in_packed <= 0)
+        return INFINI_STATUS_BAD_PARAM;
+
+    constexpr int BLOCK_X = 16; // packed columns
+    constexpr int BLOCK_Y = 4;  // rows
+    dim3 threads(BLOCK_X, BLOCK_Y);
+    dim3 blocks((out_packed + BLOCK_X - 1) / BLOCK_X,
+                (in_features + BLOCK_Y - 1) / BLOCK_Y);
+
+    dequantize_weights_gptq<<<blocks, threads, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
+        reinterpret_cast<const uint32_t *>(qweight),
+        reinterpret_cast<const half *>(scales),
+        reinterpret_cast<const uint32_t *>(zeros),
+        reinterpret_cast<const int *>(g_idx),
+        reinterpret_cast<half *>(out),
+        in_features, out_features, out_packed, num_groups);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::dequantize_gptq::iluvatar
diff --git a/src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_iluvatar.cuh b/src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_iluvatar.cuh
new file mode 100644
index 000000000..2916b0961
--- /dev/null
+++ b/src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_iluvatar.cuh
@@ -0,0 +1,8 @@
+#ifndef __DEQUANTIZE_GPTQ_ILUVATAR_CUH__
+#define __DEQUANTIZE_GPTQ_ILUVATAR_CUH__
+
+#include "../dequantize_gptq.h"
+
+DESCRIPTOR(iluvatar)
+
+#endif // __DEQUANTIZE_GPTQ_ILUVATAR_CUH__
diff --git a/src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_kernel.cuh b/src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_kernel.cuh
new file mode 100644
index 000000000..0cbf67743
--- /dev/null
+++ b/src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_kernel.cuh
@@ -0,0 +1,41 @@
+#pragma once
+#include <cuda_fp16.h>
+#include <cstdint>
+
+/**
+ * @brief Unpack 8x 4-bit unsigned integers (0..15) from a uint32_t into 8 half values.
+ *
+ * GPTQ dequant is applied outside this helper (aligned with nvidia impl):
+ *   out = (q - (z + 1)) * s
+ *
+ * Output order matches interleaved half2 packing:
+ *   (v0,v4), (v1,v5), (v2,v6), (v3,v7)
+ */
+__device__ __forceinline__ uint4 dequantize_s4_to_fp16x2_gptq(uint32_t const &source) {
+    const unsigned int v0 = (source >> 0) & 0x0F;
+    const unsigned int v1 = (source >> 4) & 0x0F;
+    const unsigned int v2 = (source >> 8) & 0x0F;
+    const unsigned int v3 = (source >> 12) & 0x0F;
+    const unsigned int v4 = (source >> 16) & 0x0F;
+    const unsigned int v5 = (source >> 20) & 0x0F;
+    const unsigned int v6 = (source >> 24) & 0x0F;
+    const unsigned int v7 = (source >> 28) & 0x0F;
+
+    // NOTE: GPTQ uses unsigned q/z in [0,15]. No "-8" signed mapping here.
+    const half hv0 = half(v0);
+    const half hv1 = half(v1);
+    const half hv2 = half(v2);
+    const half hv3 = half(v3);
+    const half hv4 = half(v4);
+    const half hv5 = half(v5);
+    const half hv6 = half(v6);
+    const half hv7 = half(v7);
+
+    uint4 result;
+    __half2 *p = reinterpret_cast<__half2 *>(&result);
+    p[0] = __halves2half2(hv0, hv4);
+    p[1] = __halves2half2(hv1, hv5);
+    p[2] = __halves2half2(hv2, hv6);
+    p[3] = __halves2half2(hv3, hv7);
+    return result;
+}
diff --git a/src/infiniop/ops/dequantize_gptq/info.h b/src/infiniop/ops/dequantize_gptq/info.h
new file mode 100644
index 000000000..9c12237c2
--- /dev/null
+++ b/src/infiniop/ops/dequantize_gptq/info.h
@@ -0,0 +1,51 @@
+#ifndef __DEQUANTIZE_GPTQ_INFO_H__
+#define __DEQUANTIZE_GPTQ_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include <vector>
+
+#include <cassert>
+
+namespace op::dequantize_gptq {
+
+class DequantizeGPTQInfo {
+    DequantizeGPTQInfo() = default;
+
+public:
+    int _in_features, _out_features, _num_groups, _out_packed, _in_packed;
+
+    int in_features() const { return _in_features; }
+    int out_features() const { return _out_features; }
+    int num_groups() const { return _num_groups; }
+    int out_packed() const { return _out_packed; }
+    int in_packed() const { return _in_packed; }
+
+    static utils::Result<DequantizeGPTQInfo> create(
+        infiniopTensorDescriptor_t out_desc,
+        infiniopTensorDescriptor_t qweight_desc,
+        infiniopTensorDescriptor_t scales_desc,
+        infiniopTensorDescriptor_t zeros_desc,
+        infiniopTensorDescriptor_t g_idx_desc) {
+
+        const int _in_features   = g_idx_desc->dim(0);          // real input channels
+        const int _in_packed     = qweight_desc->dim(0);        // ceil(in_features / 8)
+        const int _out_features  = qweight_desc->dim(1);        // real output channels
+        const int _num_groups    = scales_desc->dim(0);         // should be in_features / group_size
+        const int _out_packed    = zeros_desc->dim(1);          // ceil(out_features / 8)
+
+        assert(out_desc->dim(0) == _in_features);
+        assert(out_desc->dim(1) == _out_features);
+        assert(_in_packed == (_in_features + 7) / 8);
+        assert(scales_desc->dim(1) == _out_features);
+        assert(_num_groups == zeros_desc->dim(0));
+        assert(_out_packed == (_out_features + 7) / 8);
+
+        return utils::Result<DequantizeGPTQInfo>(
+            DequantizeGPTQInfo{_in_features, _out_features, _num_groups, _out_packed, _in_packed});
+    }
+};
+
+} // namespace op::dequantize_gptq
+
+#endif // __DEQUANTIZE_GPTQ_INFO_H__
diff --git a/src/infiniop/ops/dequantize_gptq/moore/dequantize_w42f16_kernel.h b/src/infiniop/ops/dequantize_gptq/moore/dequantize_w42f16_kernel.h
new file mode 100644
index 000000000..681404b54
--- /dev/null
+++ b/src/infiniop/ops/dequantize_gptq/moore/dequantize_w42f16_kernel.h
@@ -0,0 +1,41 @@
+#pragma once
+#include <musa_fp16.h> // __half / __half2
+
+/**
+ * @brief Unpack 8x 4-bit unsigned integers (0..15) from a uint32_t into 8 half values.
+ *
+ * GPTQ dequant is applied outside this helper (aligned with nvidia impl):
+ *   out = (q - (z + 1)) * s
+ *
+ * Output order matches the interleaved half2 packing:
+ *   (v0,v4), (v1,v5), (v2,v6), (v3,v7)
+ */
+__device__ __forceinline__ uint4 dequantize_s4_to_fp16x2_gptq(uint32_t const &source) {
+    // unpack 8 nibbles: v0..v7 in [0, 15]
+    const unsigned int v0 = (source >> 0) & 0x0F;
+    const unsigned int v1 = (source >> 4) & 0x0F;
+    const unsigned int v2 = (source >> 8) & 0x0F;
+    const unsigned int v3 = (source >> 12) & 0x0F;
+    const unsigned int v4 = (source >> 16) & 0x0F;
+    const unsigned int v5 = (source >> 20) & 0x0F;
+    const unsigned int v6 = (source >> 24) & 0x0F;
+    const unsigned int v7 = (source >> 28) & 0x0F;
+
+    // NOTE: no "-8" offset here (unlike signed s4). GPTQ uses unsigned q/z.
+    const __half hv0 = __half(v0);
+    const __half hv1 = __half(v1);
+    const __half hv2 = __half(v2);
+    const __half hv3 = __half(v3);
+    const __half hv4 = __half(v4);
+    const __half hv5 = __half(v5);
+    const __half hv6 = __half(v6);
+    const __half hv7 = __half(v7);
+
+    uint4 result;
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+    result_ptr[0] = __halves2half2(hv0, hv4);
+    result_ptr[1] = __halves2half2(hv1, hv5);
+    result_ptr[2] = __halves2half2(hv2, hv6);
+    result_ptr[3] = __halves2half2(hv3, hv7);
+    return result;
+}
\ No newline at end of file
diff --git a/src/infiniop/ops/dequantize_gptq/moore/dequantize_w42f16_moore.h b/src/infiniop/ops/dequantize_gptq/moore/dequantize_w42f16_moore.h
new file mode 100644
index 000000000..689657e0a
--- /dev/null
+++ b/src/infiniop/ops/dequantize_gptq/moore/dequantize_w42f16_moore.h
@@ -0,0 +1,8 @@
+#ifndef __DEQUANTIZE_GPTQ_MOORE_H__
+#define __DEQUANTIZE_GPTQ_MOORE_H__
+
+#include "../dequantize_gptq.h"
+
+DESCRIPTOR(moore)
+
+#endif // __DEQUANTIZE_GPTQ_MOORE_H__
diff --git a/src/infiniop/ops/dequantize_gptq/moore/dequantize_w42f16_moore.mu b/src/infiniop/ops/dequantize_gptq/moore/dequantize_w42f16_moore.mu
new file mode 100644
index 000000000..290d68c8b
--- /dev/null
+++ b/src/infiniop/ops/dequantize_gptq/moore/dequantize_w42f16_moore.mu
@@ -0,0 +1,125 @@
+#include "../../../devices/moore/moore_handle.h"
+#include "../../../devices/moore/moore_kernel_common.h"
+#include "dequantize_w42f16_moore.h"
+// #include "dequantize_w42f16_kernel.h"  // 不再需要（保留也无妨）
+
+#include "../dequantize_gptq.h"
+#include <musa_fp16.h>
+#include <cstdint>
+
+// 对齐 nvidia 版：支持 g_idx
+// qweight: [in_packed, out_features]，每个 uint32 打包 8 个输入通道的 4-bit
+// zeros:   [num_groups, out_packed]，每个 uint32 打包 8 个输出通道的 4-bit
+// scales:  [num_groups, out_features]
+// g_idx:   [in_features]
+__global__ void __launch_bounds__(128)
+dequantize_weights_gptq(const uint32_t *__restrict__ qweight,
+                        const half     *__restrict__ scales,
+                        const uint32_t *__restrict__ zeros,
+                        const int      *__restrict__ g_idx,
+                        half           *__restrict__ out,
+                        int in_features,
+                        int out_features,
+                        int out_packed,   // ceil(out_features / 8)
+                        int num_groups) {
+    const int col_pack = blockIdx.x * blockDim.x + threadIdx.x; // packed output column
+    const int row      = blockIdx.y * blockDim.y + threadIdx.y; // real input row
+    if (col_pack >= out_packed || row >= in_features) return;
+
+    // clamp gid to [0, num_groups)
+    const int gid_raw = g_idx ? g_idx[row] : 0;
+    const int gid = ((gid_raw % num_groups) + num_groups) % num_groups;
+
+    const int pack_row = row >> 3;          // packed input row (8 rows per pack)
+    const int q_shift  = (row & 7) * 4;     // nibble shift within uint32
+
+    const int zero_idx = gid * out_packed + col_pack;
+    const uint32_t zeros_loaded = zeros[zero_idx];
+
+    const int col_base   = col_pack << 3;  // 8 real cols per pack
+    const int scale_base = gid * out_features + col_base;
+
+    #pragma unroll
+    for (int j = 0; j < 8; ++j) {
+        const int col = col_base + j;
+        if (col >= out_features) break;
+
+        const uint32_t q_loaded = qweight[pack_row * out_features + col];
+        const int q_nib = (q_loaded >> q_shift) & 0xF;
+
+        const int z_nib = (zeros_loaded >> (j * 4)) & 0xF;
+        const half scale = scales[scale_base + j];
+
+        // 与 nvidia 版一致： (q - (z + 1)) * s
+        const float v = float(q_nib - (z_nib + 1)) * __half2float(scale);
+        out[row * out_features + col] = __float2half(v);
+    }
+}
+
+namespace op::dequantize_gptq::moore {
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::moore::Handle::Internal> internal;
+};
+
+Descriptor::~Descriptor() {
+    delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t qweight_desc,
+    infiniopTensorDescriptor_t scales_desc,
+    infiniopTensorDescriptor_t zeros_desc,
+    infiniopTensorDescriptor_t g_idx_desc) {
+
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+    auto result = DequantizeGPTQInfo::create(out_desc, qweight_desc, scales_desc, zeros_desc, g_idx_desc);
+
+    *desc_ptr = new Descriptor(
+        0,
+        new Opaque{handle->internal()},
+        result.take(),
+        handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *out,
+    const void *qweight,
+    const void *scales,
+    const void *zeros,
+    const void *g_idx,
+    void *stream) const {
+
+    const int in_features  = _info.in_features();
+    const int out_features = _info.out_features();
+    const int out_packed   = _info.out_packed();
+    const int in_packed    = _info.in_packed();
+    const int num_groups   = _info.num_groups();
+
+    if (num_groups <= 0 || in_features <= 0 || out_features <= 0 || out_packed <= 0 || in_packed <= 0)
+        return INFINI_STATUS_BAD_PARAM;
+
+    constexpr int BLOCK_X = 16; // packed columns
+    constexpr int BLOCK_Y = 4;  // rows
+    dim3 threads(BLOCK_X, BLOCK_Y);
+    dim3 blocks((out_packed + BLOCK_X - 1) / BLOCK_X,
+                (in_features + BLOCK_Y - 1) / BLOCK_Y);
+
+    dequantize_weights_gptq<<<blocks, threads, 0, reinterpret_cast<musaStream_t>(stream)>>>(
+        reinterpret_cast<const uint32_t*>(qweight),
+        reinterpret_cast<const half*>(scales),
+        reinterpret_cast<const uint32_t*>(zeros),
+        reinterpret_cast<const int*>(g_idx),
+        reinterpret_cast<half*>(out),
+        in_features, out_features, out_packed, num_groups);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::dequantize_gptq::moore
diff --git a/src/infiniop/ops/dequantize_gptq/nvidia/dequantize_w42f16_kernel.cuh b/src/infiniop/ops/dequantize_gptq/nvidia/dequantize_w42f16_kernel.cuh
new file mode 100644
index 000000000..d77d888ee
--- /dev/null
+++ b/src/infiniop/ops/dequantize_gptq/nvidia/dequantize_w42f16_kernel.cuh
@@ -0,0 +1,124 @@
+#pragma once
+
+__device__ uint4 dequantize_s4_to_fp16x2_gptq(uint32_t const &source) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
+    // 步骤 1: 从一个 32-bit 源数据中解包出 8 个 4-bit 无符号整数。
+    // 源数据的内存布局被假定为 [v7, v6, v5, v4, v3, v2, v1, v0]，
+    // 其中每个 'v' 都是一个 4-bit 的半字节 (nibble)。
+    const unsigned int v0 = (source >> 0) & 0x0F;
+    const unsigned int v1 = (source >> 4) & 0x0F;
+    const unsigned int v2 = (source >> 8) & 0x0F;
+    const unsigned int v3 = (source >> 12) & 0x0F;
+    const unsigned int v4 = (source >> 16) & 0x0F;
+    const unsigned int v5 = (source >> 20) & 0x0F;
+    const unsigned int v6 = (source >> 24) & 0x0F;
+    const unsigned int v7 = (source >> 28) & 0x0F;
+
+    // 步骤 2: GPTQ 是 (Q - Z) * S。
+    // Q 和 Z 都是无符号数 [0, 15]。
+    // 这里不需要 - offset
+    
+    __half hv0 = __half(v0);
+    __half hv1 = __half(v1);
+    __half hv2 = __half(v2);
+    __half hv3 = __half(v3);
+    __half hv4 = __half(v4);
+    __half hv5 = __half(v5);
+    __half hv6 = __half(v6);
+    __half hv7 = __half(v7);
+
+    // 步骤 3: 将 half 值按 PTX 交错顺序打包成 __half2 并存入 result 中。
+    // 顺序：result_ptr[0]: low=hv0, high=hv4
+    //       result_ptr[1]: low=hv1, high=hv5
+    //       result_ptr[2]: low=hv2, high=hv6
+    //       result_ptr[3]: low=hv3, high=hv7
+    // __halves2half2 函数：low 为第一个参数，high 为第二个参数。
+    uint4 result;
+    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
+
+    result_ptr[0] = __halves2half2(hv0, hv4);
+    result_ptr[1] = __halves2half2(hv1, hv5);
+    result_ptr[2] = __halves2half2(hv2, hv6);
+    result_ptr[3] = __halves2half2(hv3, hv7);
+
+    return result;
+#else
+    uint4 result;
+
+    uint32_t *h = reinterpret_cast<uint32_t *>(&result);
+    uint32_t const i4s = reinterpret_cast<uint32_t const &>(source);
+
+    // First, we extract the i4s and construct an intermediate fp16 number.
+    static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa;
+    static constexpr uint32_t BOTTOM_MASK = 0x000f000f;
+    static constexpr uint32_t TOP_MASK = 0x00f000f0;
+    static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400;
+
+    // Note that the entire sequence only requires 1 shift instruction. This is
+    // thanks to the register packing format and the fact that we force our
+    // integers to be unsigned, and account for this in the fp16 subtractions. In
+    // addition, I exploit the fact that sub and fma have the same throughput in
+    // order to convert elt_23 and elt_67 to fp16 without having to shift them to
+    // the bottom bits before hand.
+
+    // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW
+    // dependency if we issue immediately before required.
+    const uint32_t top_i4s = i4s >> 8;
+    // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400
+    asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+                 : "=r"(h[0])
+                 : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM),
+                   "n"(immLut));
+    // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400
+    asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+                 : "=r"(h[1])
+                 : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM),
+                   "n"(immLut));
+    // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400
+    asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+                 : "=r"(h[2])
+                 : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM),
+                   "n"(immLut));
+    // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400
+    asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
+                 : "=r"(h[3])
+                 : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM),
+                   "n"(immLut));
+
+    // I use inline PTX below because I am not sure if the compiler will emit
+    // float2half instructions if I use the half2 ctor. In this case, I chose
+    // performance reliability over code readability.
+
+    // This is the half2 {1032, 1032} represented as an integer.
+    // static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408;
+    // Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7]
+    static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400;
+    // This is the half2 {1 / 16, 1 / 16} represented as an integer.
+    static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00;
+    // This is the half2 {-72, -72} represented as an integer.
+    // static constexpr uint32_t NEG_72 = 0xd480d480;
+    // Haotian: Let's use {-64, -64}.
+    static constexpr uint32_t NEG_64 = 0xd400d400;
+
+    // Finally, we construct the output numbers.
+    // Convert elt_01
+    asm volatile("sub.f16x2 %0, %1, %2;\n"
+                 : "=r"(h[0])
+                 : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM));
+    // Convert elt_23
+    asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
+                 : "=r"(h[1])
+                 : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64));
+    // Convert elt_45
+    asm volatile("sub.f16x2 %0, %1, %2;\n"
+                 : "=r"(h[2])
+                 : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM));
+    // Convert elt_67
+    asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
+                 : "=r"(h[3])
+                 : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64));
+
+    return result;
+#endif
+    __builtin_unreachable(); // Suppress missing return statement warning
+}
\ No newline at end of file
diff --git a/src/infiniop/ops/dequantize_gptq/nvidia/dequantize_w42f16_nvidia.cu b/src/infiniop/ops/dequantize_gptq/nvidia/dequantize_w42f16_nvidia.cu
new file mode 100644
index 000000000..b77446190
--- /dev/null
+++ b/src/infiniop/ops/dequantize_gptq/nvidia/dequantize_w42f16_nvidia.cu
@@ -0,0 +1,126 @@
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
+
+#include "../../../devices/nvidia/nvidia_handle.cuh"
+#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
+#include "dequantize_w42f16_kernel.cuh"
+#include "dequantize_w42f16_nvidia.cuh"
+#include "../dequantize_gptq.h"
+#include <cuda_fp16.h>
+
+namespace op::dequantize_gptq::nvidia {
+
+struct Descriptor::Opaque {
+    std::shared_ptr<device::nvidia::Handle::Internal> internal;
+};
+
+Descriptor::~Descriptor() { delete _opaque; }
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t qweight_desc,
+    infiniopTensorDescriptor_t scales_desc,
+    infiniopTensorDescriptor_t zeros_desc,
+    infiniopTensorDescriptor_t g_idx_desc) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto result = DequantizeGPTQInfo::create(out_desc, qweight_desc, scales_desc, zeros_desc, g_idx_desc);
+
+    *desc_ptr = new Descriptor(
+        0,
+        new Opaque{handle->internal()},
+        result.take(),
+        handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+// 仅保留这一版内核（支持 g_idx）
+// qweight: [in_packed, out_features] packing 8 input channels per word
+// zeros: [num_groups, out_packed] packing 8 output channels per word
+// scales: [num_groups, out_features], g_idx: [in_features]
+__global__ void __launch_bounds__(128)
+dequantize_weights_gptq(const uint32_t *__restrict__ qweight,
+                        const half     *__restrict__ scales,
+                        const uint32_t *__restrict__ zeros,
+                        const int      *__restrict__ g_idx,
+                        half           *__restrict__ out,
+                        int in_features,
+                        int out_features,
+                        int out_packed,   // ceil(out_features / 8)
+                        int num_groups) {
+    // Each thread handles one packed output column (8 real output cols).
+    const int col_pack = blockIdx.x * blockDim.x + threadIdx.x;     // packed output column
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;          // real input row
+    if (col_pack >= out_packed || row >= in_features) return;
+
+    // Clamp gid to valid range
+    const int gid_raw = g_idx ? g_idx[row] : 0;
+    const int gid = ((gid_raw % num_groups) + num_groups) % num_groups;
+
+    const int pack_row = row >> 3;                                  // packed input row
+
+    const int zero_idx = gid * out_packed + col_pack;               // zeros layout: [num_groups, out_packed]
+    const uint32_t zeros_loaded = zeros[zero_idx];
+
+    const int q_shift = (row & 7) * 4;      // qweight packs 8 input rows
+    const int col_base = col_pack << 3;    // 8 real cols per pack
+    const int scale_base = gid * out_features + col_base;
+
+    #pragma unroll
+    for (int j = 0; j < 8; ++j) {
+        const int col = col_base + j;
+        if (col >= out_features) break;
+
+        const uint32_t q_loaded = qweight[pack_row * out_features + col];
+        const int q_nib = (q_loaded >> q_shift) & 0xF;
+
+        const int z_nib = (zeros_loaded >> (j * 4)) & 0xF;
+        const half scale = scales[scale_base + j];
+
+        // GPTQ quirk: The stored zero point is usually offset by 1.
+        // Standard formula: (q - (z + 1)) * s
+        const float v = float(q_nib - (z_nib + 1)) * __half2float(scale);
+        out[row * out_features + col] = __float2half(v);
+    }
+}
+
+infiniStatus_t
+Descriptor::calculate(
+    void *workspace, size_t workspace_size,
+    void *out,
+    const void *qweight,
+    const void *scales,
+    const void *zeros,
+    const void *g_idx,
+    void *stream) const {
+
+    const int in_features  = _info.in_features();
+    const int out_features = _info.out_features();
+    const int out_packed   = _info.out_packed();
+    const int in_packed    = _info.in_packed();
+    const int num_groups   = _info.num_groups();
+
+    if (num_groups <= 0 || in_features <= 0 || out_features <= 0 || out_packed <= 0 || in_packed <= 0)
+        return INFINI_STATUS_BAD_PARAM;
+
+    constexpr int BLOCK_X = 16;  // packed columns
+    constexpr int BLOCK_Y = 4;   // rows
+    dim3 threads(BLOCK_X, BLOCK_Y);
+    dim3 blocks((out_packed + BLOCK_X - 1) / BLOCK_X,
+                (in_features + BLOCK_Y - 1) / BLOCK_Y);
+
+    dequantize_weights_gptq<<<blocks, threads, 0,
+        reinterpret_cast<cudaStream_t>(stream)>>>(
+            reinterpret_cast<const uint32_t*>(qweight),
+            reinterpret_cast<const half*>(scales),
+            reinterpret_cast<const uint32_t*>(zeros),
+            reinterpret_cast<const int*>(g_idx),
+            reinterpret_cast<half*>(out),
+            in_features, out_features, out_packed, num_groups);
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::dequantize_gptq::nvidia
+
+#endif
\ No newline at end of file
diff --git a/src/infiniop/ops/dequantize_gptq/nvidia/dequantize_w42f16_nvidia.cuh b/src/infiniop/ops/dequantize_gptq/nvidia/dequantize_w42f16_nvidia.cuh
new file mode 100644
index 000000000..ccaa0e429
--- /dev/null
+++ b/src/infiniop/ops/dequantize_gptq/nvidia/dequantize_w42f16_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __DEQUANTIZE_GPTQ_CUDA_CUH__
+#define __DEQUANTIZE_GPTQ_CUDA_CUH__
+
+#include "../dequantize_gptq.h"
+
+DESCRIPTOR(nvidia)
+
+#endif // __DEQUANTIZE_GPTQ_CUDA_CUH__
diff --git a/src/infiniop/ops/dequantize_gptq/operator.cc b/src/infiniop/ops/dequantize_gptq/operator.cc
new file mode 100644
index 000000000..a0790612d
--- /dev/null
+++ b/src/infiniop/ops/dequantize_gptq/operator.cc
@@ -0,0 +1,143 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/dequantize_gptq.h"
+
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
+#include "nvidia/dequantize_w42f16_nvidia.cuh"
+#endif
+#ifdef ENABLE_MOORE_API
+#include "moore/dequantize_w42f16_moore.h"
+#endif
+#ifdef ENABLE_ILUVATAR_API
+#include "iluvatar/dequantize_w42f16_iluvatar.cuh"
+#endif
+
+__C infiniStatus_t infiniopCreateDequantizeGPTQDescriptor(
+    infiniopHandle_t handle,
+    infiniopDequantizeGPTQDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t qweight_desc,
+    infiniopTensorDescriptor_t scales_desc,
+    infiniopTensorDescriptor_t zeros_desc,
+    infiniopTensorDescriptor_t g_idx_desc) {  // add g_idx
+#define CREATE(CASE, NAMESPACE)                                                       \
+    case CASE:                                                                        \
+        return op::dequantize_gptq::NAMESPACE::Descriptor::create(                     \
+            handle,                                                                   \
+            reinterpret_cast<op::dequantize_gptq::NAMESPACE::Descriptor **>(desc_ptr), \
+            out_desc,                                                                 \
+            qweight_desc,                                                             \
+            scales_desc,                                                              \
+            zeros_desc, g_idx_desc)
+
+    switch (handle->device) {
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, iluvatar);
+#endif
+#ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CREATE
+}
+
+__C infiniStatus_t infiniopGetDequantizeGPTQWorkspaceSize(infiniopDequantizeGPTQDescriptor_t desc,
+                                                         size_t *size) {
+#define GET(CASE, NAMESPACE)                                                                                \
+    case CASE:                                                                                              \
+        *size = reinterpret_cast<const op::dequantize_gptq::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, iluvatar);
+#endif
+#ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef GET
+}
+
+__C infiniStatus_t infiniopDequantizeGPTQ(
+    infiniopDequantizeGPTQDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *out,
+    const void *qweight,
+    const void *scales,
+    const void *zeros,
+    const void *g_idx,    // add g_idx
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                                       \
+    case CASE:                                                                           \
+        return reinterpret_cast<const op::dequantize_gptq::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, out, qweight, scales, zeros, g_idx, stream)
+
+    switch (desc->device_type) {
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, iluvatar);
+#endif
+#ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CALCULATE
+}
+
+__C infiniStatus_t
+infiniopDestroyDequantizeGPTQDescriptor(infiniopDequantizeGPTQDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                                           \
+    case CASE:                                                                            \
+        delete reinterpret_cast<const op::dequantize_gptq::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, iluvatar);
+#endif
+#ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef DELETE
+}
+
+// #endif
\ No newline at end of file
diff --git a/test/infiniop/dequantize_gptq.py b/test/infiniop/dequantize_gptq.py
new file mode 100644
index 000000000..a9d1a0404
--- /dev/null
+++ b/test/infiniop/dequantize_gptq.py
@@ -0,0 +1,276 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    TestWorkspace,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+
+# ==============================================================================
+#  Configuration
+# ==============================================================================
+# Each case: (in_features, out_features, group_size)
+_TEST_CASES = [
+    (128, 256, 32),
+    (512, 2048, 128),
+    (1024, 1024, 128),
+    # Non-multiple-of-8 edge case for both dims
+    (513, 257, 32),
+]
+
+_TENSOR_DTYPES = [InfiniDtype.F16]
+
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 0.0, "rtol": 1e-3},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+# ==============================================================================
+#  Reference Implementation (matches CUDA kernel)
+# ==============================================================================
+
+def _unpack_qweight_int4_packed_by_rows(qweight_packed: torch.Tensor, in_features: int) -> torch.Tensor:
+    """qweight_packed: [in_packed, out_features] int32/uint32.
+
+    Packs 8 input rows per 32-bit word; nibble shift = (row_in_pack * 4).
+    Returns: [in_features, out_features] int32 in [0, 15].
+    """
+    assert qweight_packed.dim() == 2
+    shifts = torch.arange(0, 32, 4, device=qweight_packed.device, dtype=torch.int32)
+    # [in_packed, 8, out_features]
+    expanded = torch.bitwise_right_shift(qweight_packed[:, None, :], shifts[None, :, None])
+    vals = torch.bitwise_and(expanded, 0xF).to(torch.int32)
+    out_features = qweight_packed.shape[1]
+    return vals.reshape(-1, out_features)[:in_features, :]
+
+
+def _unpack_zeros_int4_packed_by_cols(zeros_packed: torch.Tensor, out_features: int) -> torch.Tensor:
+    """zeros_packed: [num_groups, out_packed] int32/uint32.
+
+    Packs 8 output cols per 32-bit word; nibble shift = (col_in_pack * 4).
+    Returns: [num_groups, out_features] int32 in [0, 15].
+    """
+    assert zeros_packed.dim() == 2
+    shifts = torch.arange(0, 32, 4, device=zeros_packed.device, dtype=torch.int32)
+    # [num_groups, out_packed, 8]
+    expanded = torch.bitwise_right_shift(zeros_packed[:, :, None], shifts[None, None, :])
+    vals = torch.bitwise_and(expanded, 0xF).to(torch.int32)
+    return vals.reshape(zeros_packed.shape[0], -1)[:, :out_features]
+
+
+def dequantize_gptq_ref(
+    qweight_packed: torch.Tensor,
+    scales: torch.Tensor,
+    zeros_packed: torch.Tensor,
+    g_idx: torch.Tensor,
+    group_size: int,
+) -> torch.Tensor:
+    """Reference matches kernel in dequantize_w42f16_nvidia.cu.
+
+    out[row, col] = (q - (z + 1)) * scale
+    where q is int4 packed by input rows and z is int4 packed by output cols.
+    gid = g_idx[row] (clamped modulo num_groups).
+    """
+    in_features = g_idx.numel()
+    out_features = qweight_packed.shape[1]
+
+    num_groups = scales.shape[0]
+    assert scales.shape == (num_groups, out_features)
+
+    q = _unpack_qweight_int4_packed_by_rows(qweight_packed, in_features)  # [in_features, out_features]
+    z_full = _unpack_zeros_int4_packed_by_cols(zeros_packed, out_features)  # [num_groups, out_features]
+
+    gid_raw = g_idx.to(torch.int32)
+    gid = ((gid_raw % num_groups) + num_groups) % num_groups
+
+    z = z_full[gid]  # [in_features, out_features]
+    s = scales[gid]  # [in_features, out_features]
+
+    out = (q - (z + 1)).to(torch.float32) * s.to(torch.float32)
+    return out.to(torch.float16)
+
+
+def _make_packed_inputs(in_features: int, out_features: int, group_size: int, torch_device: str):
+    # num_groups is implicit (same convention as GPTQ: group per input channel)
+    num_groups = (in_features + group_size - 1) // group_size
+    in_packed = (in_features + 7) // 8
+    out_packed = (out_features + 7) // 8
+
+    # Deterministic group mapping
+    g_idx = (torch.arange(in_features, device=torch_device, dtype=torch.int32) // group_size)
+
+    # Random nibble-level values to avoid relying on sign/shift corner cases
+    q_nib = torch.randint(0, 16, (in_features, out_features), device=torch_device, dtype=torch.int32)
+    z_nib = torch.randint(0, 16, (num_groups, out_features), device=torch_device, dtype=torch.int32)
+
+    # scales in fp16
+    scales = (torch.rand((num_groups, out_features), device=torch_device, dtype=torch.float16) * 0.5 + 0.01)
+
+    # Pack qweight: [in_packed, out_features]
+    qweight_packed = torch.zeros((in_packed, out_features), device=torch_device, dtype=torch.int32)
+    for i in range(8):
+        rows = torch.arange(i, in_features, 8, device=torch_device)
+        if rows.numel() == 0:
+            continue
+        pack_rows = (rows // 8).to(torch.int64)
+        qweight_packed[pack_rows, :] |= (q_nib[rows, :] & 0xF) << (i * 4)
+
+    # Pack zeros: [num_groups, out_packed]
+    zeros_packed = torch.zeros((num_groups, out_packed), device=torch_device, dtype=torch.int32)
+    for j in range(8):
+        cols = torch.arange(j, out_features, 8, device=torch_device)
+        if cols.numel() == 0:
+            continue
+        pack_cols = (cols // 8).to(torch.int64)
+        zeros_packed[:, pack_cols] |= (z_nib[:, cols] & 0xF) << (j * 4)
+
+    return qweight_packed, scales, zeros_packed, g_idx
+
+
+# ==============================================================================
+#  Test Entrypoint
+# ==============================================================================
+
+def test(
+    handle,
+    device,
+    in_features,
+    out_features,
+    group_size,
+    dtype=None,
+    sync=None,
+):
+    print(
+        f"Testing Dequantize GPTQ on {InfiniDeviceNames[device]} with in_features:{in_features}, out_features:{out_features}, group_size:{group_size}"
+    )
+
+    # Infer torch device from a probe tensor created by TestTensor.
+    probe = TestTensor((1,), None, InfiniDtype.U8, device, mode="ones")
+    torch_device = probe.actual_tensor().device
+
+    qweight_packed, scales, zeros_packed, g_idx = _make_packed_inputs(
+        in_features, out_features, group_size, torch_device
+    )
+
+    # Reference
+    ans = dequantize_gptq_ref(qweight_packed, scales, zeros_packed, g_idx, group_size)
+
+    # Wrap into TestTensor so we get descriptors + raw pointers
+    qweight = TestTensor.from_torch(qweight_packed, InfiniDtype.I32, device)
+    zeros = TestTensor.from_torch(zeros_packed, InfiniDtype.I32, device)
+    qscales = TestTensor.from_torch(scales, InfiniDtype.F16, device)
+    g_idx_t = TestTensor.from_torch(g_idx, InfiniDtype.I32, device)
+
+    out = TestTensor((in_features, out_features), None, InfiniDtype.F16, device, mode="zeros")
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateDequantizeGPTQDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            out.descriptor,
+            qweight.descriptor,
+            qscales.descriptor,
+            zeros.descriptor,
+            g_idx_t.descriptor,
+        )
+    )
+
+    # Invalidate descriptors (same pattern as other tests)
+    for tensor in [qweight, zeros, qscales, g_idx_t, out]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetDequantizeGPTQWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, device)
+
+    def lib_dequantize_gptq():
+        check_error(
+            LIBINFINIOP.infiniopDequantizeGPTQ(
+                descriptor,
+                workspace.data(),
+                workspace_size.value,
+                out.data(),
+                qweight.data(),
+                qscales.data(),
+                zeros.data(),
+                g_idx_t.data(),
+                None,
+            )
+        )
+
+    lib_dequantize_gptq()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+
+    if DEBUG:
+        debug(out.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    if PROFILE:
+        profile_operation("PyTorch", lambda: dequantize_gptq_ref(qweight_packed, scales, zeros_packed, g_idx, group_size), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_dequantize_gptq(), device, NUM_PRERUN, NUM_ITERATIONS)
+
+    check_error(LIBINFINIOP.infiniopDestroyDequantizeGPTQDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    # This operator is intended to run on NVIDIA in our current environment.
+    # Make `srun ... python test/infiniop/dequantize_gptq.py` work without extra flags.
+    if not any(
+        getattr(args, name)
+        for name in [
+            "cpu",
+            "nvidia",
+            "iluvatar",
+            "qy",
+            "cambricon",
+            "ascend",
+            "metax",
+            "moore",
+            "kunlun",
+            "hygon",
+        ]
+    ):
+        args.nvidia = True
+
+    devices = [d for d in get_test_devices(args) if InfiniDeviceNames[d] == "NVIDIA"]
+    if not devices:
+        raise RuntimeError("No NVIDIA device selected; run with --nvidia under srun.")
+
+    for device in devices:
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py
index 5a67b830c..023c94e49 100644
--- a/test/infiniop/libinfiniop/op_register.py
+++ b/test/infiniop/libinfiniop/op_register.py
@@ -986,6 +986,42 @@ def dequantize_(lib):
     ]
 
 
+
+@OpRegister.operator
+def dequantize_gptq_(lib):
+    lib.infiniopCreateDequantizeGPTQDescriptor.restype = c_int32
+    lib.infiniopCreateDequantizeGPTQDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+    lib.infiniopGetDequantizeGPTQWorkspaceSize.restype = c_int32
+    lib.infiniopGetDequantizeGPTQWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+    lib.infiniopDequantizeGPTQ.restype = c_int32
+    lib.infiniopDequantizeGPTQ.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+    lib.infiniopDestroyDequantizeGPTQDescriptor.restype = c_int32
+    lib.infiniopDestroyDequantizeGPTQDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
+
+
 @OpRegister.operator
 def per_channel_quant_int8_(lib):
     lib.infiniopCreatePerChannelQuantI8Descriptor.restype = c_int32

From 5a6f2593369bb122db142dec6e9cbc704186d3b4 Mon Sep 17 00:00:00 2001
From: PanZezhong <panzezhong@qiyuanlab.com>
Date: Wed, 18 Mar 2026 02:57:52 +0000
Subject: [PATCH 2/2] issue/1031 fix T2-1-1

---
 include/infiniop/ops/dequantize_gptq.h        |  36 ++---
 scripts/python_test.py                        |   4 +-
 .../iluvatar/dequantize_w42f16_iluvatar.cu    |   2 +-
 .../nvidia/dequantize_w42f16_kernel.cuh       |   2 +-
 .../nvidia/dequantize_w42f16_nvidia.cu        |  20 +--
 .../ops/dequantize_gptq/dequantize_gptq.h     |   8 +-
 .../iluvatar/dequantize_w42f16_iluvatar.cu    |  50 +++----
 .../iluvatar/dequantize_w42f16_kernel.cuh     |  41 ------
 src/infiniop/ops/dequantize_gptq/info.h       |  10 +-
 .../moore/dequantize_w42f16_kernel.h          |  41 ------
 .../moore/dequantize_w42f16_moore.mu          |   1 -
 .../nvidia/dequantize_w42f16_kernel.cuh       | 124 ------------------
 .../nvidia/dequantize_w42f16_nvidia.cu        |  74 ++++++-----
 src/infiniop/ops/dequantize_gptq/operator.cc  |  40 +++---
 test/infiniop/dequantize_gptq.py              |  99 ++++++++------
 xmake/iluvatar.lua                            |   3 +-
 16 files changed, 189 insertions(+), 366 deletions(-)
 delete mode 100644 src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_kernel.cuh
 delete mode 100644 src/infiniop/ops/dequantize_gptq/moore/dequantize_w42f16_kernel.h
 delete mode 100644 src/infiniop/ops/dequantize_gptq/nvidia/dequantize_w42f16_kernel.cuh

diff --git a/include/infiniop/ops/dequantize_gptq.h b/include/infiniop/ops/dequantize_gptq.h
index 5b0a83351..78ff15839 100644
--- a/include/infiniop/ops/dequantize_gptq.h
+++ b/include/infiniop/ops/dequantize_gptq.h
@@ -5,26 +5,26 @@
 
 typedef struct InfiniopDescriptor *infiniopDequantizeGPTQDescriptor_t;
 
-__C __export infiniStatus_t infiniopCreateDequantizeGPTQDescriptor(infiniopHandle_t handle,
-                                                                  infiniopDequantizeGPTQDescriptor_t *desc_ptr,
-                                                                  infiniopTensorDescriptor_t out_desc,
-                                                                  infiniopTensorDescriptor_t qweight_desc,
-                                                                  infiniopTensorDescriptor_t scales_desc,
-                                                                  infiniopTensorDescriptor_t zeros_desc,
-                                                                  infiniopTensorDescriptor_t g_idx_desc); // add g_idx
+__INFINI_C __export infiniStatus_t infiniopCreateDequantizeGPTQDescriptor(infiniopHandle_t handle,
+                                                                          infiniopDequantizeGPTQDescriptor_t *desc_ptr,
+                                                                          infiniopTensorDescriptor_t out_desc,
+                                                                          infiniopTensorDescriptor_t qweight_desc,
+                                                                          infiniopTensorDescriptor_t scales_desc,
+                                                                          infiniopTensorDescriptor_t zeros_desc,
+                                                                          infiniopTensorDescriptor_t g_idx_desc); // add g_idx
 
-__C __export infiniStatus_t infiniopGetDequantizeGPTQWorkspaceSize(infiniopDequantizeGPTQDescriptor_t desc, size_t *size);
+__INFINI_C __export infiniStatus_t infiniopGetDequantizeGPTQWorkspaceSize(infiniopDequantizeGPTQDescriptor_t desc, size_t *size);
 
-__C __export infiniStatus_t infiniopDequantizeGPTQ(infiniopDequantizeGPTQDescriptor_t desc,
-                                                  void *workspace,
-                                                  size_t workspace_size,
-                                                  void *out,
-                                                  const void *qweight,
-                                                  const void *scales,
-                                                  const void *zeros,
-                                                  const void *g_idx,  // add g_idx
-                                                  void *stream);
+__INFINI_C __export infiniStatus_t infiniopDequantizeGPTQ(infiniopDequantizeGPTQDescriptor_t desc,
+                                                          void *workspace,
+                                                          size_t workspace_size,
+                                                          void *out,
+                                                          const void *qweight,
+                                                          const void *scales,
+                                                          const void *zeros,
+                                                          const void *g_idx, // add g_idx
+                                                          void *stream);
 
-__C __export infiniStatus_t infiniopDestroyDequantizeGPTQDescriptor(infiniopDequantizeGPTQDescriptor_t desc);
+__INFINI_C __export infiniStatus_t infiniopDestroyDequantizeGPTQDescriptor(infiniopDequantizeGPTQDescriptor_t desc);
 
 #endif
diff --git a/scripts/python_test.py b/scripts/python_test.py
index d0d46df4d..4879da32a 100644
--- a/scripts/python_test.py
+++ b/scripts/python_test.py
@@ -17,8 +17,8 @@ def run_tests(args):
         "causal_softmax.py",
         "clip.py",
         "conv.py",
-        "dequantize_awq.py",
-        "dequantize_gptq.py",
+        # "dequantize_awq.py",
+        # "dequantize_gptq.py",
         "gelu.py",
         "gemm.py",
         # "layer_norm.py",
diff --git a/src/infiniop/ops/dequantize_awq/iluvatar/dequantize_w42f16_iluvatar.cu b/src/infiniop/ops/dequantize_awq/iluvatar/dequantize_w42f16_iluvatar.cu
index d0f7bc73a..9425a0af2 100644
--- a/src/infiniop/ops/dequantize_awq/iluvatar/dequantize_w42f16_iluvatar.cu
+++ b/src/infiniop/ops/dequantize_awq/iluvatar/dequantize_w42f16_iluvatar.cu
@@ -8,7 +8,7 @@
 
 __global__ void __launch_bounds__(64)
     dequantize_weights_awq(int *__restrict__ B, half *__restrict__ scaling_factors,
-                       int *__restrict__ zeros, half *__restrict__ C, int G) {
+                           int *__restrict__ zeros, half *__restrict__ C, int G) {
     // static constexpr uint32_t ZERO = 0x0;
     half B_shared[32 * (128 + 8)];
 
diff --git a/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_kernel.cuh b/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_kernel.cuh
index 133e0dcb5..8b397a5b0 100644
--- a/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_kernel.cuh
+++ b/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_kernel.cuh
@@ -122,4 +122,4 @@ __device__ uint4 dequantize_s4_to_fp16x2_awq(uint32_t const &source) {
     return result;
 #endif
     __builtin_unreachable(); // Suppress missing return statement warning
-}
\ No newline at end of file
+}
diff --git a/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu b/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu
index 152d56398..0c5251142 100644
--- a/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu
+++ b/src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu
@@ -11,15 +11,17 @@
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 750)
 __global__ void __launch_bounds__(64)
     dequantize_weights_awq(int *__restrict__ B, half *__restrict__ scaling_factors,
-                       int *__restrict__ zeros, half *__restrict__ C, int G,
-                       int out_features, int in_features) {
+                           int *__restrict__ zeros, half *__restrict__ C, int G,
+                           int out_features, int in_features) {
     // static constexpr uint32_t ZERO = 0x0;
 
     int col = (blockIdx.x * blockDim.x + threadIdx.x);
     int row = (blockIdx.y * blockDim.y + threadIdx.y);
 
     // 边界检查，防止越界访问
-    if (col >= out_features || row >= in_features) return;
+    if (col >= out_features || row >= in_features) {
+        return;
+    }
 
     // 每个元素在输出中的起始地址：行主序，连续 8 个 half
     int index1 = 8 * col + 8 * row * out_features;
@@ -60,7 +62,7 @@ __global__ void __launch_bounds__(64)
 
     // 直接写回全局内存输出
     half *out_vec = reinterpret_cast<half *>(&B_loaded_fp16);
-    #pragma unroll
+#pragma unroll
     for (int i = 0; i < 8; ++i) {
         C_ptr2[i] = out_vec[i];
     }
@@ -68,15 +70,17 @@ __global__ void __launch_bounds__(64)
 #else
 __global__ void __launch_bounds__(64)
     dequantize_weights_awq(int *__restrict__ B, half *__restrict__ scaling_factors,
-                       int *__restrict__ zeros, half *__restrict__ C, int group_size,
-                       int out_features, int in_features) {
+                           int *__restrict__ zeros, half *__restrict__ C, int group_size,
+                           int out_features, int in_features) {
     static constexpr uint32_t ZERO = 0x0;
 
     int col = (blockIdx.x * blockDim.x + threadIdx.x);
     int row = blockIdx.y * blockDim.y + threadIdx.y;
 
     // 边界检查，防止越界访问
-    if (col >= out_features || row >= in_features) return;
+    if (col >= out_features || row >= in_features) {
+        return;
+    }
 
     int index1 = 8 * col + 8 * row * out_features;
     half *C_ptr2 = C + index1;
@@ -122,7 +126,7 @@ __global__ void __launch_bounds__(64)
 
     // 直接写回全局内存输出
     half *out_vec = reinterpret_cast<half *>(&B_loaded_fp16);
-    #pragma unroll
+#pragma unroll
     for (int i = 0; i < 8; ++i) {
         C_ptr2[i] = out_vec[i];
     }
diff --git a/src/infiniop/ops/dequantize_gptq/dequantize_gptq.h b/src/infiniop/ops/dequantize_gptq/dequantize_gptq.h
index 82898fbaf..27092a3b3 100644
--- a/src/infiniop/ops/dequantize_gptq/dequantize_gptq.h
+++ b/src/infiniop/ops/dequantize_gptq/dequantize_gptq.h
@@ -8,17 +8,17 @@
 
 #define DESCRIPTOR(NAMESPACE)                                    \
                                                                  \
-    namespace op::dequantize_gptq::NAMESPACE {                    \
+    namespace op::dequantize_gptq::NAMESPACE {                   \
     class Descriptor final : public InfiniopDescriptor {         \
         struct Opaque;                                           \
         Opaque *_opaque;                                         \
-        DequantizeGPTQInfo _info;                                 \
+        DequantizeGPTQInfo _info;                                \
         size_t _workspace_size;                                  \
                                                                  \
         Descriptor(                                              \
             size_t workspace_size_,                              \
             Opaque *opaque,                                      \
-            DequantizeGPTQInfo info,                              \
+            DequantizeGPTQInfo info,                             \
             infiniDevice_t device_type,                          \
             int device_id)                                       \
             : InfiniopDescriptor{device_type, device_id},        \
@@ -47,7 +47,7 @@
             const void *qweight,                                 \
             const void *scales,                                  \
             const void *zeros,                                   \
-            const void *g_idx,                                  \
+            const void *g_idx,                                   \
             void *stream) const;                                 \
     };                                                           \
     }
diff --git a/src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_iluvatar.cu b/src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_iluvatar.cu
index a0b79d0ca..a853394bf 100644
--- a/src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_iluvatar.cu
+++ b/src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_iluvatar.cu
@@ -1,11 +1,10 @@
 #include "../../../devices/nvidia/nvidia_handle.cuh"
 #include "../../../devices/nvidia/nvidia_kernel_common.cuh"
 #include "dequantize_w42f16_iluvatar.cuh"
-#include "dequantize_w42f16_kernel.cuh"
 
 #include "../dequantize_gptq.h"
-#include <cuda_fp16.h>
 #include <cstdint>
+#include <cuda_fp16.h>
 
 namespace op::dequantize_gptq::iluvatar {
 
@@ -20,34 +19,38 @@ Descriptor::~Descriptor() { delete _opaque; }
 // zeros:   [num_groups, out_packed] packing 8 output channels per word
 // scales:  [num_groups, out_features], g_idx: [in_features]
 __global__ void __launch_bounds__(128)
-dequantize_weights_gptq(const uint32_t *__restrict__ qweight,
-                        const half     *__restrict__ scales,
-                        const uint32_t *__restrict__ zeros,
-                        const int      *__restrict__ g_idx,
-                        half           *__restrict__ out,
-                        int in_features,
-                        int out_features,
-                        int out_packed,   // ceil(out_features / 8)
-                        int num_groups) {
+    dequantize_weights_gptq(const uint32_t *__restrict__ qweight,
+                            const half *__restrict__ scales,
+                            const uint32_t *__restrict__ zeros,
+                            const int *__restrict__ g_idx,
+                            half *__restrict__ out,
+                            int in_features,
+                            int out_features,
+                            int out_packed, // ceil(out_features / 8)
+                            int num_groups) {
     const int col_pack = blockIdx.x * blockDim.x + threadIdx.x; // packed output column
-    const int row      = blockIdx.y * blockDim.y + threadIdx.y; // real input row
-    if (col_pack >= out_packed || row >= in_features) return;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;      // real input row
+    if (col_pack >= out_packed || row >= in_features) {
+        return;
+    }
 
     const int gid_raw = g_idx ? g_idx[row] : 0;
     const int gid = ((gid_raw % num_groups) + num_groups) % num_groups;
 
-    const int pack_row = row >> 3;          // packed input row (8 rows per pack)
-    const int q_shift  = (row & 7) * 4;     // nibble shift within uint32
+    const int pack_row = row >> 3;     // packed input row (8 rows per pack)
+    const int q_shift = (row & 7) * 4; // nibble shift within uint32
 
     const uint32_t zeros_loaded = zeros[gid * out_packed + col_pack];
 
-    const int col_base   = col_pack << 3;  // 8 real cols per pack
+    const int col_base = col_pack << 3; // 8 real cols per pack
     const int scale_base = gid * out_features + col_base;
 
-    #pragma unroll
+#pragma unroll
     for (int j = 0; j < 8; ++j) {
         const int col = col_base + j;
-        if (col >= out_features) break;
+        if (col >= out_features) {
+            break;
+        }
 
         const uint32_t q_loaded = qweight[pack_row * out_features + col];
         const int q_nib = (q_loaded >> q_shift) & 0xF;
@@ -96,14 +99,15 @@ infiniStatus_t Descriptor::calculate(
     (void)workspace;
     (void)workspace_size;
 
-    const int in_features  = _info.in_features();
+    const int in_features = _info.in_features();
     const int out_features = _info.out_features();
-    const int out_packed   = _info.out_packed();
-    const int in_packed    = _info.in_packed();
-    const int num_groups   = _info.num_groups();
+    const int out_packed = _info.out_packed();
+    const int in_packed = _info.in_packed();
+    const int num_groups = _info.num_groups();
 
-    if (num_groups <= 0 || in_features <= 0 || out_features <= 0 || out_packed <= 0 || in_packed <= 0)
+    if (num_groups <= 0 || in_features <= 0 || out_features <= 0 || out_packed <= 0 || in_packed <= 0) {
         return INFINI_STATUS_BAD_PARAM;
+    }
 
     constexpr int BLOCK_X = 16; // packed columns
     constexpr int BLOCK_Y = 4;  // rows
diff --git a/src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_kernel.cuh b/src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_kernel.cuh
deleted file mode 100644
index 0cbf67743..000000000
--- a/src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_kernel.cuh
+++ /dev/null
@@ -1,41 +0,0 @@
-#pragma once
-#include <cuda_fp16.h>
-#include <cstdint>
-
-/**
- * @brief Unpack 8x 4-bit unsigned integers (0..15) from a uint32_t into 8 half values.
- *
- * GPTQ dequant is applied outside this helper (aligned with nvidia impl):
- *   out = (q - (z + 1)) * s
- *
- * Output order matches interleaved half2 packing:
- *   (v0,v4), (v1,v5), (v2,v6), (v3,v7)
- */
-__device__ __forceinline__ uint4 dequantize_s4_to_fp16x2_gptq(uint32_t const &source) {
-    const unsigned int v0 = (source >> 0) & 0x0F;
-    const unsigned int v1 = (source >> 4) & 0x0F;
-    const unsigned int v2 = (source >> 8) & 0x0F;
-    const unsigned int v3 = (source >> 12) & 0x0F;
-    const unsigned int v4 = (source >> 16) & 0x0F;
-    const unsigned int v5 = (source >> 20) & 0x0F;
-    const unsigned int v6 = (source >> 24) & 0x0F;
-    const unsigned int v7 = (source >> 28) & 0x0F;
-
-    // NOTE: GPTQ uses unsigned q/z in [0,15]. No "-8" signed mapping here.
-    const half hv0 = half(v0);
-    const half hv1 = half(v1);
-    const half hv2 = half(v2);
-    const half hv3 = half(v3);
-    const half hv4 = half(v4);
-    const half hv5 = half(v5);
-    const half hv6 = half(v6);
-    const half hv7 = half(v7);
-
-    uint4 result;
-    __half2 *p = reinterpret_cast<__half2 *>(&result);
-    p[0] = __halves2half2(hv0, hv4);
-    p[1] = __halves2half2(hv1, hv5);
-    p[2] = __halves2half2(hv2, hv6);
-    p[3] = __halves2half2(hv3, hv7);
-    return result;
-}
diff --git a/src/infiniop/ops/dequantize_gptq/info.h b/src/infiniop/ops/dequantize_gptq/info.h
index 9c12237c2..aef668bb1 100644
--- a/src/infiniop/ops/dequantize_gptq/info.h
+++ b/src/infiniop/ops/dequantize_gptq/info.h
@@ -28,11 +28,11 @@ class DequantizeGPTQInfo {
         infiniopTensorDescriptor_t zeros_desc,
         infiniopTensorDescriptor_t g_idx_desc) {
 
-        const int _in_features   = g_idx_desc->dim(0);          // real input channels
-        const int _in_packed     = qweight_desc->dim(0);        // ceil(in_features / 8)
-        const int _out_features  = qweight_desc->dim(1);        // real output channels
-        const int _num_groups    = scales_desc->dim(0);         // should be in_features / group_size
-        const int _out_packed    = zeros_desc->dim(1);          // ceil(out_features / 8)
+        const int _in_features = g_idx_desc->dim(0);    // real input channels
+        const int _in_packed = qweight_desc->dim(0);    // ceil(in_features / 8)
+        const int _out_features = qweight_desc->dim(1); // real output channels
+        const int _num_groups = scales_desc->dim(0);    // should be in_features / group_size
+        const int _out_packed = zeros_desc->dim(1);     // ceil(out_features / 8)
 
         assert(out_desc->dim(0) == _in_features);
         assert(out_desc->dim(1) == _out_features);
diff --git a/src/infiniop/ops/dequantize_gptq/moore/dequantize_w42f16_kernel.h b/src/infiniop/ops/dequantize_gptq/moore/dequantize_w42f16_kernel.h
deleted file mode 100644
index 681404b54..000000000
--- a/src/infiniop/ops/dequantize_gptq/moore/dequantize_w42f16_kernel.h
+++ /dev/null
@@ -1,41 +0,0 @@
-#pragma once
-#include <musa_fp16.h> // __half / __half2
-
-/**
- * @brief Unpack 8x 4-bit unsigned integers (0..15) from a uint32_t into 8 half values.
- *
- * GPTQ dequant is applied outside this helper (aligned with nvidia impl):
- *   out = (q - (z + 1)) * s
- *
- * Output order matches the interleaved half2 packing:
- *   (v0,v4), (v1,v5), (v2,v6), (v3,v7)
- */
-__device__ __forceinline__ uint4 dequantize_s4_to_fp16x2_gptq(uint32_t const &source) {
-    // unpack 8 nibbles: v0..v7 in [0, 15]
-    const unsigned int v0 = (source >> 0) & 0x0F;
-    const unsigned int v1 = (source >> 4) & 0x0F;
-    const unsigned int v2 = (source >> 8) & 0x0F;
-    const unsigned int v3 = (source >> 12) & 0x0F;
-    const unsigned int v4 = (source >> 16) & 0x0F;
-    const unsigned int v5 = (source >> 20) & 0x0F;
-    const unsigned int v6 = (source >> 24) & 0x0F;
-    const unsigned int v7 = (source >> 28) & 0x0F;
-
-    // NOTE: no "-8" offset here (unlike signed s4). GPTQ uses unsigned q/z.
-    const __half hv0 = __half(v0);
-    const __half hv1 = __half(v1);
-    const __half hv2 = __half(v2);
-    const __half hv3 = __half(v3);
-    const __half hv4 = __half(v4);
-    const __half hv5 = __half(v5);
-    const __half hv6 = __half(v6);
-    const __half hv7 = __half(v7);
-
-    uint4 result;
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-    result_ptr[0] = __halves2half2(hv0, hv4);
-    result_ptr[1] = __halves2half2(hv1, hv5);
-    result_ptr[2] = __halves2half2(hv2, hv6);
-    result_ptr[3] = __halves2half2(hv3, hv7);
-    return result;
-}
\ No newline at end of file
diff --git a/src/infiniop/ops/dequantize_gptq/moore/dequantize_w42f16_moore.mu b/src/infiniop/ops/dequantize_gptq/moore/dequantize_w42f16_moore.mu
index 290d68c8b..d3c31dd52 100644
--- a/src/infiniop/ops/dequantize_gptq/moore/dequantize_w42f16_moore.mu
+++ b/src/infiniop/ops/dequantize_gptq/moore/dequantize_w42f16_moore.mu
@@ -1,7 +1,6 @@
 #include "../../../devices/moore/moore_handle.h"
 #include "../../../devices/moore/moore_kernel_common.h"
 #include "dequantize_w42f16_moore.h"
-// #include "dequantize_w42f16_kernel.h"  // 不再需要（保留也无妨）
 
 #include "../dequantize_gptq.h"
 #include <musa_fp16.h>
diff --git a/src/infiniop/ops/dequantize_gptq/nvidia/dequantize_w42f16_kernel.cuh b/src/infiniop/ops/dequantize_gptq/nvidia/dequantize_w42f16_kernel.cuh
deleted file mode 100644
index d77d888ee..000000000
--- a/src/infiniop/ops/dequantize_gptq/nvidia/dequantize_w42f16_kernel.cuh
+++ /dev/null
@@ -1,124 +0,0 @@
-#pragma once
-
-__device__ uint4 dequantize_s4_to_fp16x2_gptq(uint32_t const &source) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750
-    // 步骤 1: 从一个 32-bit 源数据中解包出 8 个 4-bit 无符号整数。
-    // 源数据的内存布局被假定为 [v7, v6, v5, v4, v3, v2, v1, v0]，
-    // 其中每个 'v' 都是一个 4-bit 的半字节 (nibble)。
-    const unsigned int v0 = (source >> 0) & 0x0F;
-    const unsigned int v1 = (source >> 4) & 0x0F;
-    const unsigned int v2 = (source >> 8) & 0x0F;
-    const unsigned int v3 = (source >> 12) & 0x0F;
-    const unsigned int v4 = (source >> 16) & 0x0F;
-    const unsigned int v5 = (source >> 20) & 0x0F;
-    const unsigned int v6 = (source >> 24) & 0x0F;
-    const unsigned int v7 = (source >> 28) & 0x0F;
-
-    // 步骤 2: GPTQ 是 (Q - Z) * S。
-    // Q 和 Z 都是无符号数 [0, 15]。
-    // 这里不需要 - offset
-    
-    __half hv0 = __half(v0);
-    __half hv1 = __half(v1);
-    __half hv2 = __half(v2);
-    __half hv3 = __half(v3);
-    __half hv4 = __half(v4);
-    __half hv5 = __half(v5);
-    __half hv6 = __half(v6);
-    __half hv7 = __half(v7);
-
-    // 步骤 3: 将 half 值按 PTX 交错顺序打包成 __half2 并存入 result 中。
-    // 顺序：result_ptr[0]: low=hv0, high=hv4
-    //       result_ptr[1]: low=hv1, high=hv5
-    //       result_ptr[2]: low=hv2, high=hv6
-    //       result_ptr[3]: low=hv3, high=hv7
-    // __halves2half2 函数：low 为第一个参数，high 为第二个参数。
-    uint4 result;
-    __half2 *result_ptr = reinterpret_cast<__half2 *>(&result);
-
-    result_ptr[0] = __halves2half2(hv0, hv4);
-    result_ptr[1] = __halves2half2(hv1, hv5);
-    result_ptr[2] = __halves2half2(hv2, hv6);
-    result_ptr[3] = __halves2half2(hv3, hv7);
-
-    return result;
-#else
-    uint4 result;
-
-    uint32_t *h = reinterpret_cast<uint32_t *>(&result);
-    uint32_t const i4s = reinterpret_cast<uint32_t const &>(source);
-
-    // First, we extract the i4s and construct an intermediate fp16 number.
-    static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa;
-    static constexpr uint32_t BOTTOM_MASK = 0x000f000f;
-    static constexpr uint32_t TOP_MASK = 0x00f000f0;
-    static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400;
-
-    // Note that the entire sequence only requires 1 shift instruction. This is
-    // thanks to the register packing format and the fact that we force our
-    // integers to be unsigned, and account for this in the fp16 subtractions. In
-    // addition, I exploit the fact that sub and fma have the same throughput in
-    // order to convert elt_23 and elt_67 to fp16 without having to shift them to
-    // the bottom bits before hand.
-
-    // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW
-    // dependency if we issue immediately before required.
-    const uint32_t top_i4s = i4s >> 8;
-    // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400
-    asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
-                 : "=r"(h[0])
-                 : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM),
-                   "n"(immLut));
-    // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400
-    asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
-                 : "=r"(h[1])
-                 : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM),
-                   "n"(immLut));
-    // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400
-    asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
-                 : "=r"(h[2])
-                 : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM),
-                   "n"(immLut));
-    // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400
-    asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
-                 : "=r"(h[3])
-                 : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM),
-                   "n"(immLut));
-
-    // I use inline PTX below because I am not sure if the compiler will emit
-    // float2half instructions if I use the half2 ctor. In this case, I chose
-    // performance reliability over code readability.
-
-    // This is the half2 {1032, 1032} represented as an integer.
-    // static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408;
-    // Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7]
-    static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400;
-    // This is the half2 {1 / 16, 1 / 16} represented as an integer.
-    static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00;
-    // This is the half2 {-72, -72} represented as an integer.
-    // static constexpr uint32_t NEG_72 = 0xd480d480;
-    // Haotian: Let's use {-64, -64}.
-    static constexpr uint32_t NEG_64 = 0xd400d400;
-
-    // Finally, we construct the output numbers.
-    // Convert elt_01
-    asm volatile("sub.f16x2 %0, %1, %2;\n"
-                 : "=r"(h[0])
-                 : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM));
-    // Convert elt_23
-    asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
-                 : "=r"(h[1])
-                 : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64));
-    // Convert elt_45
-    asm volatile("sub.f16x2 %0, %1, %2;\n"
-                 : "=r"(h[2])
-                 : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM));
-    // Convert elt_67
-    asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
-                 : "=r"(h[3])
-                 : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64));
-
-    return result;
-#endif
-    __builtin_unreachable(); // Suppress missing return statement warning
-}
\ No newline at end of file
diff --git a/src/infiniop/ops/dequantize_gptq/nvidia/dequantize_w42f16_nvidia.cu b/src/infiniop/ops/dequantize_gptq/nvidia/dequantize_w42f16_nvidia.cu
index b77446190..5e1d15eaf 100644
--- a/src/infiniop/ops/dequantize_gptq/nvidia/dequantize_w42f16_nvidia.cu
+++ b/src/infiniop/ops/dequantize_gptq/nvidia/dequantize_w42f16_nvidia.cu
@@ -2,9 +2,8 @@
 
 #include "../../../devices/nvidia/nvidia_handle.cuh"
 #include "../../../devices/nvidia/nvidia_kernel_common.cuh"
-#include "dequantize_w42f16_kernel.cuh"
-#include "dequantize_w42f16_nvidia.cuh"
 #include "../dequantize_gptq.h"
+#include "dequantize_w42f16_nvidia.cuh"
 #include <cuda_fp16.h>
 
 namespace op::dequantize_gptq::nvidia {
@@ -40,37 +39,41 @@ infiniStatus_t Descriptor::create(
 // zeros: [num_groups, out_packed] packing 8 output channels per word
 // scales: [num_groups, out_features], g_idx: [in_features]
 __global__ void __launch_bounds__(128)
-dequantize_weights_gptq(const uint32_t *__restrict__ qweight,
-                        const half     *__restrict__ scales,
-                        const uint32_t *__restrict__ zeros,
-                        const int      *__restrict__ g_idx,
-                        half           *__restrict__ out,
-                        int in_features,
-                        int out_features,
-                        int out_packed,   // ceil(out_features / 8)
-                        int num_groups) {
+    dequantize_weights_gptq(const uint32_t *__restrict__ qweight,
+                            const half *__restrict__ scales,
+                            const uint32_t *__restrict__ zeros,
+                            const int *__restrict__ g_idx,
+                            half *__restrict__ out,
+                            int in_features,
+                            int out_features,
+                            int out_packed, // ceil(out_features / 8)
+                            int num_groups) {
     // Each thread handles one packed output column (8 real output cols).
-    const int col_pack = blockIdx.x * blockDim.x + threadIdx.x;     // packed output column
-    const int row = blockIdx.y * blockDim.y + threadIdx.y;          // real input row
-    if (col_pack >= out_packed || row >= in_features) return;
+    const int col_pack = blockIdx.x * blockDim.x + threadIdx.x; // packed output column
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;      // real input row
+    if (col_pack >= out_packed || row >= in_features) {
+        return;
+    }
 
     // Clamp gid to valid range
     const int gid_raw = g_idx ? g_idx[row] : 0;
     const int gid = ((gid_raw % num_groups) + num_groups) % num_groups;
 
-    const int pack_row = row >> 3;                                  // packed input row
+    const int pack_row = row >> 3; // packed input row
 
-    const int zero_idx = gid * out_packed + col_pack;               // zeros layout: [num_groups, out_packed]
+    const int zero_idx = gid * out_packed + col_pack; // zeros layout: [num_groups, out_packed]
     const uint32_t zeros_loaded = zeros[zero_idx];
 
-    const int q_shift = (row & 7) * 4;      // qweight packs 8 input rows
-    const int col_base = col_pack << 3;    // 8 real cols per pack
+    const int q_shift = (row & 7) * 4;  // qweight packs 8 input rows
+    const int col_base = col_pack << 3; // 8 real cols per pack
     const int scale_base = gid * out_features + col_base;
 
-    #pragma unroll
+#pragma unroll
     for (int j = 0; j < 8; ++j) {
         const int col = col_base + j;
-        if (col >= out_features) break;
+        if (col >= out_features) {
+            break;
+        }
 
         const uint32_t q_loaded = qweight[pack_row * out_features + col];
         const int q_nib = (q_loaded >> q_shift) & 0xF;
@@ -95,32 +98,33 @@ Descriptor::calculate(
     const void *g_idx,
     void *stream) const {
 
-    const int in_features  = _info.in_features();
+    const int in_features = _info.in_features();
     const int out_features = _info.out_features();
-    const int out_packed   = _info.out_packed();
-    const int in_packed    = _info.in_packed();
-    const int num_groups   = _info.num_groups();
+    const int out_packed = _info.out_packed();
+    const int in_packed = _info.in_packed();
+    const int num_groups = _info.num_groups();
 
-    if (num_groups <= 0 || in_features <= 0 || out_features <= 0 || out_packed <= 0 || in_packed <= 0)
+    if (num_groups <= 0 || in_features <= 0 || out_features <= 0 || out_packed <= 0 || in_packed <= 0) {
         return INFINI_STATUS_BAD_PARAM;
+    }
 
-    constexpr int BLOCK_X = 16;  // packed columns
-    constexpr int BLOCK_Y = 4;   // rows
+    constexpr int BLOCK_X = 16; // packed columns
+    constexpr int BLOCK_Y = 4;  // rows
     dim3 threads(BLOCK_X, BLOCK_Y);
     dim3 blocks((out_packed + BLOCK_X - 1) / BLOCK_X,
                 (in_features + BLOCK_Y - 1) / BLOCK_Y);
 
     dequantize_weights_gptq<<<blocks, threads, 0,
-        reinterpret_cast<cudaStream_t>(stream)>>>(
-            reinterpret_cast<const uint32_t*>(qweight),
-            reinterpret_cast<const half*>(scales),
-            reinterpret_cast<const uint32_t*>(zeros),
-            reinterpret_cast<const int*>(g_idx),
-            reinterpret_cast<half*>(out),
-            in_features, out_features, out_packed, num_groups);
+                              reinterpret_cast<cudaStream_t>(stream)>>>(
+        reinterpret_cast<const uint32_t *>(qweight),
+        reinterpret_cast<const half *>(scales),
+        reinterpret_cast<const uint32_t *>(zeros),
+        reinterpret_cast<const int *>(g_idx),
+        reinterpret_cast<half *>(out),
+        in_features, out_features, out_packed, num_groups);
     return INFINI_STATUS_SUCCESS;
 }
 
 } // namespace op::dequantize_gptq::nvidia
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/infiniop/ops/dequantize_gptq/operator.cc b/src/infiniop/ops/dequantize_gptq/operator.cc
index a0790612d..d48430445 100644
--- a/src/infiniop/ops/dequantize_gptq/operator.cc
+++ b/src/infiniop/ops/dequantize_gptq/operator.cc
@@ -12,22 +12,22 @@
 #include "iluvatar/dequantize_w42f16_iluvatar.cuh"
 #endif
 
-__C infiniStatus_t infiniopCreateDequantizeGPTQDescriptor(
+__INFINI_C infiniStatus_t infiniopCreateDequantizeGPTQDescriptor(
     infiniopHandle_t handle,
     infiniopDequantizeGPTQDescriptor_t *desc_ptr,
     infiniopTensorDescriptor_t out_desc,
     infiniopTensorDescriptor_t qweight_desc,
     infiniopTensorDescriptor_t scales_desc,
     infiniopTensorDescriptor_t zeros_desc,
-    infiniopTensorDescriptor_t g_idx_desc) {  // add g_idx
-#define CREATE(CASE, NAMESPACE)                                                       \
-    case CASE:                                                                        \
+    infiniopTensorDescriptor_t g_idx_desc) { // add g_idx
+#define CREATE(CASE, NAMESPACE)                                                        \
+    case CASE:                                                                         \
         return op::dequantize_gptq::NAMESPACE::Descriptor::create(                     \
-            handle,                                                                   \
+            handle,                                                                    \
             reinterpret_cast<op::dequantize_gptq::NAMESPACE::Descriptor **>(desc_ptr), \
-            out_desc,                                                                 \
-            qweight_desc,                                                             \
-            scales_desc,                                                              \
+            out_desc,                                                                  \
+            qweight_desc,                                                              \
+            scales_desc,                                                               \
             zeros_desc, g_idx_desc)
 
     switch (handle->device) {
@@ -50,10 +50,10 @@ __C infiniStatus_t infiniopCreateDequantizeGPTQDescriptor(
 #undef CREATE
 }
 
-__C infiniStatus_t infiniopGetDequantizeGPTQWorkspaceSize(infiniopDequantizeGPTQDescriptor_t desc,
-                                                         size_t *size) {
-#define GET(CASE, NAMESPACE)                                                                                \
-    case CASE:                                                                                              \
+__INFINI_C infiniStatus_t infiniopGetDequantizeGPTQWorkspaceSize(infiniopDequantizeGPTQDescriptor_t desc,
+                                                                 size_t *size) {
+#define GET(CASE, NAMESPACE)                                                                                 \
+    case CASE:                                                                                               \
         *size = reinterpret_cast<const op::dequantize_gptq::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
         return INFINI_STATUS_SUCCESS
 
@@ -76,7 +76,7 @@ __C infiniStatus_t infiniopGetDequantizeGPTQWorkspaceSize(infiniopDequantizeGPTQ
 #undef GET
 }
 
-__C infiniStatus_t infiniopDequantizeGPTQ(
+__INFINI_C infiniStatus_t infiniopDequantizeGPTQ(
     infiniopDequantizeGPTQDescriptor_t desc,
     void *workspace,
     size_t workspace_size,
@@ -84,11 +84,11 @@ __C infiniStatus_t infiniopDequantizeGPTQ(
     const void *qweight,
     const void *scales,
     const void *zeros,
-    const void *g_idx,    // add g_idx
+    const void *g_idx, // add g_idx
     void *stream) {
 
-#define CALCULATE(CASE, NAMESPACE)                                                       \
-    case CASE:                                                                           \
+#define CALCULATE(CASE, NAMESPACE)                                                        \
+    case CASE:                                                                            \
         return reinterpret_cast<const op::dequantize_gptq::NAMESPACE::Descriptor *>(desc) \
             ->calculate(workspace, workspace_size, out, qweight, scales, zeros, g_idx, stream)
 
@@ -112,11 +112,11 @@ __C infiniStatus_t infiniopDequantizeGPTQ(
 #undef CALCULATE
 }
 
-__C infiniStatus_t
+__INFINI_C infiniStatus_t
 infiniopDestroyDequantizeGPTQDescriptor(infiniopDequantizeGPTQDescriptor_t desc) {
 
-#define DELETE(CASE, NAMESPACE)                                                           \
-    case CASE:                                                                            \
+#define DELETE(CASE, NAMESPACE)                                                            \
+    case CASE:                                                                             \
         delete reinterpret_cast<const op::dequantize_gptq::NAMESPACE::Descriptor *>(desc); \
         return INFINI_STATUS_SUCCESS;
 
@@ -140,4 +140,4 @@ infiniopDestroyDequantizeGPTQDescriptor(infiniopDequantizeGPTQDescriptor_t desc)
 #undef DELETE
 }
 
-// #endif
\ No newline at end of file
+// #endif
diff --git a/test/infiniop/dequantize_gptq.py b/test/infiniop/dequantize_gptq.py
index a9d1a0404..e0bc4f44a 100644
--- a/test/infiniop/dequantize_gptq.py
+++ b/test/infiniop/dequantize_gptq.py
@@ -46,7 +46,10 @@
 #  Reference Implementation (matches CUDA kernel)
 # ==============================================================================
 
-def _unpack_qweight_int4_packed_by_rows(qweight_packed: torch.Tensor, in_features: int) -> torch.Tensor:
+
+def _unpack_qweight_int4_packed_by_rows(
+    qweight_packed: torch.Tensor, in_features: int
+) -> torch.Tensor:
     """qweight_packed: [in_packed, out_features] int32/uint32.
 
     Packs 8 input rows per 32-bit word; nibble shift = (row_in_pack * 4).
@@ -55,13 +58,17 @@ def _unpack_qweight_int4_packed_by_rows(qweight_packed: torch.Tensor, in_feature
     assert qweight_packed.dim() == 2
     shifts = torch.arange(0, 32, 4, device=qweight_packed.device, dtype=torch.int32)
     # [in_packed, 8, out_features]
-    expanded = torch.bitwise_right_shift(qweight_packed[:, None, :], shifts[None, :, None])
+    expanded = torch.bitwise_right_shift(
+        qweight_packed[:, None, :], shifts[None, :, None]
+    )
     vals = torch.bitwise_and(expanded, 0xF).to(torch.int32)
     out_features = qweight_packed.shape[1]
     return vals.reshape(-1, out_features)[:in_features, :]
 
 
-def _unpack_zeros_int4_packed_by_cols(zeros_packed: torch.Tensor, out_features: int) -> torch.Tensor:
+def _unpack_zeros_int4_packed_by_cols(
+    zeros_packed: torch.Tensor, out_features: int
+) -> torch.Tensor:
     """zeros_packed: [num_groups, out_packed] int32/uint32.
 
     Packs 8 output cols per 32-bit word; nibble shift = (col_in_pack * 4).
@@ -70,7 +77,9 @@ def _unpack_zeros_int4_packed_by_cols(zeros_packed: torch.Tensor, out_features:
     assert zeros_packed.dim() == 2
     shifts = torch.arange(0, 32, 4, device=zeros_packed.device, dtype=torch.int32)
     # [num_groups, out_packed, 8]
-    expanded = torch.bitwise_right_shift(zeros_packed[:, :, None], shifts[None, None, :])
+    expanded = torch.bitwise_right_shift(
+        zeros_packed[:, :, None], shifts[None, None, :]
+    )
     vals = torch.bitwise_and(expanded, 0xF).to(torch.int32)
     return vals.reshape(zeros_packed.shape[0], -1)[:, :out_features]
 
@@ -94,8 +103,12 @@ def dequantize_gptq_ref(
     num_groups = scales.shape[0]
     assert scales.shape == (num_groups, out_features)
 
-    q = _unpack_qweight_int4_packed_by_rows(qweight_packed, in_features)  # [in_features, out_features]
-    z_full = _unpack_zeros_int4_packed_by_cols(zeros_packed, out_features)  # [num_groups, out_features]
+    q = _unpack_qweight_int4_packed_by_rows(
+        qweight_packed, in_features
+    )  # [in_features, out_features]
+    z_full = _unpack_zeros_int4_packed_by_cols(
+        zeros_packed, out_features
+    )  # [num_groups, out_features]
 
     gid_raw = g_idx.to(torch.int32)
     gid = ((gid_raw % num_groups) + num_groups) % num_groups
@@ -107,24 +120,38 @@ def dequantize_gptq_ref(
     return out.to(torch.float16)
 
 
-def _make_packed_inputs(in_features: int, out_features: int, group_size: int, torch_device: str):
+def _make_packed_inputs(
+    in_features: int, out_features: int, group_size: int, torch_device: str
+):
     # num_groups is implicit (same convention as GPTQ: group per input channel)
     num_groups = (in_features + group_size - 1) // group_size
     in_packed = (in_features + 7) // 8
     out_packed = (out_features + 7) // 8
 
     # Deterministic group mapping
-    g_idx = (torch.arange(in_features, device=torch_device, dtype=torch.int32) // group_size)
+    g_idx = (
+        torch.arange(in_features, device=torch_device, dtype=torch.int32) // group_size
+    )
 
     # Random nibble-level values to avoid relying on sign/shift corner cases
-    q_nib = torch.randint(0, 16, (in_features, out_features), device=torch_device, dtype=torch.int32)
-    z_nib = torch.randint(0, 16, (num_groups, out_features), device=torch_device, dtype=torch.int32)
+    q_nib = torch.randint(
+        0, 16, (in_features, out_features), device=torch_device, dtype=torch.int32
+    )
+    z_nib = torch.randint(
+        0, 16, (num_groups, out_features), device=torch_device, dtype=torch.int32
+    )
 
     # scales in fp16
-    scales = (torch.rand((num_groups, out_features), device=torch_device, dtype=torch.float16) * 0.5 + 0.01)
+    scales = (
+        torch.rand((num_groups, out_features), device=torch_device, dtype=torch.float16)
+        * 0.5
+        + 0.01
+    )
 
     # Pack qweight: [in_packed, out_features]
-    qweight_packed = torch.zeros((in_packed, out_features), device=torch_device, dtype=torch.int32)
+    qweight_packed = torch.zeros(
+        (in_packed, out_features), device=torch_device, dtype=torch.int32
+    )
     for i in range(8):
         rows = torch.arange(i, in_features, 8, device=torch_device)
         if rows.numel() == 0:
@@ -133,7 +160,9 @@ def _make_packed_inputs(in_features: int, out_features: int, group_size: int, to
         qweight_packed[pack_rows, :] |= (q_nib[rows, :] & 0xF) << (i * 4)
 
     # Pack zeros: [num_groups, out_packed]
-    zeros_packed = torch.zeros((num_groups, out_packed), device=torch_device, dtype=torch.int32)
+    zeros_packed = torch.zeros(
+        (num_groups, out_packed), device=torch_device, dtype=torch.int32
+    )
     for j in range(8):
         cols = torch.arange(j, out_features, 8, device=torch_device)
         if cols.numel() == 0:
@@ -148,6 +177,7 @@ def _make_packed_inputs(in_features: int, out_features: int, group_size: int, to
 #  Test Entrypoint
 # ==============================================================================
 
+
 def test(
     handle,
     device,
@@ -178,7 +208,9 @@ def test(
     qscales = TestTensor.from_torch(scales, InfiniDtype.F16, device)
     g_idx_t = TestTensor.from_torch(g_idx, InfiniDtype.I32, device)
 
-    out = TestTensor((in_features, out_features), None, InfiniDtype.F16, device, mode="zeros")
+    out = TestTensor(
+        (in_features, out_features), None, InfiniDtype.F16, device, mode="zeros"
+    )
 
     if sync is not None:
         sync()
@@ -233,8 +265,18 @@ def lib_dequantize_gptq():
     assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol)
 
     if PROFILE:
-        profile_operation("PyTorch", lambda: dequantize_gptq_ref(qweight_packed, scales, zeros_packed, g_idx, group_size), device, NUM_PRERUN, NUM_ITERATIONS)
-        profile_operation("    lib", lambda: lib_dequantize_gptq(), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation(
+            "PyTorch",
+            lambda: dequantize_gptq_ref(
+                qweight_packed, scales, zeros_packed, g_idx, group_size
+            ),
+            device,
+            NUM_PRERUN,
+            NUM_ITERATIONS,
+        )
+        profile_operation(
+            "    lib", lambda: lib_dequantize_gptq(), device, NUM_PRERUN, NUM_ITERATIONS
+        )
 
     check_error(LIBINFINIOP.infiniopDestroyDequantizeGPTQDescriptor(descriptor))
 
@@ -247,30 +289,7 @@ def lib_dequantize_gptq():
     NUM_PRERUN = args.num_prerun
     NUM_ITERATIONS = args.num_iterations
 
-    # This operator is intended to run on NVIDIA in our current environment.
-    # Make `srun ... python test/infiniop/dequantize_gptq.py` work without extra flags.
-    if not any(
-        getattr(args, name)
-        for name in [
-            "cpu",
-            "nvidia",
-            "iluvatar",
-            "qy",
-            "cambricon",
-            "ascend",
-            "metax",
-            "moore",
-            "kunlun",
-            "hygon",
-        ]
-    ):
-        args.nvidia = True
-
-    devices = [d for d in get_test_devices(args) if InfiniDeviceNames[d] == "NVIDIA"]
-    if not devices:
-        raise RuntimeError("No NVIDIA device selected; run with --nvidia under srun.")
-
-    for device in devices:
+    for device in get_test_devices(args):
         test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
 
     print("\033[92mTest passed!\033[0m")
diff --git a/xmake/iluvatar.lua b/xmake/iluvatar.lua
index 9c3160092..ffc3738b0 100644
--- a/xmake/iluvatar.lua
+++ b/xmake/iluvatar.lua
@@ -56,8 +56,7 @@ target("infiniop-iluvatar")
     -- skip scaled_mm, adapt it later
     -- remove_files("../src/infiniop/ops/scaled_mm/nvidia/*.cu")
 
-    -- 天数平台不支持部分 NVIDIA PTX 指令，AWQ 反量化改用 CUDA C++ 实现
-    add_files("../src/infiniop/ops/dequantize_awq/iluvatar/*.cu")
+    add_files("../src/infiniop/ops/*/iluvatar/*.cu")
 
     if has_config("ninetoothed") then
         add_files("../build/ninetoothed/*.c", "../build/ninetoothed/*.cpp", {cxxflags = {"-Wno-return-type"}})