Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 37 additions & 15 deletions backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ layout(std430) buffer;

#include "indexing.glslh"
#include "common.glslh"
#include "conv2d_common.glslh"

${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", "buffer", is_scalar_array=True)}
${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", "buffer", is_scalar_array=True)}
Expand All @@ -38,7 +37,6 @@ ${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False
// Metadata for input/output tensors (memory layout agnostic)
${layout_declare_ubo(B, "BufferMetadata", "outp")}
${layout_declare_ubo(B, "BufferMetadata", "inp")}
${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")}

layout(push_constant) uniform restrict Block {
float input_scale;
Expand All @@ -56,6 +54,30 @@ ${layout_declare_spec_const(C, "int", "activation_type", "0")}
${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")}
${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")}

// Conv2D parameters are exposed under one set of names (kernel_size_x,
// kernel_size_y, stride_x, stride_y, padding_x, padding_y, dilation_x,
// dilation_y, groups) in every variant of this shader, so the code that uses
// them does not depend on where the values come from:
//   - USE_SPEC_CONST variants declare each name as an int specialization
//     constant.
//   - Other variants declare the Conv2DParams UBO `conv2d_params` and #define
//     each name as the matching UBO field.
$if USE_SPEC_CONST:
// Conv2D parameter specialization constants
${layout_declare_spec_const(C, "int", "kernel_size_x", "1")}
${layout_declare_spec_const(C, "int", "kernel_size_y", "1")}
${layout_declare_spec_const(C, "int", "stride_x", "1")}
${layout_declare_spec_const(C, "int", "stride_y", "1")}
${layout_declare_spec_const(C, "int", "padding_x", "0")}
${layout_declare_spec_const(C, "int", "padding_y", "0")}
${layout_declare_spec_const(C, "int", "dilation_x", "1")}
${layout_declare_spec_const(C, "int", "dilation_y", "1")}
${layout_declare_spec_const(C, "int", "groups", "1")}
$else:
// conv2d_common.glslh is only included in this branch; presumably it provides
// the Conv2DParams type used by the UBO declaration below - TODO confirm.
#include "conv2d_common.glslh"
${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")}
// Alias the shared parameter names to the UBO fields.
#define kernel_size_x conv2d_params.kernel_size.x
#define kernel_size_y conv2d_params.kernel_size.y
#define stride_x conv2d_params.stride.x
#define stride_y conv2d_params.stride.y
#define padding_x conv2d_params.padding.x
#define padding_y conv2d_params.padding.y
#define dilation_x conv2d_params.dilation.x
#define dilation_y conv2d_params.dilation.y
#define groups conv2d_params.groups

// Load weight block for a given (ic4, kx, ky, oc4) position.
// Weight texture layout (from pack_q8_conv2d_weights.glsl):
// block_x = oc4 * K_w + kx
Expand Down Expand Up @@ -101,8 +123,8 @@ void main() {
const int IC = int(inp.sizes[0][2]);

// Compute channels per group
const int OC_per_group = OC / conv2d_params.groups;
const int IC_per_group = IC / conv2d_params.groups;
const int OC_per_group = OC / groups;
const int IC_per_group = IC / groups;
const int IC4_per_group = div_up_4(IC_per_group);

// Determine which group this output channel block belongs to
Expand All @@ -113,14 +135,14 @@ void main() {
const int inp_w_stride = int(inp.strides[0][0]);
const int inp_h_stride = int(inp.strides[0][1]);
const int inp_c_stride = int(inp.strides[0][2]);
const int w_texel_step = conv2d_params.dilation.x * inp_w_stride;
const int h_texel_step = conv2d_params.dilation.y * inp_h_stride;
const int subtile_w_step = conv2d_params.stride.x * inp_w_stride;
const int w_texel_step = dilation_x * inp_w_stride;
const int h_texel_step = dilation_y * inp_h_stride;
const int subtile_w_step = stride_x * inp_w_stride;

// Compute base input position (for subtile_w=0, ic4=0)
TensorIndex4D inp_tidx;
inp_tidx.data[0] = outp_tidx.data[0] * conv2d_params.stride.x - conv2d_params.padding.x;
inp_tidx.data[1] = outp_tidx.data[1] * conv2d_params.stride.y - conv2d_params.padding.y;
inp_tidx.data[0] = outp_tidx.data[0] * stride_x - padding_x;
inp_tidx.data[1] = outp_tidx.data[1] * stride_y - padding_y;
inp_tidx.data[2] = ic_group_start;
inp_tidx.data[3] = 0;

Expand All @@ -142,7 +164,7 @@ void main() {
}

// Perform convolution using packed int8 dot products
for (int ky = 0; ky < conv2d_params.kernel_size.y; ky++) {
for (int ky = 0; ky < kernel_size_y; ky++) {
const bool h_in_bounds = (inp_tidx.data[1] >= 0 && inp_tidx.data[1] < inp_H);

// Process input channels in blocks of 4
Expand All @@ -153,10 +175,10 @@ void main() {
// Reset width coordinate at start of each ic4 iteration
inp_tidx.data[0] = base_inp_w;

for (int kx = 0; kx < conv2d_params.kernel_size.x; kx++) {
for (int kx = 0; kx < kernel_size_x; kx++) {
// Load weight block: 4 output channels × 4 input channels
// weight_block[oc] contains packed weights for ic4*4 to ic4*4+3 -> oc
const ivec4 weight_block = load_weight_block(ic4, kx, ky, oc4, IC4_per_group, conv2d_params.kernel_size.x);
const ivec4 weight_block = load_weight_block(ic4, kx, ky, oc4, IC4_per_group, kernel_size_x);

// Process 4 adjacent width positions
[[unroll]] for (int subtile_w = 0; subtile_w < 4; ++subtile_w) {
Expand Down Expand Up @@ -187,16 +209,16 @@ void main() {
}

// Advance to next output position's input coordinate
inp_tidx.data[0] += conv2d_params.stride.x;
inp_tidx.data[0] += stride_x;
}

// Adjust for net dilation step
inp_tidx.data[0] += conv2d_params.dilation.x - 4 * conv2d_params.stride.x;
inp_tidx.data[0] += dilation_x - 4 * stride_x;
}
}

// Advance height by dilation for next kernel row
inp_tidx.data[1] += conv2d_params.dilation.y;
inp_tidx.data[1] += dilation_y;

if (get_outer_packed_dim_block_size(inp_layout) == 1) {
// Advance base index by height step for next kernel row
Expand Down
6 changes: 6 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,16 @@ q8ta_conv2d:
# Shader variants are the combinations of two switches:
#   USE_INT8_DOT_PRODUCT_EXT (default 1; the *_fallback variants set it to 0)
#   USE_SPEC_CONST           (default 0; the *_spec_const variants set it to 1)
parameter_names_with_default_values:
DTYPE: float
USE_INT8_DOT_PRODUCT_EXT: 1
USE_SPEC_CONST: 0
generate_variant_forall:
DTYPE:
- VALUE: float
shader_variants:
- NAME: q8ta_conv2d
- NAME: q8ta_conv2d_fallback
USE_INT8_DOT_PRODUCT_EXT: 0
- NAME: q8ta_conv2d_spec_const
USE_SPEC_CONST: 1
- NAME: q8ta_conv2d_fallback_spec_const
USE_INT8_DOT_PRODUCT_EXT: 0
USE_SPEC_CONST: 1
48 changes: 34 additions & 14 deletions backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_dw.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ layout(std430) buffer;

#include "indexing.glslh"
#include "common.glslh"
#include "conv2d_common.glslh"

${layout_declare_tensor(B, "w", "t_packed_int8_output", "int", "buffer", is_scalar_array=True)}
${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", "buffer", is_scalar_array=True)}
Expand All @@ -32,7 +31,6 @@ ${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False
// Metadata for input/output tensors (memory layout agnostic)
${layout_declare_ubo(B, "BufferMetadata", "outp")}
${layout_declare_ubo(B, "BufferMetadata", "inp")}
${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")}

layout(push_constant) uniform restrict Block {
float input_scale;
Expand All @@ -50,6 +48,28 @@ ${layout_declare_spec_const(C, "int", "activation_type", "0")}
${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")}
${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")}

// Conv2D parameters are exposed under one set of names (kernel_size_x,
// kernel_size_y, stride_x, stride_y, padding_x, padding_y, dilation_x,
// dilation_y) in every variant of this shader:
//   - USE_SPEC_CONST variants declare each name as an int specialization
//     constant.
//   - Other variants declare the Conv2DParams UBO `conv2d_params` and #define
//     each name as the matching UBO field.
// Unlike q8ta_conv2d.glsl, no `groups` parameter is declared here.
$if USE_SPEC_CONST:
// Conv2D parameter specialization constants
${layout_declare_spec_const(C, "int", "kernel_size_x", "1")}
${layout_declare_spec_const(C, "int", "kernel_size_y", "1")}
${layout_declare_spec_const(C, "int", "stride_x", "1")}
${layout_declare_spec_const(C, "int", "stride_y", "1")}
${layout_declare_spec_const(C, "int", "padding_x", "0")}
${layout_declare_spec_const(C, "int", "padding_y", "0")}
${layout_declare_spec_const(C, "int", "dilation_x", "1")}
${layout_declare_spec_const(C, "int", "dilation_y", "1")}
$else:
#include "conv2d_common.glslh"
${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")}
// Alias the shared parameter names to the UBO fields.
#define kernel_size_x conv2d_params.kernel_size.x
#define kernel_size_y conv2d_params.kernel_size.y
#define stride_x conv2d_params.stride.x
#define stride_y conv2d_params.stride.y
#define padding_x conv2d_params.padding.x
#define padding_y conv2d_params.padding.y
#define dilation_x conv2d_params.dilation.x
#define dilation_y conv2d_params.dilation.y

// NOTE(review): in the non-spec-const variants the macros above are already
// defined when this header is included, so they also apply to its contents -
// confirm block_indexing.glslh uses none of these names as identifiers.
#include "block_indexing.glslh"

// Load a 4xint8 block of weights.
Expand Down Expand Up @@ -89,22 +109,22 @@ void main() {
}

// Compute weight addressing constants
const int KW4 = int(div_up_4(conv2d_params.kernel_size.x));
const int KW4 = int(div_up_4(kernel_size_x));

// Get strides for width and height dimensions (in texel space)
const int w_stride = int(inp.strides[0][0]);
const int h_stride = int(inp.strides[0][1]);

// Pre-compute step sizes for efficient indexing
const int w_texel_step = conv2d_params.dilation.x * w_stride;
const int h_texel_step = conv2d_params.dilation.y * h_stride;
const int w_texel_step = dilation_x * w_stride;
const int h_texel_step = dilation_y * h_stride;
// Step between adjacent output width positions in input texel space
const int subtile_w_step = conv2d_params.stride.x * w_stride;
const int subtile_w_step = stride_x * w_stride;

// Compute base input position for subtile_w=0
TensorIndex4D inp_tidx;
inp_tidx.data[0] = outp_tidx.data[0] * conv2d_params.stride.x - conv2d_params.padding.x;
inp_tidx.data[1] = outp_tidx.data[1] * conv2d_params.stride.y - conv2d_params.padding.y;
inp_tidx.data[0] = outp_tidx.data[0] * stride_x - padding_x;
inp_tidx.data[1] = outp_tidx.data[1] * stride_y - padding_y;
inp_tidx.data[2] = outp_tidx.data[2];
inp_tidx.data[3] = 0; // batch = 0 since N == 1

Expand All @@ -128,13 +148,13 @@ void main() {
const int inp_H = int(inp.sizes[0][1]);

// Perform depthwise convolution
for (int ky = 0; ky < conv2d_params.kernel_size.y; ky++) {
for (int ky = 0; ky < kernel_size_y; ky++) {
const bool h_in_bounds = (inp_tidx.data[1] >= 0 && inp_tidx.data[1] < inp_H);

// Reset width coordinate at start of each kernel row
inp_tidx.data[0] = base_inp_w;

for (int kx = 0; kx < conv2d_params.kernel_size.x; kx++) {
for (int kx = 0; kx < kernel_size_x; kx++) {
// Load weight once, reuse for all 4 width positions
const int packed_weight = load_weight(kx, ky, c4, KW4, C4);
const ivec4 weight_4c = unpack_int8x4(packed_weight);
Expand All @@ -148,7 +168,7 @@ void main() {
if (get_outer_packed_dim_block_size(inp_layout) == 1) {
inp_texel_idx = base_inp_texel_idx + kx * w_texel_step + subtile_w * subtile_w_step;
} else {
// const int w_offset = kx * conv2d_params.dilation.x + subtile_w * conv2d_params.stride.x;
// const int w_offset = kx * dilation_x + subtile_w * stride_x;
// inp_texel_idx = base_inp_texel_idx + div_4(w_offset) * w_stride + mod_4(w_offset);
// inp_texel_idx = tensor4d_idx_to_texel_idx(inp, inp_tidx, inp_layout);
const int w4 = div_4(inp_tidx.data[0]);
Expand All @@ -162,15 +182,15 @@ void main() {
acc[subtile_w] += weight_4c * input_4c;

// Advance to next output position's input coordinate
inp_tidx.data[0] += conv2d_params.stride.x;
inp_tidx.data[0] += stride_x;
}

// We advanced by 4 * stride_x during the subtile loop; adjust for net dilation step
inp_tidx.data[0] += conv2d_params.dilation.x - 4 * conv2d_params.stride.x;
inp_tidx.data[0] += dilation_x - 4 * stride_x;
}

// Advance height by dilation for next kernel row
inp_tidx.data[1] += conv2d_params.dilation.y;
inp_tidx.data[1] += dilation_y;

if (get_outer_packed_dim_block_size(inp_layout) == 1) {
// Advance base index by height step for next kernel row
Expand Down
3 changes: 3 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_dw.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,11 @@
# USE_SPEC_CONST defaults to 0; the q8ta_conv2d_dw_spec_const variant sets it
# to 1.
q8ta_conv2d_dw:
parameter_names_with_default_values:
DTYPE: float
USE_SPEC_CONST: 0
generate_variant_forall:
DTYPE:
- VALUE: float
shader_variants:
- NAME: q8ta_conv2d_dw
- NAME: q8ta_conv2d_dw_spec_const
USE_SPEC_CONST: 1
28 changes: 20 additions & 8 deletions backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_pw.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,22 @@ ${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False
${layout_declare_ubo(B, "BufferMetadata", "outp")}
${layout_declare_ubo(B, "BufferMetadata", "inp")}

layout(push_constant) uniform restrict Block {
float input_scale;
int input_zp;
float output_inv_scale;
int output_zp;
int K4_per_group;
int OC4_per_group;
};
// Push-constant block. Both variants carry input_scale, input_zp,
// output_inv_scale and output_zp. Only the non-spec-const variant also carries
// K4_per_group and OC4_per_group as its last two members; USE_SPEC_CONST
// variants declare those two names as specialization constants further down
// in this file instead.
$if USE_SPEC_CONST:
layout(push_constant) uniform restrict Block {
float input_scale;
int input_zp;
float output_inv_scale;
int output_zp;
};
$else:
layout(push_constant) uniform restrict Block {
float input_scale;
int input_zp;
float output_inv_scale;
int output_zp;
int K4_per_group;
int OC4_per_group;
};

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

Expand All @@ -68,6 +76,10 @@ ${layout_declare_spec_const(C, "int", "activation_type", "0")}
${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")}
${layout_declare_spec_const(C, "int", "inp_layout", "CONTIG_LAYOUT_INT")}

// In USE_SPEC_CONST variants K4_per_group and OC4_per_group are int
// specialization constants; in the other variants the same names are members
// of the push-constant block declared earlier in this file.
$if USE_SPEC_CONST:
${layout_declare_spec_const(C, "int", "K4_per_group", "1")}
${layout_declare_spec_const(C, "int", "OC4_per_group", "1")}

int compute_outp_buffer_idx(
const int w_block_idx,
const int h_idx,
Expand Down
6 changes: 6 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/q8ta_conv2d_pw.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,16 @@ q8ta_conv2d_pw:
# Shader variants are the combinations of two switches:
#   USE_INT8_DOT_PRODUCT_EXT (default 1; the *_fallback variants set it to 0)
#   USE_SPEC_CONST           (default 0; the *_spec_const variants set it to 1)
parameter_names_with_default_values:
DTYPE: float
USE_INT8_DOT_PRODUCT_EXT: 1
USE_SPEC_CONST: 0
generate_variant_forall:
DTYPE:
- VALUE: float
shader_variants:
- NAME: q8ta_conv2d_pw
- NAME: q8ta_conv2d_pw_fallback
USE_INT8_DOT_PRODUCT_EXT: 0
- NAME: q8ta_conv2d_pw_spec_const
USE_SPEC_CONST: 1
- NAME: q8ta_conv2d_pw_fallback_spec_const
USE_INT8_DOT_PRODUCT_EXT: 0
USE_SPEC_CONST: 1
Loading
Loading