From d78db9b476350307405a772c968746764379e815 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 19 Mar 2026 12:11:28 +0100 Subject: [PATCH 1/4] inpaint: get max pixel max instead of single sample --- src/stable-diffusion.cpp | 5 ++- src/tensor.hpp | 97 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 91 insertions(+), 11 deletions(-) diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index a59ff23e8..1ffb26bf7 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -2811,7 +2811,8 @@ static std::optional prepare_image_generation_latents(sd {request->width / request->vae_scale_factor, request->height / request->vae_scale_factor, 1, - 1}); + 1}, + sd::ops::InterpolateMode::MaxPool); sd::Tensor init_latent; sd::Tensor control_latent; @@ -2955,7 +2956,7 @@ static std::optional prepare_image_generation_latents(sd latents.ref_images = std::move(ref_images); latents.ref_latents = std::move(ref_latents); - if (sd_version_is_inpaint(sd_ctx->sd->version)) { + if (!sd_version_is_inpaint(sd_ctx->sd->version)) { latents.denoise_mask = std::move(latent_mask); } diff --git a/src/tensor.hpp b/src/tensor.hpp index 33a2bdeaa..d585742cf 100644 --- a/src/tensor.hpp +++ b/src/tensor.hpp @@ -303,6 +303,10 @@ namespace sd { return data_.at(static_cast(index)); } + int64_t get_flat_index(const std::vector& coord) const { + return static_cast(offset_of(coord)); + } + private: size_t offset_of(const std::vector& coord) const { if (coord.size() != shape_.size()) { @@ -815,6 +819,9 @@ namespace sd { namespace ops { enum class InterpolateMode { Nearest, + MaxPool, + MinPool, + AvgPool, }; inline int64_t normalize_slice_bound(int64_t index, int64_t dim_size) { @@ -1012,12 +1019,16 @@ namespace sd { std::vector output_shape, InterpolateMode mode = InterpolateMode::Nearest, bool align_corners = false) { - if (mode != InterpolateMode::Nearest) { - tensor_throw_invalid_argument("Only nearest interpolate mode is implemented, got mode=" + + bool is_nearest_like_mode = (mode == InterpolateMode::Nearest || + mode == InterpolateMode::MaxPool || + mode == InterpolateMode::MinPool || + mode == InterpolateMode::AvgPool); + if (!is_nearest_like_mode) { + tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" + std::to_string(static_cast(mode))); } if (align_corners) { - tensor_throw_invalid_argument("align_corners is not supported for nearest interpolate: input_shape=" + + tensor_throw_invalid_argument("align_corners is not supported for nearest-like interpolate: input_shape=" + tensor_shape_to_string(input.shape()) + ", output_shape=" + tensor_shape_to_string(output_shape)); } @@ -1044,14 +1055,82 @@ namespace sd { } } + bool pure_upsampling = true; + for(int64_t i=0; i output_shape[i]) pure_upsampling = false; + } + Tensor output(std::move(output_shape)); - for (int64_t flat = 0; flat < output.numel(); ++flat) { - std::vector output_coord = tensor_unravel_index(flat, output.shape()); - std::vector input_coord(static_cast(input.dim()), 0); - for (size_t i = 0; i < static_cast(input.dim()); ++i) { - input_coord[i] = output_coord[i] * input.shape()[i] / output.shape()[i]; + if (!pure_upsampling && (mode != InterpolateMode::Nearest)) { + // Pooling modes only differ from nearest mode when downsampling + for (int64_t flat_out = 0; flat_out < output.numel(); ++flat_out) { + std::vector output_coord = tensor_unravel_index(flat_out, output.shape()); + + std::vector input_start(output.dim(), 0); + std::vector input_end(output.dim(), 0); + + for (size_t i = 0; i < static_cast(output.dim()); ++i) { + int64_t I_dim = input.shape()[i]; + int64_t O_dim = output.shape()[i]; + + if (I_dim > 0 && O_dim > 0) { + input_start[i] = std::max(int64_t(0), static_cast(output_coord[i] * I_dim / O_dim)); + input_end[i] = std::min(I_dim, ((output_coord[i] + 1) * I_dim + O_dim - 1) / O_dim); + } else { + input_start[i] = 0; + input_end[i] = 1; + } + } + + T val; + if (mode == InterpolateMode::MaxPool) { + val = std::numeric_limits::lowest(); + } else if(mode == InterpolateMode::MinPool) { + val = std::numeric_limits::max(); + } else if(mode == InterpolateMode::AvgPool) { + val = T(0); + } + + bool done_window = false; + std::vector current_in_coord = input_start; + + while (!done_window) { + if (mode == InterpolateMode::MaxPool) { + val = std::max(val, input.index(current_in_coord)); + } else if(mode == InterpolateMode::MinPool) { + val = std::min(val, input.index(current_in_coord)); + } else if(mode == InterpolateMode::AvgPool) { + val += input.index(current_in_coord); + } + + for (int d = static_cast(output.dim()) - 1; d >= 0; --d) { + if (++current_in_coord[d] < input_end[d]) { + break; + } + current_in_coord[d] = input_start[d]; + if (d == 0) { + done_window = true; + } + } + } + if (mode == InterpolateMode::AvgPool) { + int64_t window_size = 1; + for (size_t i = 0; i < static_cast(output.dim()); ++i) { + window_size *= (input_end[i] - input_start[i]); + } + val /= static_cast(window_size); + } + output[flat_out] = val; + } + } else { + for (int64_t flat = 0; flat < output.numel(); ++flat) { + std::vector output_coord = tensor_unravel_index(flat, output.shape()); + std::vector input_coord(static_cast(input.dim()), 0); + for (size_t i = 0; i < static_cast(input.dim()); ++i) { + input_coord[i] = output_coord[i] * input.shape()[i] / output.shape()[i]; + } + output[flat] = input.index(input_coord); } - output[flat] = input.index(input_coord); } return output; From 6d484cdbd423592fc83bccbfc8fdb806e5382bd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 19 Mar 2026 12:11:52 +0100 Subject: [PATCH 2/4] inpaint: masked diffusion for inpainting models with inflated mask --- src/stable-diffusion.cpp | 14 ++++-- src/tensor.hpp | 101 ++++++++++++++++++++++++++++++++++----- 2 files changed, 97 insertions(+), 18 deletions(-) diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 1ffb26bf7..a94564311 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -2812,7 +2812,7 @@ static std::optional prepare_image_generation_latents(sd request->height / request->vae_scale_factor, 1, 1}, - sd::ops::InterpolateMode::MaxPool); + sd::ops::InterpolateMode::NearestMax); sd::Tensor init_latent; sd::Tensor control_latent; @@ -2956,10 +2956,14 @@ static std::optional prepare_image_generation_latents(sd latents.ref_images = std::move(ref_images); latents.ref_latents = std::move(ref_latents); - if (!sd_version_is_inpaint(sd_ctx->sd->version)) { - latents.denoise_mask = std::move(latent_mask); - } - + if (sd_version_is_inpaint(sd_ctx->sd->version)) { + latent_mask = sd::ops::maxPool2D(latent_mask, + {3, 3}, + {1, 1}, + {1, 1}); + } + latents.denoise_mask = std::move(latent_mask); + return latents; } diff --git a/src/tensor.hpp b/src/tensor.hpp index d585742cf..b97b83986 100644 --- a/src/tensor.hpp +++ b/src/tensor.hpp @@ -819,9 +819,9 @@ namespace sd { namespace ops { enum class InterpolateMode { Nearest, - MaxPool, - MinPool, - AvgPool, + NearestMax, + NearestMin, + NearestAvg, }; inline int64_t normalize_slice_bound(int64_t index, int64_t dim_size) { @@ -1020,9 +1020,9 @@ namespace sd { InterpolateMode mode = InterpolateMode::Nearest, bool align_corners = false) { bool is_nearest_like_mode = (mode == InterpolateMode::Nearest || - mode == InterpolateMode::MaxPool || - mode == InterpolateMode::MinPool || - mode == InterpolateMode::AvgPool); + mode == InterpolateMode::NearestMax || + mode == InterpolateMode::NearestMin || + mode == InterpolateMode::NearestAvg); if (!is_nearest_like_mode) { tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" + std::to_string(static_cast(mode))); @@ -1083,11 +1083,11 @@ namespace sd { } T val; - if (mode == InterpolateMode::MaxPool) { + if (mode == InterpolateMode::NearestMax) { val = std::numeric_limits::lowest(); - } else if(mode == InterpolateMode::MinPool) { + } else if(mode == InterpolateMode::NearestMin) { val = std::numeric_limits::max(); - } else if(mode == InterpolateMode::AvgPool) { + } else if(mode == InterpolateMode::NearestAvg) { val = T(0); } @@ -1095,11 +1095,11 @@ namespace sd { std::vector current_in_coord = input_start; while (!done_window) { - if (mode == InterpolateMode::MaxPool) { + if (mode == InterpolateMode::NearestMax) { val = std::max(val, input.index(current_in_coord)); - } else if(mode == InterpolateMode::MinPool) { + } else if(mode == InterpolateMode::NearestMin) { val = std::min(val, input.index(current_in_coord)); - } else if(mode == InterpolateMode::AvgPool) { + } else if(mode == InterpolateMode::NearestAvg) { val += input.index(current_in_coord); } @@ -1113,7 +1113,7 @@ namespace sd { } } } - if (mode == InterpolateMode::AvgPool) { + if (mode == InterpolateMode::NearestAvg) { int64_t window_size = 1; for (size_t i = 0; i < static_cast(output.dim()); ++i) { window_size *= (input_end[i] - input_start[i]); @@ -1207,6 +1207,81 @@ namespace sd { align_corners); } + template + inline Tensor maxPool2D(const Tensor& input, + std::vector kernel_size, + std::vector stride, + std::vector padding) { + if (input.dim() != 4) { + tensor_throw_invalid_argument("Tensor maxPool2D requires 4D input: input_dim=" + + std::to_string(input.dim()) + ", input_shape=" + + tensor_shape_to_string(input.shape())); + } + if (kernel_size.size() != 2 || stride.size() != 2 || padding.size() != 2) { + tensor_throw_invalid_argument("Tensor maxPool2D requires kernel_size, stride, and padding to have length 2"); + } + for (size_t i = 0; i < 2; ++i) { + if (kernel_size[i] <= 0) { + tensor_throw_invalid_argument("Tensor maxPool2D kernel_size must be positive: kernel_size=" + + tensor_shape_to_string(kernel_size)); + } + if (stride[i] <= 0) { + tensor_throw_invalid_argument("Tensor maxPool2D stride must be positive: stride=" + + tensor_shape_to_string(stride)); + } + if (padding[i] < 0) { + tensor_throw_invalid_argument("Tensor maxPool2D padding must be non-negative: padding=" + + tensor_shape_to_string(padding)); + } + } + + const int64_t in_height = input.shape()[0]; + const int64_t in_width = input.shape()[1]; + const int64_t in_channels = input.shape()[2]; + const int64_t batch_size = input.shape()[3]; + + const int64_t out_height = (in_height + 2 * padding[0] - kernel_size[0]) / stride[0] + 1; + const int64_t out_width = (in_width + 2 * padding[1] - kernel_size[1]) / stride[1] + 1; + + if (out_height <= 0 || out_width <= 0) { + tensor_throw_invalid_argument("maxPool2D results in invalid output dimensions: " + + std::to_string(out_height) + "x" + std::to_string(out_width)); + } + + Tensor output({out_height, out_width, in_channels, batch_size}); + + for (int64_t oh = 0; oh < out_height; ++oh) { + for (int64_t ow = 0; ow < out_width; ++ow) { + for (int64_t c = 0; c < in_channels; ++c) { + for (int64_t b = 0; b < batch_size; ++b) { + T max_val = std::numeric_limits::lowest(); + bool has_valid_input = false; + + for (int64_t kh = 0; kh < kernel_size[0]; ++kh) { + for (int64_t kw = 0; kw < kernel_size[1]; ++kw) { + int64_t ih = oh * stride[0] + kh - padding[0]; + int64_t iw = ow * stride[1] + kw - padding[1]; + + if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) { + T val = input.index(ih, iw, c, b); + max_val = std::max(max_val, val); + has_valid_input = true; + } + } + } + + if (has_valid_input) { + output.index(oh, ow, c, b) = max_val; + } else { + output.index(oh, ow, c, b) = T(0); + } + } + } + } + } + return output; + } + template inline Tensor concat(const Tensor& lhs, const Tensor& rhs, size_t dim) { if (lhs.dim() != rhs.dim()) { From 53f8b3d2dbe895984814facecfeae8775af3123c Mon Sep 17 00:00:00 2001 From: leejet Date: Mon, 6 Apr 2026 00:39:50 +0800 Subject: [PATCH 3/4] refactor tensor interpolate nearest-like reduction paths and generalize max_pool_2d --- src/stable-diffusion.cpp | 12 +- src/tensor.hpp | 243 +++++++++++++++++++++------------------ 2 files changed, 139 insertions(+), 116 deletions(-) diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 7d03974a8..683a07d53 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -2992,13 +2992,13 @@ static std::optional prepare_image_generation_latents(sd latents.ref_latents = std::move(ref_latents); if (sd_version_is_inpaint(sd_ctx->sd->version)) { - latent_mask = sd::ops::maxPool2D(latent_mask, - {3, 3}, - {1, 1}, - {1, 1}); - } + latent_mask = sd::ops::max_pool_2d(latent_mask, + {3, 3}, + {1, 1}, + {1, 1}); + } latents.denoise_mask = std::move(latent_mask); - + return latents; } diff --git a/src/tensor.hpp b/src/tensor.hpp index b97b83986..8aba69414 100644 --- a/src/tensor.hpp +++ b/src/tensor.hpp @@ -1019,10 +1019,10 @@ namespace sd { std::vector output_shape, InterpolateMode mode = InterpolateMode::Nearest, bool align_corners = false) { - bool is_nearest_like_mode = (mode == InterpolateMode::Nearest || - mode == InterpolateMode::NearestMax || - mode == InterpolateMode::NearestMin || - mode == InterpolateMode::NearestAvg); + const bool is_nearest_like_mode = (mode == InterpolateMode::Nearest || + mode == InterpolateMode::NearestMax || + mode == InterpolateMode::NearestMin || + mode == InterpolateMode::NearestAvg); if (!is_nearest_like_mode) { tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" + std::to_string(static_cast(mode))); @@ -1055,82 +1055,102 @@ namespace sd { } } - bool pure_upsampling = true; - for(int64_t i=0; i output_shape[i]) pure_upsampling = false; + bool has_downsampling = false; + for (int64_t i = 0; i < input.dim(); ++i) { + if (input.shape()[i] > output_shape[i]) { + has_downsampling = true; + break; + } } Tensor output(std::move(output_shape)); - if (!pure_upsampling && (mode != InterpolateMode::Nearest)) { - // Pooling modes only differ from nearest mode when downsampling - for (int64_t flat_out = 0; flat_out < output.numel(); ++flat_out) { - std::vector output_coord = tensor_unravel_index(flat_out, output.shape()); + if (mode == InterpolateMode::Nearest || !has_downsampling) { + for (int64_t flat = 0; flat < output.numel(); ++flat) { + std::vector output_coord = tensor_unravel_index(flat, output.shape()); + std::vector input_coord(static_cast(input.dim()), 0); + for (size_t i = 0; i < static_cast(input.dim()); ++i) { + input_coord[i] = output_coord[i] * input.shape()[i] / output.shape()[i]; + } + output[flat] = input.index(input_coord); + } - std::vector input_start(output.dim(), 0); - std::vector input_end(output.dim(), 0); + return output; + } - for (size_t i = 0; i < static_cast(output.dim()); ++i) { - int64_t I_dim = input.shape()[i]; - int64_t O_dim = output.shape()[i]; - - if (I_dim > 0 && O_dim > 0) { - input_start[i] = std::max(int64_t(0), static_cast(output_coord[i] * I_dim / O_dim)); - input_end[i] = std::min(I_dim, ((output_coord[i] + 1) * I_dim + O_dim - 1) / O_dim); - } else { - input_start[i] = 0; - input_end[i] = 1; - } - } + auto init_reduction = [&]() -> T { + switch (mode) { + case InterpolateMode::NearestMax: + return std::numeric_limits::lowest(); + case InterpolateMode::NearestMin: + return std::numeric_limits::max(); + case InterpolateMode::NearestAvg: + return T(0); + case InterpolateMode::Nearest: + return T(0); + } - T val; - if (mode == InterpolateMode::NearestMax) { - val = std::numeric_limits::lowest(); - } else if(mode == InterpolateMode::NearestMin) { - val = std::numeric_limits::max(); - } else if(mode == InterpolateMode::NearestAvg) { - val = T(0); - } + tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" + + std::to_string(static_cast(mode))); + }; + + auto reduce_value = [&](T& acc, const T& sample) { + switch (mode) { + case InterpolateMode::NearestMax: + acc = std::max(acc, sample); + break; + case InterpolateMode::NearestMin: + acc = std::min(acc, sample); + break; + case InterpolateMode::NearestAvg: + acc += sample; + break; + case InterpolateMode::Nearest: + break; + } + }; - bool done_window = false; - std::vector current_in_coord = input_start; + // Reduction modes only differ from nearest mode when downsampling. + for (int64_t flat_out = 0; flat_out < output.numel(); ++flat_out) { + std::vector output_coord = tensor_unravel_index(flat_out, output.shape()); - while (!done_window) { - if (mode == InterpolateMode::NearestMax) { - val = std::max(val, input.index(current_in_coord)); - } else if(mode == InterpolateMode::NearestMin) { - val = std::min(val, input.index(current_in_coord)); - } else if(mode == InterpolateMode::NearestAvg) { - val += input.index(current_in_coord); - } + std::vector input_start(output.dim(), 0); + std::vector input_end(output.dim(), 0); + + for (size_t i = 0; i < static_cast(output.dim()); ++i) { + const int64_t input_dim = input.shape()[i]; + const int64_t output_dim = output.shape()[i]; + + input_start[i] = std::max(int64_t(0), static_cast(output_coord[i] * input_dim / output_dim)); + input_end[i] = std::min(input_dim, ((output_coord[i] + 1) * input_dim + output_dim - 1) / output_dim); + } + + T value = init_reduction(); + bool done_window = false; + std::vector current_in_coord = input_start; + + while (!done_window) { + reduce_value(value, input.index(current_in_coord)); - for (int d = static_cast(output.dim()) - 1; d >= 0; --d) { - if (++current_in_coord[d] < input_end[d]) { - break; - } - current_in_coord[d] = input_start[d]; - if (d == 0) { - done_window = true; - } + for (int d = static_cast(output.dim()) - 1; d >= 0; --d) { + if (++current_in_coord[d] < input_end[d]) { + break; } - } - if (mode == InterpolateMode::NearestAvg) { - int64_t window_size = 1; - for (size_t i = 0; i < static_cast(output.dim()); ++i) { - window_size *= (input_end[i] - input_start[i]); + current_in_coord[d] = input_start[d]; + if (d == 0) { + done_window = true; } - val /= static_cast(window_size); } - output[flat_out] = val; } - } else { - for (int64_t flat = 0; flat < output.numel(); ++flat) { - std::vector output_coord = tensor_unravel_index(flat, output.shape()); - std::vector input_coord(static_cast(input.dim()), 0); - for (size_t i = 0; i < static_cast(input.dim()); ++i) { - input_coord[i] = output_coord[i] * input.shape()[i] / output.shape()[i]; + + if (mode == InterpolateMode::NearestAvg) { + int64_t window_size = 1; + for (size_t i = 0; i < static_cast(output.dim()); ++i) { + window_size *= (input_end[i] - input_start[i]); } - output[flat] = input.index(input_coord); + value /= static_cast(window_size); } + + output[flat_out] = value; } return output; @@ -1142,12 +1162,16 @@ namespace sd { const std::optional>& scale_factor, InterpolateMode mode = InterpolateMode::Nearest, bool align_corners = false) { - if (mode != InterpolateMode::Nearest) { - tensor_throw_invalid_argument("Only nearest interpolate mode is implemented, got mode=" + + const bool is_nearest_like_mode = (mode == InterpolateMode::Nearest || + mode == InterpolateMode::NearestMax || + mode == InterpolateMode::NearestMin || + mode == InterpolateMode::NearestAvg); + if (!is_nearest_like_mode) { + tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" + std::to_string(static_cast(mode))); } if (align_corners) { - tensor_throw_invalid_argument("align_corners is not supported for nearest interpolate: input_shape=" + + tensor_throw_invalid_argument("align_corners is not supported for nearest-like interpolate: input_shape=" + tensor_shape_to_string(input.shape())); } if (size.has_value() == scale_factor.has_value()) { @@ -1208,76 +1232,75 @@ namespace sd { } template - inline Tensor maxPool2D(const Tensor& input, - std::vector kernel_size, - std::vector stride, - std::vector padding) { - if (input.dim() != 4) { - tensor_throw_invalid_argument("Tensor maxPool2D requires 4D input: input_dim=" + + inline Tensor max_pool_2d(const Tensor& input, + std::vector kernel_size, + std::vector stride, + std::vector padding) { + if (input.dim() < 2) { + tensor_throw_invalid_argument("Tensor max_pool_2d requires input_dim >= 2: input_dim=" + std::to_string(input.dim()) + ", input_shape=" + tensor_shape_to_string(input.shape())); } if (kernel_size.size() != 2 || stride.size() != 2 || padding.size() != 2) { - tensor_throw_invalid_argument("Tensor maxPool2D requires kernel_size, stride, and padding to have length 2"); + tensor_throw_invalid_argument("Tensor max_pool_2d requires kernel_size, stride, and padding to have length 2"); } for (size_t i = 0; i < 2; ++i) { if (kernel_size[i] <= 0) { - tensor_throw_invalid_argument("Tensor maxPool2D kernel_size must be positive: kernel_size=" + + tensor_throw_invalid_argument("Tensor max_pool_2d kernel_size must be positive: kernel_size=" + tensor_shape_to_string(kernel_size)); } if (stride[i] <= 0) { - tensor_throw_invalid_argument("Tensor maxPool2D stride must be positive: stride=" + + tensor_throw_invalid_argument("Tensor max_pool_2d stride must be positive: stride=" + tensor_shape_to_string(stride)); } if (padding[i] < 0) { - tensor_throw_invalid_argument("Tensor maxPool2D padding must be non-negative: padding=" + + tensor_throw_invalid_argument("Tensor max_pool_2d padding must be non-negative: padding=" + tensor_shape_to_string(padding)); } } - const int64_t in_height = input.shape()[0]; - const int64_t in_width = input.shape()[1]; - const int64_t in_channels = input.shape()[2]; - const int64_t batch_size = input.shape()[3]; + const int64_t in_height = input.shape()[0]; + const int64_t in_width = input.shape()[1]; const int64_t out_height = (in_height + 2 * padding[0] - kernel_size[0]) / stride[0] + 1; const int64_t out_width = (in_width + 2 * padding[1] - kernel_size[1]) / stride[1] + 1; if (out_height <= 0 || out_width <= 0) { - tensor_throw_invalid_argument("maxPool2D results in invalid output dimensions: " + + tensor_throw_invalid_argument("max_pool_2d results in invalid output dimensions: " + std::to_string(out_height) + "x" + std::to_string(out_width)); } - Tensor output({out_height, out_width, in_channels, batch_size}); - - for (int64_t oh = 0; oh < out_height; ++oh) { - for (int64_t ow = 0; ow < out_width; ++ow) { - for (int64_t c = 0; c < in_channels; ++c) { - for (int64_t b = 0; b < batch_size; ++b) { - T max_val = std::numeric_limits::lowest(); - bool has_valid_input = false; - - for (int64_t kh = 0; kh < kernel_size[0]; ++kh) { - for (int64_t kw = 0; kw < kernel_size[1]; ++kw) { - int64_t ih = oh * stride[0] + kh - padding[0]; - int64_t iw = ow * stride[1] + kw - padding[1]; - - if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) { - T val = input.index(ih, iw, c, b); - max_val = std::max(max_val, val); - has_valid_input = true; - } - } - } - - if (has_valid_input) { - output.index(oh, ow, c, b) = max_val; - } else { - output.index(oh, ow, c, b) = T(0); - } + std::vector output_shape = input.shape(); + output_shape[0] = out_height; + output_shape[1] = out_width; + + Tensor output(std::move(output_shape)); + + for (int64_t flat_out = 0; flat_out < output.numel(); ++flat_out) { + std::vector output_coord = tensor_unravel_index(flat_out, output.shape()); + std::vector input_coord = output_coord; + + const int64_t oh = output_coord[0]; + const int64_t ow = output_coord[1]; + + T max_val = std::numeric_limits::lowest(); + bool has_valid_input = false; + + for (int64_t kh = 0; kh < kernel_size[0]; ++kh) { + for (int64_t kw = 0; kw < kernel_size[1]; ++kw) { + const int64_t ih = oh * stride[0] + kh - padding[0]; + const int64_t iw = ow * stride[1] + kw - padding[1]; + + if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) { + input_coord[0] = ih; + input_coord[1] = iw; + max_val = std::max(max_val, input.index(input_coord)); + has_valid_input = true; } } } + + output[flat_out] = has_valid_input ? max_val : T(0); } return output; } From 090b914508982a28976c60fe3bbbf4bf9faee39f Mon Sep 17 00:00:00 2001 From: leejet Date: Mon, 6 Apr 2026 00:42:56 +0800 Subject: [PATCH 4/4] remove unused get_flat_index --- src/tensor.hpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/tensor.hpp b/src/tensor.hpp index 8aba69414..33302b056 100644 --- a/src/tensor.hpp +++ b/src/tensor.hpp @@ -303,10 +303,6 @@ namespace sd { return data_.at(static_cast(index)); } - int64_t get_flat_index(const std::vector& coord) const { - return static_cast(offset_of(coord)); - } - private: size_t offset_of(const std::vector& coord) const { if (coord.size() != shape_.size()) {