From d78db9b476350307405a772c968746764379e815 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 19 Mar 2026 12:11:28 +0100
Subject: [PATCH 1/4] inpaint: get max pixel value instead of single sample

---
 src/stable-diffusion.cpp |  5 ++-
 src/tensor.hpp           | 97 ++++++++++++++++++++++++++++++++++++----
 2 files changed, 91 insertions(+), 11 deletions(-)
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index a59ff23e8..1ffb26bf7 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -2811,7 +2811,8 @@ static std::optional<ImageGenerationLatents> prepare_image_generation_latents(sd
                                                          {request->width / request->vae_scale_factor,
                                                           request->height / request->vae_scale_factor,
                                                           1,
-                                                          1});
+                                                          1},
+                                                         sd::ops::InterpolateMode::MaxPool);
 
     sd::Tensor<float> init_latent;
     sd::Tensor<float> control_latent;
@@ -2955,7 +2956,7 @@ static std::optional<ImageGenerationLatents> prepare_image_generation_latents(sd
     latents.ref_images           = std::move(ref_images);
     latents.ref_latents          = std::move(ref_latents);
 
-    if (sd_version_is_inpaint(sd_ctx->sd->version)) {
+    if (!sd_version_is_inpaint(sd_ctx->sd->version)) {
         latents.denoise_mask = std::move(latent_mask);
     }
 
diff --git a/src/tensor.hpp b/src/tensor.hpp
index 33a2bdeaa..d585742cf 100644
--- a/src/tensor.hpp
+++ b/src/tensor.hpp
@@ -303,6 +303,10 @@ namespace sd {
             return data_.at(static_cast<size_t>(index));
         }
 
+        int64_t get_flat_index(const std::vector<int64_t>& coord) const {
+            return static_cast<int64_t>(offset_of(coord));
+        }
+
     private:
         size_t offset_of(const std::vector<int64_t>& coord) const {
             if (coord.size() != shape_.size()) {
@@ -815,6 +819,9 @@ namespace sd {
     namespace ops {
         enum class InterpolateMode {
             Nearest,
+            MaxPool,
+            MinPool,
+            AvgPool,
         };
 
         inline int64_t normalize_slice_bound(int64_t index, int64_t dim_size) {
@@ -1012,12 +1019,16 @@ namespace sd {
                                      std::vector<int64_t> output_shape,
                                      InterpolateMode mode = InterpolateMode::Nearest,
                                      bool align_corners   = false) {
-            if (mode != InterpolateMode::Nearest) {
-                tensor_throw_invalid_argument("Only nearest interpolate mode is implemented, got mode=" +
+            bool is_nearest_like_mode = (mode == InterpolateMode::Nearest ||
+                                         mode == InterpolateMode::MaxPool ||
+                                         mode == InterpolateMode::MinPool ||
+                                         mode == InterpolateMode::AvgPool);
+            if (!is_nearest_like_mode) {
+                tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" +
                                               std::to_string(static_cast<int>(mode)));
             }
             if (align_corners) {
-                tensor_throw_invalid_argument("align_corners is not supported for nearest interpolate: input_shape=" +
+                tensor_throw_invalid_argument("align_corners is not supported for nearest-like interpolate: input_shape=" +
                                               tensor_shape_to_string(input.shape()) + ", output_shape=" +
                                               tensor_shape_to_string(output_shape));
             }
@@ -1044,14 +1055,82 @@ namespace sd {
                 }
             }
 
+            bool pure_upsampling = true;
+            for(int64_t i=0; i<input.dim(); ++i) {
+                if (input.shape()[i] > output_shape[i]) pure_upsampling = false;
+            }
+
             Tensor<T> output(std::move(output_shape));
-            for (int64_t flat = 0; flat < output.numel(); ++flat) {
-                std::vector<int64_t> output_coord = tensor_unravel_index(flat, output.shape());
-                std::vector<int64_t> input_coord(static_cast<size_t>(input.dim()), 0);
-                for (size_t i = 0; i < static_cast<size_t>(input.dim()); ++i) {
-                    input_coord[i] = output_coord[i] * input.shape()[i] / output.shape()[i];
+            if (!pure_upsampling && (mode != InterpolateMode::Nearest)) {
+                // Pooling modes only differ from nearest mode when downsampling
+                for (int64_t flat_out = 0; flat_out < output.numel(); ++flat_out) {
+                    std::vector<int64_t> output_coord = tensor_unravel_index(flat_out, output.shape());
+
+                    std::vector<int64_t> input_start(output.dim(), 0);
+                    std::vector<int64_t> input_end(output.dim(), 0);
+
+                    for (size_t i = 0; i < static_cast<size_t>(output.dim()); ++i) {
+                        int64_t I_dim = input.shape()[i];
+                        int64_t O_dim = output.shape()[i];
+
+                        if (I_dim > 0 && O_dim > 0) {
+                            input_start[i] = std::max(int64_t(0), static_cast<int64_t>(output_coord[i] * I_dim / O_dim));
+                            input_end[i]   = std::min(I_dim, ((output_coord[i] + 1) * I_dim + O_dim - 1) / O_dim);
+                        } else {
+                            input_start[i] = 0;
+                            input_end[i]   = 1;
+                        }
+                    }
+
+                    T val;
+                    if (mode == InterpolateMode::MaxPool) {
+                        val = std::numeric_limits<T>::lowest();
+                    } else if(mode == InterpolateMode::MinPool) {
+                        val = std::numeric_limits<T>::max();
+                    } else if(mode == InterpolateMode::AvgPool) {
+                        val = T(0);
+                    }
+
+                    bool done_window                      = false;
+                    std::vector<int64_t> current_in_coord = input_start;
+
+                    while (!done_window) {
+                        if (mode == InterpolateMode::MaxPool) {
+                            val = std::max(val, input.index(current_in_coord));
+                        } else if(mode == InterpolateMode::MinPool) {
+                            val = std::min(val, input.index(current_in_coord));
+                        } else if(mode == InterpolateMode::AvgPool) {
+                            val += input.index(current_in_coord);
+                        }
+
+                        for (int d = static_cast<int>(output.dim()) - 1; d >= 0; --d) {
+                            if (++current_in_coord[d] < input_end[d]) {
+                                break;
+                            }
+                            current_in_coord[d] = input_start[d];
+                            if (d == 0) {
+                                done_window = true;
+                            }
+                        }
+                    }
+                    if (mode == InterpolateMode::AvgPool) {
+                        int64_t window_size = 1;
+                        for (size_t i = 0; i < static_cast<size_t>(output.dim()); ++i) {
+                            window_size *= (input_end[i] - input_start[i]);
+                        }
+                        val /= static_cast<T>(window_size);
+                    }
+                    output[flat_out] = val;
+                }
+            } else {
+                for (int64_t flat = 0; flat < output.numel(); ++flat) {
+                    std::vector<int64_t> output_coord = tensor_unravel_index(flat, output.shape());
+                    std::vector<int64_t> input_coord(static_cast<size_t>(input.dim()), 0);
+                    for (size_t i = 0; i < static_cast<size_t>(input.dim()); ++i) {
+                        input_coord[i] = output_coord[i] * input.shape()[i] / output.shape()[i];
+                    }
+                    output[flat] = input.index(input_coord);
                 }
-                output[flat] = input.index(input_coord);
             }
 
             return output;

From 6d484cdbd423592fc83bccbfc8fdb806e5382bd2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 19 Mar 2026 12:11:52 +0100
Subject: [PATCH 2/4] inpaint: masked diffusion for inpainting models with
 inflated mask

---
 src/stable-diffusion.cpp |  14 ++++--
 src/tensor.hpp           | 101 ++++++++++++++++++++++++++++++++++-----
 2 files changed, 97 insertions(+), 18 deletions(-)

diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index 1ffb26bf7..a94564311 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -2812,7 +2812,7 @@ static std::optional<ImageGenerationLatents> prepare_image_generation_latents(sd
                                                           request->height / request->vae_scale_factor,
                                                           1,
                                                           1},
-                                                         sd::ops::InterpolateMode::MaxPool);
+                                                         sd::ops::InterpolateMode::NearestMax);
 
     sd::Tensor<float> init_latent;
     sd::Tensor<float> control_latent;
@@ -2956,10 +2956,14 @@ static std::optional<ImageGenerationLatents> prepare_image_generation_latents(sd
     latents.ref_images           = std::move(ref_images);
     latents.ref_latents          = std::move(ref_latents);
 
-    if (!sd_version_is_inpaint(sd_ctx->sd->version)) {
-        latents.denoise_mask = std::move(latent_mask);
-    }
-
+    if (sd_version_is_inpaint(sd_ctx->sd->version)) {
+        latent_mask = sd::ops::maxPool2D(latent_mask,
+                                         {3, 3},
+                                         {1, 1},
+                                         {1, 1});
+    } 
+    latents.denoise_mask = std::move(latent_mask);
+    
     return latents;
 }
 
diff --git a/src/tensor.hpp b/src/tensor.hpp
index d585742cf..b97b83986 100644
--- a/src/tensor.hpp
+++ b/src/tensor.hpp
@@ -819,9 +819,9 @@ namespace sd {
     namespace ops {
         enum class InterpolateMode {
             Nearest,
-            MaxPool,
-            MinPool,
-            AvgPool,
+            NearestMax,
+            NearestMin,
+            NearestAvg,
         };
 
         inline int64_t normalize_slice_bound(int64_t index, int64_t dim_size) {
@@ -1020,9 +1020,9 @@ namespace sd {
                                      InterpolateMode mode = InterpolateMode::Nearest,
                                      bool align_corners   = false) {
             bool is_nearest_like_mode = (mode == InterpolateMode::Nearest ||
-                                         mode == InterpolateMode::MaxPool ||
-                                         mode == InterpolateMode::MinPool ||
-                                         mode == InterpolateMode::AvgPool);
+                                         mode == InterpolateMode::NearestMax ||
+                                         mode == InterpolateMode::NearestMin ||
+                                         mode == InterpolateMode::NearestAvg);
             if (!is_nearest_like_mode) {
                 tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" +
                                               std::to_string(static_cast<int>(mode)));
@@ -1083,11 +1083,11 @@ namespace sd {
                     }
 
                     T val;
-                    if (mode == InterpolateMode::MaxPool) {
+                    if (mode == InterpolateMode::NearestMax) {
                         val = std::numeric_limits<T>::lowest();
-                    } else if(mode == InterpolateMode::MinPool) {
+                    } else if(mode == InterpolateMode::NearestMin) {
                         val = std::numeric_limits<T>::max();
-                    } else if(mode == InterpolateMode::AvgPool) {
+                    } else if(mode == InterpolateMode::NearestAvg) {
                         val = T(0);
                     }
 
@@ -1095,11 +1095,11 @@ namespace sd {
                     std::vector<int64_t> current_in_coord = input_start;
 
                     while (!done_window) {
-                        if (mode == InterpolateMode::MaxPool) {
+                        if (mode == InterpolateMode::NearestMax) {
                             val = std::max(val, input.index(current_in_coord));
-                        } else if(mode == InterpolateMode::MinPool) {
+                        } else if(mode == InterpolateMode::NearestMin) {
                             val = std::min(val, input.index(current_in_coord));
-                        } else if(mode == InterpolateMode::AvgPool) {
+                        } else if(mode == InterpolateMode::NearestAvg) {
                             val += input.index(current_in_coord);
                         }
 
@@ -1113,7 +1113,7 @@ namespace sd {
                             }
                         }
                     }
-                    if (mode == InterpolateMode::AvgPool) {
+                    if (mode == InterpolateMode::NearestAvg) {
                         int64_t window_size = 1;
                         for (size_t i = 0; i < static_cast<size_t>(output.dim()); ++i) {
                             window_size *= (input_end[i] - input_start[i]);
@@ -1207,6 +1207,81 @@ namespace sd {
                                align_corners);
         }
 
+        template <typename T>
+        inline Tensor<T> maxPool2D(const Tensor<T>& input,
+                                   std::vector<int64_t> kernel_size,
+                                   std::vector<int64_t> stride,
+                                   std::vector<int64_t> padding) {
+            if (input.dim() != 4) {
+                tensor_throw_invalid_argument("Tensor maxPool2D requires 4D input: input_dim=" +
+                                              std::to_string(input.dim()) + ", input_shape=" +
+                                              tensor_shape_to_string(input.shape()));
+            }
+            if (kernel_size.size() != 2 || stride.size() != 2 || padding.size() != 2) {
+                tensor_throw_invalid_argument("Tensor maxPool2D requires kernel_size, stride, and padding to have length 2");
+            }
+            for (size_t i = 0; i < 2; ++i) {
+                if (kernel_size[i] <= 0) {
+                    tensor_throw_invalid_argument("Tensor maxPool2D kernel_size must be positive: kernel_size=" +
+                                                  tensor_shape_to_string(kernel_size));
+                }
+                if (stride[i] <= 0) {
+                    tensor_throw_invalid_argument("Tensor maxPool2D stride must be positive: stride=" +
+                                                  tensor_shape_to_string(stride));
+                }
+                if (padding[i] < 0) {
+                    tensor_throw_invalid_argument("Tensor maxPool2D padding must be non-negative: padding=" +
+                                                  tensor_shape_to_string(padding));
+                }
+            }
+
+            const int64_t in_height   = input.shape()[0];
+            const int64_t in_width    = input.shape()[1];
+            const int64_t in_channels = input.shape()[2];
+            const int64_t batch_size  = input.shape()[3];
+
+            const int64_t out_height = (in_height + 2 * padding[0] - kernel_size[0]) / stride[0] + 1;
+            const int64_t out_width  = (in_width + 2 * padding[1] - kernel_size[1]) / stride[1] + 1;
+
+            if (out_height <= 0 || out_width <= 0) {
+                tensor_throw_invalid_argument("maxPool2D results in invalid output dimensions: " +
+                                              std::to_string(out_height) + "x" + std::to_string(out_width));
+            }
+
+            Tensor<T> output({out_height, out_width, in_channels, batch_size});
+
+            for (int64_t oh = 0; oh < out_height; ++oh) {
+                for (int64_t ow = 0; ow < out_width; ++ow) {
+                    for (int64_t c = 0; c < in_channels; ++c) {
+                        for (int64_t b = 0; b < batch_size; ++b) {
+                            T max_val            = std::numeric_limits<T>::lowest();
+                            bool has_valid_input = false;
+
+                            for (int64_t kh = 0; kh < kernel_size[0]; ++kh) {
+                                for (int64_t kw = 0; kw < kernel_size[1]; ++kw) {
+                                    int64_t ih = oh * stride[0] + kh - padding[0];
+                                    int64_t iw = ow * stride[1] + kw - padding[1];
+
+                                    if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
+                                        T val           = input.index(ih, iw, c, b);
+                                        max_val         = std::max(max_val, val);
+                                        has_valid_input = true;
+                                    }
+                                }
+                            }
+
+                            if (has_valid_input) {
+                                output.index(oh, ow, c, b) = max_val;
+                            } else {
+                                output.index(oh, ow, c, b) = T(0);
+                            }
+                        }
+                    }
+                }
+            }
+            return output;
+        }
+
         template <typename T>
         inline Tensor<T> concat(const Tensor<T>& lhs, const Tensor<T>& rhs, size_t dim) {
             if (lhs.dim() != rhs.dim()) {

From 53f8b3d2dbe895984814facecfeae8775af3123c Mon Sep 17 00:00:00 2001
From: leejet <leejet714@gmail.com>
Date: Mon, 6 Apr 2026 00:39:50 +0800
Subject: [PATCH 3/4] refactor tensor interpolate nearest-like reduction paths
 and generalize max_pool_2d

---
 src/stable-diffusion.cpp |  12 +-
 src/tensor.hpp           | 243 +++++++++++++++++++++------------------
 2 files changed, 139 insertions(+), 116 deletions(-)

diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index 7d03974a8..683a07d53 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -2992,13 +2992,13 @@ static std::optional<ImageGenerationLatents> prepare_image_generation_latents(sd
     latents.ref_latents          = std::move(ref_latents);
 
     if (sd_version_is_inpaint(sd_ctx->sd->version)) {
-        latent_mask = sd::ops::maxPool2D(latent_mask,
-                                         {3, 3},
-                                         {1, 1},
-                                         {1, 1});
-    } 
+        latent_mask = sd::ops::max_pool_2d(latent_mask,
+                                           {3, 3},
+                                           {1, 1},
+                                           {1, 1});
+    }
     latents.denoise_mask = std::move(latent_mask);
-    
+
     return latents;
 }
 
diff --git a/src/tensor.hpp b/src/tensor.hpp
index b97b83986..8aba69414 100644
--- a/src/tensor.hpp
+++ b/src/tensor.hpp
@@ -1019,10 +1019,10 @@ namespace sd {
                                      std::vector<int64_t> output_shape,
                                      InterpolateMode mode = InterpolateMode::Nearest,
                                      bool align_corners   = false) {
-            bool is_nearest_like_mode = (mode == InterpolateMode::Nearest ||
-                                         mode == InterpolateMode::NearestMax ||
-                                         mode == InterpolateMode::NearestMin ||
-                                         mode == InterpolateMode::NearestAvg);
+            const bool is_nearest_like_mode = (mode == InterpolateMode::Nearest ||
+                                               mode == InterpolateMode::NearestMax ||
+                                               mode == InterpolateMode::NearestMin ||
+                                               mode == InterpolateMode::NearestAvg);
             if (!is_nearest_like_mode) {
                 tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" +
                                               std::to_string(static_cast<int>(mode)));
@@ -1055,82 +1055,102 @@ namespace sd {
                 }
             }
 
-            bool pure_upsampling = true;
-            for(int64_t i=0; i<input.dim(); ++i) {
-                if (input.shape()[i] > output_shape[i]) pure_upsampling = false;
+            bool has_downsampling = false;
+            for (int64_t i = 0; i < input.dim(); ++i) {
+                if (input.shape()[i] > output_shape[i]) {
+                    has_downsampling = true;
+                    break;
+                }
             }
 
             Tensor<T> output(std::move(output_shape));
-            if (!pure_upsampling && (mode != InterpolateMode::Nearest)) {
-                // Pooling modes only differ from nearest mode when downsampling
-                for (int64_t flat_out = 0; flat_out < output.numel(); ++flat_out) {
-                    std::vector<int64_t> output_coord = tensor_unravel_index(flat_out, output.shape());
+            if (mode == InterpolateMode::Nearest || !has_downsampling) {
+                for (int64_t flat = 0; flat < output.numel(); ++flat) {
+                    std::vector<int64_t> output_coord = tensor_unravel_index(flat, output.shape());
+                    std::vector<int64_t> input_coord(static_cast<size_t>(input.dim()), 0);
+                    for (size_t i = 0; i < static_cast<size_t>(input.dim()); ++i) {
+                        input_coord[i] = output_coord[i] * input.shape()[i] / output.shape()[i];
+                    }
+                    output[flat] = input.index(input_coord);
+                }
 
-                    std::vector<int64_t> input_start(output.dim(), 0);
-                    std::vector<int64_t> input_end(output.dim(), 0);
+                return output;
+            }
 
-                    for (size_t i = 0; i < static_cast<size_t>(output.dim()); ++i) {
-                        int64_t I_dim = input.shape()[i];
-                        int64_t O_dim = output.shape()[i];
-
-                        if (I_dim > 0 && O_dim > 0) {
-                            input_start[i] = std::max(int64_t(0), static_cast<int64_t>(output_coord[i] * I_dim / O_dim));
-                            input_end[i]   = std::min(I_dim, ((output_coord[i] + 1) * I_dim + O_dim - 1) / O_dim);
-                        } else {
-                            input_start[i] = 0;
-                            input_end[i]   = 1;
-                        }
-                    }
+            auto init_reduction = [&]() -> T {
+                switch (mode) {
+                    case InterpolateMode::NearestMax:
+                        return std::numeric_limits<T>::lowest();
+                    case InterpolateMode::NearestMin:
+                        return std::numeric_limits<T>::max();
+                    case InterpolateMode::NearestAvg:
+                        return T(0);
+                    case InterpolateMode::Nearest:
+                        return T(0);
+                }
 
-                    T val;
-                    if (mode == InterpolateMode::NearestMax) {
-                        val = std::numeric_limits<T>::lowest();
-                    } else if(mode == InterpolateMode::NearestMin) {
-                        val = std::numeric_limits<T>::max();
-                    } else if(mode == InterpolateMode::NearestAvg) {
-                        val = T(0);
-                    }
+                tensor_throw_invalid_argument("Unsupported interpolate mode: mode=" +
+                                              std::to_string(static_cast<int>(mode)));
+            };
+
+            auto reduce_value = [&](T& acc, const T& sample) {
+                switch (mode) {
+                    case InterpolateMode::NearestMax:
+                        acc = std::max(acc, sample);
+                        break;
+                    case InterpolateMode::NearestMin:
+                        acc = std::min(acc, sample);
+                        break;
+                    case InterpolateMode::NearestAvg:
+                        acc += sample;
+                        break;
+                    case InterpolateMode::Nearest:
+                        break;
+                }
+            };
 
-                    bool done_window                      = false;
-                    std::vector<int64_t> current_in_coord = input_start;
+            // Reduction modes only differ from nearest mode when downsampling.
+            for (int64_t flat_out = 0; flat_out < output.numel(); ++flat_out) {
+                std::vector<int64_t> output_coord = tensor_unravel_index(flat_out, output.shape());
 
-                    while (!done_window) {
-                        if (mode == InterpolateMode::NearestMax) {
-                            val = std::max(val, input.index(current_in_coord));
-                        } else if(mode == InterpolateMode::NearestMin) {
-                            val = std::min(val, input.index(current_in_coord));
-                        } else if(mode == InterpolateMode::NearestAvg) {
-                            val += input.index(current_in_coord);
-                        }
+                std::vector<int64_t> input_start(output.dim(), 0);
+                std::vector<int64_t> input_end(output.dim(), 0);
+
+                for (size_t i = 0; i < static_cast<size_t>(output.dim()); ++i) {
+                    const int64_t input_dim  = input.shape()[i];
+                    const int64_t output_dim = output.shape()[i];
+
+                    input_start[i] = std::max(int64_t(0), static_cast<int64_t>(output_coord[i] * input_dim / output_dim));
+                    input_end[i]   = std::min(input_dim, ((output_coord[i] + 1) * input_dim + output_dim - 1) / output_dim);
+                }
+
+                T value                               = init_reduction();
+                bool done_window                      = false;
+                std::vector<int64_t> current_in_coord = input_start;
+
+                while (!done_window) {
+                    reduce_value(value, input.index(current_in_coord));
 
-                        for (int d = static_cast<int>(output.dim()) - 1; d >= 0; --d) {
-                            if (++current_in_coord[d] < input_end[d]) {
-                                break;
-                            }
-                            current_in_coord[d] = input_start[d];
-                            if (d == 0) {
-                                done_window = true;
-                            }
+                    for (int d = static_cast<int>(output.dim()) - 1; d >= 0; --d) {
+                        if (++current_in_coord[d] < input_end[d]) {
+                            break;
                         }
-                    }
-                    if (mode == InterpolateMode::NearestAvg) {
-                        int64_t window_size = 1;
-                        for (size_t i = 0; i < static_cast<size_t>(output.dim()); ++i) {
-                            window_size *= (input_end[i] - input_start[i]);
+                        current_in_coord[d] = input_start[d];
+                        if (d == 0) {
+                            done_window = true;
                         }
-                        val /= static_cast<T>(window_size);
                     }
-                    output[flat_out] = val;
                 }
-            } else {
-                for (int64_t flat = 0; flat < output.numel(); ++flat) {
-                    std::vector<int64_t> output_coord = tensor_unravel_index(flat, output.shape());
-                    std::vector<int64_t> input_coord(static_cast<size_t>(input.dim()), 0);
-                    for (size_t i = 0; i < static_cast<size_t>(input.dim()); ++i) {
-                        input_coord[i] = output_coord[i] * input.shape()[i] / output.shape()[i];
+
+                if (mode == InterpolateMode::NearestAvg) {
+                    int64_t window_size = 1;
+                    for (size_t i = 0; i < static_cast<size_t>(output.dim()); ++i) {
+                        window_size *= (input_end[i] - input_start[i]);
                     }
-                    output[flat] = input.index(input_coord);
+                    value /= static_cast<T>(window_size);
                 }
+
+                output[flat_out] = value;
             }
 
             return output;
@@ -1142,12 +1162,16 @@ namespace sd {
                                      const std::optional<std::vector<double>>& scale_factor,
                                      InterpolateMode mode = InterpolateMode::Nearest,
                                      bool align_corners   = false) {
-            if (mode != InterpolateMode::Nearest) {
-                tensor_throw_invalid_argument("Only nearest interpolate mode is implemented, got mode=" +
+            const bool is_nearest_like_mode = (mode == InterpolateMode::Nearest ||
+                                               mode == InterpolateMode::NearestMax ||
+                                               mode == InterpolateMode::NearestMin ||
+                                               mode == InterpolateMode::NearestAvg);
+            if (!is_nearest_like_mode) {
+                tensor_throw_invalid_argument("Only nearest-like interpolate modes are implemented, got mode=" +
                                               std::to_string(static_cast<int>(mode)));
             }
             if (align_corners) {
-                tensor_throw_invalid_argument("align_corners is not supported for nearest interpolate: input_shape=" +
+                tensor_throw_invalid_argument("align_corners is not supported for nearest-like interpolate: input_shape=" +
                                               tensor_shape_to_string(input.shape()));
             }
             if (size.has_value() == scale_factor.has_value()) {
@@ -1208,76 +1232,97 @@ namespace sd {
         }
 
+        // Max-pools `input` over its first two dimensions, shape()[0] and shape()[1]; every remaining
+        // dimension is carried through unchanged, so any tensor with dim() >= 2 is accepted.
+        //
+        // kernel_size, stride and padding each hold exactly two entries; entry i applies to dimension i.
+        // Padded positions are skipped instead of being compared, and a window that covers no in-bounds
+        // element produces T(0).
+        //
+        // Returns a tensor shaped like `input`, with dimensions 0 and 1 replaced by the pooled extents.
+        // Invalid arguments are reported through tensor_throw_invalid_argument.
         template <typename T>
-        inline Tensor<T> maxPool2D(const Tensor<T>& input,
-                                   std::vector<int64_t> kernel_size,
-                                   std::vector<int64_t> stride,
-                                   std::vector<int64_t> padding) {
-            if (input.dim() != 4) {
-                tensor_throw_invalid_argument("Tensor maxPool2D requires 4D input: input_dim=" +
+        inline Tensor<T> max_pool_2d(const Tensor<T>& input,
+                                     std::vector<int64_t> kernel_size,
+                                     std::vector<int64_t> stride,
+                                     std::vector<int64_t> padding) {
+            if (input.dim() < 2) {
+                tensor_throw_invalid_argument("Tensor max_pool_2d requires input_dim >= 2: input_dim=" +
                                               std::to_string(input.dim()) + ", input_shape=" +
                                               tensor_shape_to_string(input.shape()));
             }
             if (kernel_size.size() != 2 || stride.size() != 2 || padding.size() != 2) {
-                tensor_throw_invalid_argument("Tensor maxPool2D requires kernel_size, stride, and padding to have length 2");
+                tensor_throw_invalid_argument("Tensor max_pool_2d requires kernel_size, stride, and padding to have length 2");
             }
             for (size_t i = 0; i < 2; ++i) {
                 if (kernel_size[i] <= 0) {
-                    tensor_throw_invalid_argument("Tensor maxPool2D kernel_size must be positive: kernel_size=" +
+                    tensor_throw_invalid_argument("Tensor max_pool_2d kernel_size must be positive: kernel_size=" +
                                                   tensor_shape_to_string(kernel_size));
                 }
                 if (stride[i] <= 0) {
-                    tensor_throw_invalid_argument("Tensor maxPool2D stride must be positive: stride=" +
+                    tensor_throw_invalid_argument("Tensor max_pool_2d stride must be positive: stride=" +
                                                   tensor_shape_to_string(stride));
                 }
                 if (padding[i] < 0) {
-                    tensor_throw_invalid_argument("Tensor maxPool2D padding must be non-negative: padding=" +
+                    tensor_throw_invalid_argument("Tensor max_pool_2d padding must be non-negative: padding=" +
                                                   tensor_shape_to_string(padding));
                 }
             }
 
-            const int64_t in_height   = input.shape()[0];
-            const int64_t in_width    = input.shape()[1];
-            const int64_t in_channels = input.shape()[2];
-            const int64_t batch_size  = input.shape()[3];
+            const int64_t in_height = input.shape()[0];
+            const int64_t in_width  = input.shape()[1];
 
+            // Reject windows larger than the padded input up front. Integer division truncates toward
+            // zero, so a small negative numerator below would otherwise round up to an output extent of 1
+            // whenever the stride exceeds the shortfall, making acceptance depend on the stride.
+            const int64_t padded_height = in_height + 2 * padding[0];
+            const int64_t padded_width  = in_width + 2 * padding[1];
+            if (kernel_size[0] > padded_height || kernel_size[1] > padded_width) {
+                tensor_throw_invalid_argument("max_pool_2d kernel_size exceeds padded input: kernel_size=" +
+                                              tensor_shape_to_string(kernel_size) + ", input_shape=" +
+                                              tensor_shape_to_string(input.shape()));
+            }
+
             const int64_t out_height = (in_height + 2 * padding[0] - kernel_size[0]) / stride[0] + 1;
             const int64_t out_width  = (in_width + 2 * padding[1] - kernel_size[1]) / stride[1] + 1;
 
             if (out_height <= 0 || out_width <= 0) {
-                tensor_throw_invalid_argument("maxPool2D results in invalid output dimensions: " +
+                tensor_throw_invalid_argument("max_pool_2d results in invalid output dimensions: " +
                                               std::to_string(out_height) + "x" + std::to_string(out_width));
             }
 
-            Tensor<T> output({out_height, out_width, in_channels, batch_size});
-
-            for (int64_t oh = 0; oh < out_height; ++oh) {
-                for (int64_t ow = 0; ow < out_width; ++ow) {
-                    for (int64_t c = 0; c < in_channels; ++c) {
-                        for (int64_t b = 0; b < batch_size; ++b) {
-                            T max_val            = std::numeric_limits<T>::lowest();
-                            bool has_valid_input = false;
-
-                            for (int64_t kh = 0; kh < kernel_size[0]; ++kh) {
-                                for (int64_t kw = 0; kw < kernel_size[1]; ++kw) {
-                                    int64_t ih = oh * stride[0] + kh - padding[0];
-                                    int64_t iw = ow * stride[1] + kw - padding[1];
-
-                                    if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
-                                        T val           = input.index(ih, iw, c, b);
-                                        max_val         = std::max(max_val, val);
-                                        has_valid_input = true;
-                                    }
-                                }
-                            }
-
-                            if (has_valid_input) {
-                                output.index(oh, ow, c, b) = max_val;
-                            } else {
-                                output.index(oh, ow, c, b) = T(0);
-                            }
+            std::vector<int64_t> output_shape = input.shape();
+            output_shape[0]                   = out_height;
+            output_shape[1]                   = out_width;
+
+            Tensor<T> output(std::move(output_shape));
+
+            // Walk the output in flat order; only coordinates 0 and 1 are remapped into the input.
+            for (int64_t flat_out = 0; flat_out < output.numel(); ++flat_out) {
+                std::vector<int64_t> output_coord = tensor_unravel_index(flat_out, output.shape());
+                std::vector<int64_t> input_coord  = output_coord;
+
+                const int64_t oh = output_coord[0];
+                const int64_t ow = output_coord[1];
+
+                T max_val            = std::numeric_limits<T>::lowest();
+                bool has_valid_input = false;
+
+                for (int64_t kh = 0; kh < kernel_size[0]; ++kh) {
+                    for (int64_t kw = 0; kw < kernel_size[1]; ++kw) {
+                        const int64_t ih = oh * stride[0] + kh - padding[0];
+                        const int64_t iw = ow * stride[1] + kw - padding[1];
+
+                        if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
+                            input_coord[0]  = ih;
+                            input_coord[1]  = iw;
+                            max_val         = std::max(max_val, input.index(input_coord));
+                            has_valid_input = true;
                         }
                     }
                 }
+
+                // A window lying entirely in the padding has nothing to compare; emit zero for it.
+                output[flat_out] = has_valid_input ? max_val : T(0);
             }
             return output;
         }

From 090b914508982a28976c60fe3bbbf4bf9faee39f Mon Sep 17 00:00:00 2001
From: leejet <leejet714@gmail.com>
Date: Mon, 6 Apr 2026 00:42:56 +0800
Subject: [PATCH 4/4] remove unused get_flat_index

---
 src/tensor.hpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/tensor.hpp b/src/tensor.hpp
index 8aba69414..33302b056 100644
--- a/src/tensor.hpp
+++ b/src/tensor.hpp
@@ -303,10 +303,6 @@ namespace sd {
             return data_.at(static_cast<size_t>(index));
         }
 
-        int64_t get_flat_index(const std::vector<int64_t>& coord) const {
-            return static_cast<int64_t>(offset_of(coord));
-        }
-
     private:
         size_t offset_of(const std::vector<int64_t>& coord) const {
             if (coord.size() != shape_.size()) {