diff --git a/08_bilateral_filter/README.md b/08_bilateral_filter/README.md index c4034f0..cdc12d5 100644 --- a/08_bilateral_filter/README.md +++ b/08_bilateral_filter/README.md @@ -1,16 +1,68 @@ -## 简介 +# 实时双边滤波-CUDA -凭借你 InfiniTensor 训练营的经历和人工智能与并行计算方面的出色成绩,你受雇于一家领先的虚拟现实公司,负责为下一代 VR 头显开发实时图像增强模块。用户经常抱怨在弱光环境下看到的图像噪点严重,而现有降噪算法要么模糊细节,要么延迟太高导致眩晕。 +## 项目结构 -你提出采用双边滤波算法,它能够在去除噪声的同时保持边缘清晰,非常适合VR图像处理。但双边滤波计算量极大,每个像素需要访问整个邻域并计算复杂的权重,难以在移动端实时运行。幸运的是,VR 头显内置了你知道的某单位生产的黑科技 GPU,你需要利用 CUDA 实现一个高度优化的双边滤波核,能够对 4K 分辨率、60 帧/秒的视频流进行实时处理。这将大幅提升用户的沉浸感和舒适度,为公司抢占市场提供核心技术。 +当前项目结构如下: -## 任务内容 -开发一个 CUDA 程序,实现并行双边滤波。 +```txt +实时双边滤波-CUDA/ +├─ 朴素实现代码及结果/ +├─ 实时双边滤波-CUDA论文.pdf +├─ 项目要求.md +├─ baboon.bmp +├─ girl.bmp +├─ main.cu +├─ monarch.bmp +├─ params.txt +└─ README.md +``` -## 📬 有疑问? +其中: -更多详细信息和要求请参考本季度项目文档。 +- `main.cu`:共享内存 + 网格跨步循环优化后代码 +- `params.txt`:双边滤波参数文件 +- `baboon.bmp`、`girl.bmp`、`monarch.bmp`:测试图像 +- `实时双边滤波-CUDA论文.pdf`:最终提交的总结报告 PDF +- `项目要求.md`:课程项目要求 +- `朴素实现代码及结果/`:朴素版本代码及阶段性结果记录 -可以在项目群里直接询问导师和助教! +## 英伟达平台编译与运行 -Good luck and happy coding! 
🚀 +在已经安装 CUDA 和 OpenCV 的英伟达平台上,可以直接使用下面的命令编译并运行: + +```bash +nvcc -std=c++11 main.cu -o bilateral_filter \ +-I/usr/local/include/opencv4 \ +-L/usr/local/lib \ +-lopencv_core \ +-lopencv_imgproc \ +-lopencv_imgcodecs + +./bilateral_filter <测试图像名称> params.txt +``` + +例如: + +```bash +./bilateral_filter monarch.bmp params.txt +``` + +## 参数文件格式 + +`params.txt` 示例: + +```txt +radius = 5 +sigma_spatial = 3.0 +sigma_color = 30.0 +use_adaptive_radius = false +``` + +## 输出内容 + +程序运行后会输出: + +- 滤波后的 raw 文件 +- 滤波后的图像文件 +- 性能日志 +- 终端中的 GPU/CPU 时间、吞吐量、加速比和 MAE diff --git a/08_bilateral_filter/baboon.bmp b/08_bilateral_filter/baboon.bmp new file mode 100644 index 0000000..2aef6c9 Binary files /dev/null and b/08_bilateral_filter/baboon.bmp differ diff --git a/08_bilateral_filter/girl.bmp b/08_bilateral_filter/girl.bmp new file mode 100644 index 0000000..42d5c98 Binary files /dev/null and b/08_bilateral_filter/girl.bmp differ diff --git a/08_bilateral_filter/main.cu b/08_bilateral_filter/main.cu new file mode 100644 index 0000000..ea8806b --- /dev/null +++ b/08_bilateral_filter/main.cu @@ -0,0 +1,607 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// 错误检查宏 +#define CUDA_CHECK(call) do { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) { \ + fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +constexpr int blockSizeX = 32; +constexpr int blockSizeY = 8; +constexpr int maxGrayDistance = 255; +constexpr int maxColorDistance = 255 * 3; + +struct BilateralParams { + int radius = 5; + float sigmaSpatial = 3.0f; + float sigmaColor = 30.0f; + bool useAdaptiveRadius = false; + float noiseLevel = 0.0f; +}; + +// 灰度图双边滤波核函数 +__global__ void BilateralFilterGrayKernel( + const uchar* inputImage, + uchar* outputImage, + const float* spatialLookupWeights, + const float* colorLookupWeights, + int imageWidth, + int imageHeight, 
+ size_t inputStep, + size_t outputStep, + int filterRadius +) { + extern __shared__ uchar sharedTile[]; + + int tileWidth = blockDim.x + 2 * filterRadius; + int tileHeight = blockDim.y + 2 * filterRadius; + int paddedWidth = imageWidth + 2 * filterRadius; + int paddedHeight = imageHeight + 2 * filterRadius; + + for (int tileBlockY = blockIdx.y; tileBlockY * blockDim.y < imageHeight; tileBlockY += gridDim.y) { + for (int tileBlockX = blockIdx.x; tileBlockX * blockDim.x < imageWidth; tileBlockX += gridDim.x) { + int tileStartX = tileBlockX * blockDim.x; + int tileStartY = tileBlockY * blockDim.y; + + for (int localY = threadIdx.y; localY < tileHeight; localY += blockDim.y) { + for (int localX = threadIdx.x; localX < tileWidth; localX += blockDim.x) { + int globalX = tileStartX + localX; + int globalY = tileStartY + localY; + + if (globalX > paddedWidth - 1) globalX = paddedWidth - 1; + if (globalY > paddedHeight - 1) globalY = paddedHeight - 1; + + sharedTile[localY * tileWidth + localX] = inputImage[globalY * inputStep + globalX]; + } + } + __syncthreads(); + + int pixelX = tileStartX + threadIdx.x; + int pixelY = tileStartY + threadIdx.y; + + if (pixelX < imageWidth && pixelY < imageHeight) { + int centerLocalX = threadIdx.x + filterRadius; + int centerLocalY = threadIdx.y + filterRadius; + + float centerValue = static_cast(sharedTile[centerLocalY * tileWidth + centerLocalX]); + + float weightValueSum = 0.0f; + float weightSum = 0.0f; + int spatialIndex = 0; + + for (int offsetY = -filterRadius; offsetY <= filterRadius; ++offsetY) { + for (int offsetX = -filterRadius; offsetX <= filterRadius; ++offsetX, ++spatialIndex) { + float nearValue = static_cast(sharedTile[(centerLocalY + offsetY) * tileWidth + (centerLocalX + offsetX)]); + int colorDistance = static_cast(fabsf(nearValue - centerValue)); + + if (colorDistance > maxGrayDistance) colorDistance = maxGrayDistance; + + float finalWeight = spatialLookupWeights[spatialIndex] * colorLookupWeights[colorDistance]; + 
weightValueSum += finalWeight * nearValue; + weightSum += finalWeight; + } + } + + outputImage[pixelY * outputStep + pixelX] = static_cast(weightValueSum / weightSum); + } + __syncthreads(); + } + } +} + +// 彩色图双边滤波核函数 +__global__ void BilateralFilterColorKernel( + const uchar* inputImage, + uchar* outputImage, + const float* spatialLookupWeights, + const float* colorLookupWeights, + int imageWidth, + int imageHeight, + size_t inputStep, + size_t outputStep, + int filterRadius +) { + extern __shared__ uchar sharedTile[]; + + int tileWidth = blockDim.x + 2 * filterRadius; + int tileHeight = blockDim.y + 2 * filterRadius; + int paddedWidth = imageWidth + 2 * filterRadius; + int paddedHeight = imageHeight + 2 * filterRadius; + + // 网格跨步循环 + for (int tileBlockY = blockIdx.y; tileBlockY * blockDim.y < imageHeight; tileBlockY += gridDim.y) { + for (int tileBlockX = blockIdx.x; tileBlockX * blockDim.x < imageWidth; tileBlockX += gridDim.x) { + int tileStartX = tileBlockX * blockDim.x; + int tileStartY = tileBlockY * blockDim.y; + + for (int localY = threadIdx.y; localY < tileHeight; localY += blockDim.y) { + for (int localX = threadIdx.x; localX < tileWidth; localX += blockDim.x) { + int globalX = tileStartX + localX; + int globalY = tileStartY + localY; + + if (globalX > paddedWidth - 1) globalX = paddedWidth - 1; + if (globalY > paddedHeight - 1) globalY = paddedHeight - 1; + + size_t globalIndex = globalY * inputStep + globalX * 3; + size_t sharedIndex = (localY * tileWidth + localX) * 3; + + sharedTile[sharedIndex + 0] = inputImage[globalIndex + 0]; + sharedTile[sharedIndex + 1] = inputImage[globalIndex + 1]; + sharedTile[sharedIndex + 2] = inputImage[globalIndex + 2]; + } + } + __syncthreads(); + + int pixelX = tileStartX + threadIdx.x; + int pixelY = tileStartY + threadIdx.y; + + if (pixelX < imageWidth && pixelY < imageHeight) { + int centerLocalX = threadIdx.x + filterRadius; + int centerLocalY = threadIdx.y + filterRadius; + + size_t centerIndex = (centerLocalY * 
tileWidth + centerLocalX) * 3; + + float centerBlue = static_cast(sharedTile[centerIndex + 0]); + float centerGreen = static_cast(sharedTile[centerIndex + 1]); + float centerRed = static_cast(sharedTile[centerIndex + 2]); + + float blueSum = 0.0f; + float greenSum = 0.0f; + float redSum = 0.0f; + float weightSum = 0.0f; + int spatialIndex = 0; + + for (int offsetY = -filterRadius; offsetY <= filterRadius; ++offsetY) { + for (int offsetX = -filterRadius; offsetX <= filterRadius; ++offsetX, ++spatialIndex) { + size_t nearIndex = ((centerLocalY + offsetY) * tileWidth + (centerLocalX + offsetX)) * 3; + + float nearBlue = static_cast(sharedTile[nearIndex + 0]); + float nearGreen = static_cast(sharedTile[nearIndex + 1]); + float nearRed = static_cast(sharedTile[nearIndex + 2]); + + int colorDistance = + static_cast(fabsf(nearBlue - centerBlue)) + + static_cast(fabsf(nearGreen - centerGreen)) + + static_cast(fabsf(nearRed - centerRed)); + + if (colorDistance > maxColorDistance) colorDistance = maxColorDistance; + float finalWeight = spatialLookupWeights[spatialIndex] * colorLookupWeights[colorDistance]; + + blueSum += finalWeight * nearBlue; + greenSum += finalWeight * nearGreen; + redSum += finalWeight * nearRed; + weightSum += finalWeight; + } + } + + size_t outputIndex = pixelY * outputStep + pixelX * 3; + + outputImage[outputIndex + 0] = static_cast(blueSum / weightSum); + outputImage[outputIndex + 1] = static_cast(greenSum / weightSum); + outputImage[outputIndex + 2] = static_cast(redSum / weightSum); + } + __syncthreads(); + } + } +} + +// 创建空间权重和颜色权重查找表 +void lookupWeights( + std::vector& spatialLookupWeights, + std::vector& colorLookupWeights, + int filterRadius, + float spatialSigma, + float colorSigma, + int colorTableSize +) { + int windowLength = 2 * filterRadius + 1; + + spatialLookupWeights.resize(windowLength * windowLength); + colorLookupWeights.resize(colorTableSize + 1); + + float spatialFactor = -0.5f / (spatialSigma * spatialSigma); + float 
colorCoefficient = -0.5f / (colorSigma * colorSigma); + int spatialIndex = 0; + + for (int offsetY = -filterRadius; offsetY <= filterRadius; ++offsetY) { + for (int offsetX = -filterRadius; offsetX <= filterRadius; ++offsetX, ++spatialIndex) { + float distanceSquare = static_cast(offsetX * offsetX + offsetY * offsetY); + spatialLookupWeights[spatialIndex] = expf(distanceSquare * spatialFactor); + } + } + + for (int colorDistance = 0; colorDistance <= colorTableSize; ++colorDistance) { + colorLookupWeights[colorDistance] = expf(colorDistance * colorDistance * colorCoefficient); + } +} + +float calculateMeanAbsoluteError(const cv::Mat& image1, const cv::Mat& image2) { + cv::Mat diff; + cv::absdiff(image1, image2, diff); + cv::Scalar meanVal = cv::mean(diff); + float mae = 0.0f; + for (int c = 0; c < image1.channels(); ++c) { + mae += static_cast(meanVal[c]); + } + return mae / image1.channels(); +} + +float estimateNoiseLevel(const cv::Mat& image) { + cv::Mat gray; + if (image.channels() == 3) { + cv::cvtColor(image, gray, cv::COLOR_BGR2GRAY); + } else { + gray = image.clone(); + } + + cv::Mat laplacian; + cv::Laplacian(gray, laplacian, CV_32F); + cv::Mat absLaplacian = cv::abs(laplacian); + + cv::Scalar meanVal, stdVal; + cv::meanStdDev(absLaplacian, meanVal, stdVal); + return static_cast(stdVal[0]); +} + +int selectAdaptiveRadius(const cv::Mat& image, float baseRadius, float noiseLevel) { + if (noiseLevel <= 0) { + noiseLevel = estimateNoiseLevel(image); + } + + float noiseFactor = std::min(noiseLevel / 30.0f, 2.0f); + int adaptiveRadius = static_cast(baseRadius * (1.0f + noiseFactor * 0.5f)); + return std::max(2, std::min(adaptiveRadius, 10)); +} + +bool parseParamsFile(const std::string& filename, BilateralParams& params) { + std::ifstream file(filename); + if (!file.is_open()) { + std::cerr << "Warning: Cannot open params file: " << filename << ", using default values." 
<< std::endl; + return false; + } + + std::string line; + while (std::getline(file, line)) { + if (line.empty() || line[0] == '#') continue; + + std::istringstream iss(line); + std::string key; + if (std::getline(iss, key, '=')) { + key.erase(0, key.find_first_not_of(" \t")); + key.erase(key.find_last_not_of(" \t") + 1); + + std::string value; + if (std::getline(iss, value, '#')) { + value.erase(0, value.find_first_not_of(" \t")); + value.erase(value.find_last_not_of(" \t") + 1); + } + + if (key == "radius") { + params.radius = std::stoi(value); + } else if (key == "sigma_spatial") { + params.sigmaSpatial = std::stof(value); + } else if (key == "sigma_color") { + params.sigmaColor = std::stof(value); + } else if (key == "use_adaptive_radius") { + params.useAdaptiveRadius = (value == "true" || value == "1"); + } + } + } + return true; +} + +bool writeRawImage(const std::string& filename, const cv::Mat& image) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Error: Cannot write to file: " << filename << std::endl; + return false; + } + + int width = image.cols; + int height = image.rows; + int channels = image.channels(); + + file.write(reinterpret_cast(&width), sizeof(int)); + file.write(reinterpret_cast(&height), sizeof(int)); + file.write(reinterpret_cast(&channels), sizeof(int)); + + for (int y = 0; y < height; ++y) { + file.write(reinterpret_cast(image.ptr(y)), width * channels); + } + + return file.good(); +} + +cv::Mat readRawImage(const std::string& filename) { + std::ifstream file(filename, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Error: Cannot open file: " << filename << std::endl; + return cv::Mat(); + } + + int width, height, channels; + file.read(reinterpret_cast(&width), sizeof(int)); + file.read(reinterpret_cast(&height), sizeof(int)); + file.read(reinterpret_cast(&channels), sizeof(int)); + + int type = (channels == 1) ? 
CV_8UC1 : CV_8UC3; + cv::Mat image(height, width, type); + + for (int y = 0; y < height; ++y) { + file.read(reinterpret_cast(image.ptr(y)), width * channels); + } + + return image; +} + +void writePerformanceLog( + const std::string& filename, + const std::string& imagePath, + int width, + int height, + int channels, + int radius, + float spatialSigma, + float colorSigma, + double gpuTimeMs, + double cpuTimeMs, + float mae +) { + std::ofstream file(filename); + if (!file.is_open()) { + std::cerr << "Error: Cannot write log file: " << filename << std::endl; + return; + } + + double totalPixels = static_cast(width) * height; + double gpuThroughput = totalPixels / (gpuTimeMs / 1000.0) / 1000000.0; + double cpuThroughput = totalPixels / (cpuTimeMs / 1000.0) / 1000000.0; + double speedup = cpuTimeMs / gpuTimeMs; + + file << "========== Bilateral Filter Performance Log ==========" << std::endl; + file << "Image: " << imagePath << std::endl; + file << "Resolution: " << width << "x" << height << std::endl; + file << "Channels: " << channels << std::endl; + file << "Total Pixels: " << totalPixels << std::endl; + file << std::endl; + file << "Filter Parameters:" << std::endl; + file << " Radius: " << radius << std::endl; + file << " Sigma Spatial: " << spatialSigma << std::endl; + file << " Sigma Color: " << colorSigma << std::endl; + file << std::endl; + file << "Performance Metrics:" << std::endl; + file << " GPU Time: " << gpuTimeMs << " ms" << std::endl; + file << " GPU Throughput: " << gpuThroughput << " MP/s" << std::endl; + file << " CPU Time: " << cpuTimeMs << " ms" << std::endl; + file << " CPU Throughput: " << cpuThroughput << " MP/s" << std::endl; + file << " Speedup (CPU/GPU): " << speedup << "x" << std::endl; + file << std::endl; + file << "Correctness:" << std::endl; + file << " MAE vs OpenCV: " << mae << std::endl; + file << " Pass (MAE < 1.0): " << (mae < 1.0 ? 
"YES" : "NO") << std::endl; + file << "======================================================" << std::endl; +} + +double measureGPUTime( + const cv::Mat& inputImage, + const cv::Mat& paddedImage, + cv::Mat& resultImage, + const std::vector& spatialLookupWeights, + const std::vector& colorLookupWeights, + int filterRadius +) { + uchar* deviceInputImage = nullptr; + uchar* deviceOutputImage = nullptr; + float* deviceSpatialLookupWeights = nullptr; + float* deviceColorLookupWeights = nullptr; + + CUDA_CHECK(cudaMalloc(&deviceInputImage, paddedImage.step * paddedImage.rows)); + CUDA_CHECK(cudaMalloc(&deviceOutputImage, resultImage.step * resultImage.rows)); + CUDA_CHECK(cudaMalloc(&deviceSpatialLookupWeights, spatialLookupWeights.size() * sizeof(float))); + CUDA_CHECK(cudaMalloc(&deviceColorLookupWeights, colorLookupWeights.size() * sizeof(float))); + + CUDA_CHECK(cudaMemcpy(deviceInputImage, paddedImage.data, paddedImage.step * paddedImage.rows, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(deviceSpatialLookupWeights, spatialLookupWeights.data(), spatialLookupWeights.size() * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(deviceColorLookupWeights, colorLookupWeights.data(), colorLookupWeights.size() * sizeof(float), cudaMemcpyHostToDevice)); + + dim3 blockSize(blockSizeX, blockSizeY); + + //如果 block 数量直接铺满整张图,网格跨步循环往往只会执行一轮;适当限制 grid 后,一个 block 才会继续跨步处理后续 tile。 + cudaDeviceProp deviceInfo{}; + CUDA_CHECK(cudaGetDeviceProperties(&deviceInfo, 0)); + int neededGridX = (inputImage.cols + blockSizeX - 1) / blockSizeX; + int neededGridY = (inputImage.rows + blockSizeY - 1) / blockSizeY; + int launchGridX = std::max(1, std::min(neededGridX, deviceInfo.multiProcessorCount / 4)); + int launchGridY = std::max(1, std::min(neededGridY, 4)); + dim3 blockGroup(launchGridX, launchGridY); + + cudaEvent_t start, stop; + CUDA_CHECK(cudaEventCreate(&start)); + CUDA_CHECK(cudaEventCreate(&stop)); + + //共享内存 tile 的大小取决于线程块大小、滤波半径和图像通道数 + if (inputImage.channels() == 
1) { + size_t sharedMemoryBytes = + static_cast(blockSize.x + 2 * filterRadius) * + static_cast(blockSize.y + 2 * filterRadius) * + sizeof(uchar); + + BilateralFilterGrayKernel<<>>( + deviceInputImage, deviceOutputImage, deviceSpatialLookupWeights, deviceColorLookupWeights, + inputImage.cols, inputImage.rows, paddedImage.step, resultImage.step, filterRadius); + } else { + size_t sharedMemoryBytes = + static_cast(blockSize.x + 2 * filterRadius) * + static_cast(blockSize.y + 2 * filterRadius) * + 3 * sizeof(uchar); + + BilateralFilterColorKernel<<>>( + deviceInputImage, deviceOutputImage, deviceSpatialLookupWeights, deviceColorLookupWeights, + inputImage.cols, inputImage.rows, paddedImage.step, resultImage.step, filterRadius); + } + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + + const int iterations = 10; + CUDA_CHECK(cudaEventRecord(start)); + for (int i = 0; i < iterations; ++i) { + if (inputImage.channels() == 1) { + size_t sharedMemoryBytes = + static_cast(blockSize.x + 2 * filterRadius) * + static_cast(blockSize.y + 2 * filterRadius) * + sizeof(uchar); + BilateralFilterGrayKernel<<>>( + deviceInputImage, deviceOutputImage, deviceSpatialLookupWeights, deviceColorLookupWeights, + inputImage.cols, inputImage.rows, paddedImage.step, resultImage.step, filterRadius); + } else { + size_t sharedMemoryBytes = + static_cast(blockSize.x + 2 * filterRadius) * + static_cast(blockSize.y + 2 * filterRadius) * + 3 * sizeof(uchar); + BilateralFilterColorKernel<<>>( + deviceInputImage, deviceOutputImage, deviceSpatialLookupWeights, deviceColorLookupWeights, + inputImage.cols, inputImage.rows, paddedImage.step, resultImage.step, filterRadius); + } + } + CUDA_CHECK(cudaEventRecord(stop)); + CUDA_CHECK(cudaEventSynchronize(stop)); + + float elapsedTime = 0.0f; + CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop)); + double avgTime = elapsedTime / iterations; + + CUDA_CHECK(cudaMemcpy(resultImage.data, deviceOutputImage, resultImage.step * 
resultImage.rows, cudaMemcpyDeviceToHost)); + + CUDA_CHECK(cudaEventDestroy(start)); + CUDA_CHECK(cudaEventDestroy(stop)); + CUDA_CHECK(cudaFree(deviceInputImage)); + CUDA_CHECK(cudaFree(deviceOutputImage)); + CUDA_CHECK(cudaFree(deviceSpatialLookupWeights)); + CUDA_CHECK(cudaFree(deviceColorLookupWeights)); + + return avgTime; +} + +double measureCPUTime(const cv::Mat& inputImage, cv::Mat& outputImage, int diameter, float colorSigma, float spatialSigma) { + const int iterations = 5; + cv::bilateralFilter(inputImage, outputImage, diameter, colorSigma, spatialSigma); + + auto start = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < iterations; ++i) { + cv::bilateralFilter(inputImage, outputImage, diameter, colorSigma, spatialSigma); + } + auto end = std::chrono::high_resolution_clock::now(); + + std::chrono::duration elapsed = end - start; + return elapsed.count() / iterations; +} + +int main(int argc, char** argv) { + std::string imagePath = (argc > 1) ? argv[1] : "test.jpg"; + std::string paramsFile = (argc > 2) ? argv[2] : "params.txt"; + + BilateralParams params; + parseParamsFile(paramsFile, params); + + std::cout << "Parameters:" << std::endl; + std::cout << " Radius: " << params.radius << std::endl; + std::cout << " Sigma Spatial: " << params.sigmaSpatial << std::endl; + std::cout << " Sigma Color: " << params.sigmaColor << std::endl; + std::cout << " Use Adaptive Radius: " << (params.useAdaptiveRadius ? "true" : "false") << std::endl; + + cv::Mat inputImage = cv::imread(imagePath, cv::IMREAD_UNCHANGED); + if (inputImage.empty()) { + std::cerr << "Error: Cannot load image: " << imagePath << std::endl; + return EXIT_FAILURE; + } + + if (inputImage.channels() == 4) { + cv::cvtColor(inputImage, inputImage, cv::COLOR_BGRA2BGR); + } + + if (inputImage.channels() != 1 && inputImage.channels() != 3) { + std::cerr << "Error: Only 1-channel (grayscale) or 3-channel (RGB) images are supported." 
<< std::endl; + return EXIT_FAILURE; + } + + int filterRadius = params.radius; + if (params.useAdaptiveRadius) { + filterRadius = selectAdaptiveRadius(inputImage, params.radius, params.noiseLevel); + std::cout << "Adaptive radius selected: " << filterRadius << std::endl; + } + + int colorTableSize = (inputImage.channels() == 1) ? maxGrayDistance : maxColorDistance; + std::vector spatialLookupWeights; + std::vector colorLookupWeights; + lookupWeights(spatialLookupWeights, colorLookupWeights, filterRadius, params.sigmaSpatial, params.sigmaColor, colorTableSize); + + cv::Mat paddedImage; + cv::copyMakeBorder(inputImage, paddedImage, filterRadius, filterRadius, filterRadius, filterRadius, cv::BORDER_REFLECT); + cv::Mat resultImage = cv::Mat::zeros(inputImage.size(), inputImage.type()); + + double gpuTimeMs = measureGPUTime(inputImage, paddedImage, resultImage, spatialLookupWeights, colorLookupWeights, filterRadius); + + cv::Mat referenceImage; + double cpuTimeMs = measureCPUTime(inputImage, referenceImage, 2 * filterRadius + 1, params.sigmaColor, params.sigmaSpatial); + + float mae = calculateMeanAbsoluteError(resultImage, referenceImage); + + std::cout << std::endl; + std::cout << "========== Results ==========" << std::endl; + std::cout << "Image: " << imagePath << " (" << inputImage.cols << "x" << inputImage.rows << ", " << inputImage.channels() << " channels)" << std::endl; + std::cout << "Filter Radius: " << filterRadius << std::endl; + std::cout << std::endl; + std::cout << "Performance:" << std::endl; + std::cout << " GPU Time: " << gpuTimeMs << " ms" << std::endl; + std::cout << " GPU Throughput: " << (inputImage.cols * inputImage.rows / (gpuTimeMs / 1000.0) / 1000000.0) << " MP/s" << std::endl; + std::cout << " CPU Time: " << cpuTimeMs << " ms" << std::endl; + std::cout << " CPU Throughput: " << (inputImage.cols * inputImage.rows / (cpuTimeMs / 1000.0) / 1000000.0) << " MP/s" << std::endl; + std::cout << " Speedup (CPU/GPU): " << (cpuTimeMs / gpuTimeMs) << "x" 
<< std::endl; + std::cout << std::endl; + std::cout << "Correctness:" << std::endl; + std::cout << " MAE vs OpenCV: " << mae << std::endl; + std::cout << " Pass (MAE < 1.0): " << (mae < 1.0 ? "YES" : "NO") << std::endl; + std::cout << "=============================" << std::endl; + + std::string outputFilename = "bilateralFilter_result.raw"; + if (!writeRawImage(outputFilename, resultImage)) { + std::cerr << "Warning: Failed to write output raw file." << std::endl; + } + + cv::imwrite("bilateralFilter_result.jpg", resultImage); + + writePerformanceLog( + "performance_log.txt", + imagePath, + inputImage.cols, + inputImage.rows, + inputImage.channels(), + filterRadius, + params.sigmaSpatial, + params.sigmaColor, + gpuTimeMs, + cpuTimeMs, + mae + ); + + if (mae >= 1.0f) { + std::cerr << "Warning: MAE (" << mae << ") >= 1.0, results may not meet accuracy requirements." << std::endl; + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} diff --git a/08_bilateral_filter/monarch.bmp b/08_bilateral_filter/monarch.bmp new file mode 100644 index 0000000..85b01ef Binary files /dev/null and b/08_bilateral_filter/monarch.bmp differ diff --git a/08_bilateral_filter/params.txt b/08_bilateral_filter/params.txt new file mode 100644 index 0000000..7f62ba4 --- /dev/null +++ b/08_bilateral_filter/params.txt @@ -0,0 +1,14 @@ +# 双边滤波参数配置文件 + +# 窗口半径,实际窗口大小为 (2*radius+1)^2 +radius = 5 + +# 空间域标准差 +sigma_spatial = 3.0 + +# 颜色域标准差(对于8位图像,范围0-255) +sigma_color = 30.0 + +# 是否启用自适应半径选择:根据图像噪声自动调整滤波半径 +# 可选值: true / false +use_adaptive_radius = false diff --git "a/08_bilateral_filter/\345\256\236\346\227\266\345\217\214\350\276\271\346\273\244\346\263\242-CUDA\350\256\272\346\226\207.pdf" "b/08_bilateral_filter/\345\256\236\346\227\266\345\217\214\350\276\271\346\273\244\346\263\242-CUDA\350\256\272\346\226\207.pdf" new file mode 100644 index 0000000..eabcb35 Binary files /dev/null and 
"b/08_bilateral_filter/\345\256\236\346\227\266\345\217\214\350\276\271\346\273\244\346\263\242-CUDA\350\256\272\346\226\207.pdf" differ diff --git "a/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/baboon.jpg" "b/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/baboon.jpg" new file mode 100644 index 0000000..337312a Binary files /dev/null and "b/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/baboon.jpg" differ diff --git "a/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/girl.jpg" "b/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/girl.jpg" new file mode 100644 index 0000000..686edaa Binary files /dev/null and "b/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/girl.jpg" differ diff --git "a/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/main.cu" "b/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/main.cu" new file mode 100644 index 0000000..9175249 --- /dev/null +++ "b/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/main.cu" @@ -0,0 +1,548 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// 错误检查宏 +#define CUDA_CHECK(call) do { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) { \ + fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + 
exit(EXIT_FAILURE); \ + } \ +} while (0) + +constexpr int blockSizeX = 32; +constexpr int blockSizeY = 8; +constexpr int maxGrayDistance = 255; +constexpr int maxColorDistance = 255 * 3; + +// 参数结构体 +struct BilateralParams { + int radius = 5; + float sigmaSpatial = 3.0f; + float sigmaColor = 30.0f; + bool useAdaptiveRadius = false; + float noiseLevel = 0.0f; // 用于自适应半径估计的噪声水平 +}; + +// 灰度图双边滤波核函数 +__global__ void BilateralFilterGrayKernel( + const uchar* inputImage, + uchar* outputImage, + const float* spatialLookupWeights, + const float* colorLookupWeights, + int imageWidth, + int imageHeight, + size_t inputStep, + size_t outputStep, + int filterRadius +) { + int pixelX = blockIdx.x * blockDim.x + threadIdx.x; + int pixelY = blockIdx.y * blockDim.y + threadIdx.y; + + if (pixelX >= imageWidth || pixelY >= imageHeight) return; + + int paddedX = pixelX + filterRadius; + int paddedY = pixelY + filterRadius; + + // 计算中心像素在输入图像中的内存地址 + float centerValue = static_cast(inputImage[paddedY * inputStep + paddedX]); + + float weightValueSum = 0.0f; + float weightSum = 0.0f; + int spatialIndex = 0; + + for (int offsetY = -filterRadius; offsetY <= filterRadius; ++offsetY) { + for (int offsetX = -filterRadius; offsetX <= filterRadius; ++offsetX, ++spatialIndex) { + float nearValue = static_cast(inputImage[(paddedY + offsetY) * inputStep + (paddedX + offsetX)]); + int colorDistance = static_cast(fabsf(nearValue - centerValue)); + + if (colorDistance > maxGrayDistance) colorDistance = maxGrayDistance; + + float finalWeight = spatialLookupWeights[spatialIndex] * colorLookupWeights[colorDistance]; + weightValueSum += finalWeight * nearValue; + weightSum += finalWeight; + } + } + + outputImage[pixelY * outputStep + paddedX] = static_cast(weightValueSum / weightSum); +} + +// 彩色图双边滤波核函数 +__global__ void BilateralFilterColorKernel( + const uchar* inputImage, + uchar* outputImage, + const float* spatialLookupWeights, + const float* colorLookupWeights, + int imageWidth, + int 
imageHeight,
+    size_t inputStep,
+    size_t outputStep,
+    int filterRadius
+) {
+    // One thread per output pixel; expects a 2D launch covering imageWidth x imageHeight.
+    int pixelX = blockIdx.x * blockDim.x + threadIdx.x;
+    int pixelY = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (pixelX >= imageWidth || pixelY >= imageHeight) return;
+
+    // Coordinates inside the border-padded input image.
+    int paddedX = pixelX + filterRadius;
+    int paddedY = pixelY + filterRadius;
+
+    // NOTE(review): the archived patch was garbled here (angle-bracket spans stripped,
+    // and the grayscale body fused with the color kernel). The 1-channel body below is
+    // reconstructed to mirror the color kernel -- verify against the original main.cu.
+    float centerValue = static_cast<float>(inputImage[paddedY * inputStep + paddedX]);
+
+    float valueSum = 0.0f;
+    float weightSum = 0.0f;
+    int spatialIndex = 0;
+
+    for (int offsetY = -filterRadius; offsetY <= filterRadius; ++offsetY) {
+        for (int offsetX = -filterRadius; offsetX <= filterRadius; ++offsetX, ++spatialIndex) {
+            float nearValue = static_cast<float>(inputImage[(paddedY + offsetY) * inputStep + (paddedX + offsetX)]);
+
+            // Intensity distance indexes the precomputed range-weight table.
+            int colorDistance = static_cast<int>(fabsf(nearValue - centerValue));
+            if (colorDistance > maxGrayDistance) colorDistance = maxGrayDistance;
+
+            float finalWeight = spatialLookupWeights[spatialIndex] * colorLookupWeights[colorDistance];
+            valueSum += finalWeight * nearValue;
+            weightSum += finalWeight;
+        }
+    }
+
+    outputImage[pixelY * outputStep + pixelX] = static_cast<uchar>(valueSum / weightSum);
+}
+
+// BGR color bilateral filter kernel. Same launch contract as the grayscale kernel;
+// pixels are interleaved 3-byte BGR, rows are inputStep/outputStep bytes apart.
+__global__ void BilateralFilterColorKernel(
+    const uchar* inputImage,
+    uchar* outputImage,
+    const float* spatialLookupWeights,
+    const float* colorLookupWeights,
+    int imageWidth,
+    int imageHeight,
+    size_t inputStep,
+    size_t outputStep,
+    int filterRadius
+) {
+    int pixelX = blockIdx.x * blockDim.x + threadIdx.x;
+    int pixelY = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (pixelX >= imageWidth || pixelY >= imageHeight) return;
+
+    int paddedX = pixelX + filterRadius;
+    int paddedY = pixelY + filterRadius;
+    size_t centerIndex = paddedY * inputStep + paddedX * 3;
+
+    // Center pixel BGR values.
+    float centerBlue = static_cast<float>(inputImage[centerIndex + 0]);
+    float centerGreen = static_cast<float>(inputImage[centerIndex + 1]);
+    float centerRed = static_cast<float>(inputImage[centerIndex + 2]);
+
+    float blueSum = 0.0f;
+    float greenSum = 0.0f;
+    float redSum = 0.0f;
+    float weightSum = 0.0f;
+    int spatialIndex = 0;
+
+    for (int offsetY = -filterRadius; offsetY <= filterRadius; ++offsetY) {
+        for (int offsetX = -filterRadius; offsetX <= filterRadius; ++offsetX, ++spatialIndex) {
+            size_t nearIndex = (paddedY + offsetY) * inputStep + (paddedX + offsetX) * 3;
+
+            float nearBlue = static_cast<float>(inputImage[nearIndex + 0]);
+            float nearGreen = static_cast<float>(inputImage[nearIndex + 1]);
+            float nearRed = static_cast<float>(inputImage[nearIndex + 2]);
+
+            // L1 distance in BGR space indexes the precomputed range-weight table.
+            int colorDistance =
+                static_cast<int>(fabsf(nearBlue - centerBlue)) +
+                static_cast<int>(fabsf(nearGreen - centerGreen)) +
+                static_cast<int>(fabsf(nearRed - centerRed));
+
+            if (colorDistance > maxColorDistance) colorDistance = maxColorDistance;
+
+            float finalWeight = spatialLookupWeights[spatialIndex] * colorLookupWeights[colorDistance];
+            blueSum += finalWeight * nearBlue;
+            greenSum += finalWeight * nearGreen;
+            redSum += finalWeight * nearRed;
+            weightSum += finalWeight;
+        }
+    }
+
+    size_t outputIndex = pixelY * outputStep + pixelX * 3;
+    outputImage[outputIndex + 0] = static_cast<uchar>(blueSum / weightSum);
+    outputImage[outputIndex + 1] = static_cast<uchar>(greenSum / weightSum);
+    outputImage[outputIndex + 2] = static_cast<uchar>(redSum / weightSum);
+}
+
+// Build the spatial and range (color) Gaussian weight lookup tables.
+// spatialLookupWeights gets (2r+1)^2 entries in row-major window order;
+// colorLookupWeights gets colorTableSize+1 entries indexed by intensity distance.
+void lookupWeights(
+    std::vector<float>& spatialLookupWeights,
+    std::vector<float>& colorLookupWeights,
+    int filterRadius,
+    float spatialSigma,
+    float colorSigma,
+    int colorTableSize
+) {
+    int windowLength = 2 * filterRadius + 1;
+
+    spatialLookupWeights.resize(windowLength * windowLength);
+    colorLookupWeights.resize(colorTableSize + 1);
+
+    float spatialFactor = -0.5f / (spatialSigma * spatialSigma);
+    float colorCoefficient = -0.5f / (colorSigma * colorSigma);
+    int spatialIndex = 0;
+
+    for (int offsetY = -filterRadius; offsetY <= filterRadius; ++offsetY) {
+        for (int offsetX = -filterRadius; offsetX <= filterRadius; ++offsetX, ++spatialIndex) {
+            float distanceSquare = static_cast<float>(offsetX * offsetX + offsetY * offsetY);
+            spatialLookupWeights[spatialIndex] = expf(distanceSquare * spatialFactor);
+        }
+    }
+
+    for (int colorDistance = 0; colorDistance <= colorTableSize; ++colorDistance) {
+        colorLookupWeights[colorDistance] = expf(colorDistance * colorDistance * colorCoefficient);
+    }
+}
+
+// Mean absolute error between two images, averaged over channels.
+float calculateMeanAbsoluteError(const cv::Mat& image1, const cv::Mat& image2) {
+    cv::Mat diff;
+    cv::absdiff(image1, image2, diff);
+    cv::Scalar meanVal = cv::mean(diff);
+    float mae = 0.0f;
+    for (int c = 0; c < image1.channels(); ++c) {
+        mae += static_cast<float>(meanVal[c]);
+    }
+    return mae / image1.channels();
+}
+
+// Estimate image noise level via the standard deviation of |Laplacian| on grayscale.
+float estimateNoiseLevel(const cv::Mat& image) {
+    cv::Mat gray;
+    if (image.channels() == 3) {
+        cv::cvtColor(image, gray, cv::COLOR_BGR2GRAY);
+    } else {
+        gray = image.clone();
+    }
+
+    cv::Mat laplacian;
+    cv::Laplacian(gray, laplacian, CV_32F);
+    cv::Mat absLaplacian = cv::abs(laplacian);
+
+    cv::Scalar meanVal, stdVal;
+    cv::meanStdDev(absLaplacian, meanVal, stdVal);
+
+    return static_cast<float>(stdVal[0]);
+}
+
+// Adaptive radius selection: grow the filter radius with the estimated noise level.
+int selectAdaptiveRadius(const cv::Mat& image, float baseRadius, float noiseLevel) {
+    if (noiseLevel <= 0) {
+        noiseLevel = estimateNoiseLevel(image);
+    }
+
+    // Heavier noise -> larger radius for stronger smoothing; light noise keeps detail.
+    float noiseFactor = std::min(noiseLevel / 30.0f, 2.0f); // normalized noise factor, capped at 2
+    int adaptiveRadius = static_cast<int>(baseRadius * (1.0f + noiseFactor * 0.5f));
+
+    return std::max(2, std::min(adaptiveRadius, 10));
+}
+
+// Parse the "key = value  # comment" parameter file into params.
+// Returns false (keeping defaults) when the file cannot be opened.
+bool parseParamsFile(const std::string& filename, BilateralParams& params) {
+    std::ifstream file(filename);
+    if (!file.is_open()) {
+        std::cerr << "Warning: Cannot open params file: " << filename << ", using default values." << std::endl;
+        return false;
+    }
+
+    std::string line;
+    while (std::getline(file, line)) {
+        if (line.empty() || line[0] == '#') continue;
+
+        std::istringstream iss(line);
+        std::string key;
+        if (std::getline(iss, key, '=')) {
+            key.erase(0, key.find_first_not_of(" \t"));
+            key.erase(key.find_last_not_of(" \t") + 1);
+
+            std::string value;
+            if (std::getline(iss, value, '#')) {
+                value.erase(0, value.find_first_not_of(" \t"));
+                value.erase(value.find_last_not_of(" \t") + 1);
+            }
+            // Guard: a line with no value would make std::stoi/stof throw.
+            if (value.empty()) continue;
+
+            if (key == "radius") {
+                params.radius = std::stoi(value);
+            } else if (key == "sigma_spatial") {
+                params.sigmaSpatial = std::stof(value);
+            } else if (key == "sigma_color") {
+                params.sigmaColor = std::stof(value);
+            } else if (key == "use_adaptive_radius") {
+                params.useAdaptiveRadius = (value == "true" || value == "1");
+            }
+        }
+    }
+    return true;
+}
+
+// Write an image as a binary raw file: int width, int height, int channels header,
+// followed by row-major 8-bit pixel data.
+bool writeRawImage(const std::string& filename, const cv::Mat& image) {
+    std::ofstream file(filename, std::ios::binary);
+    if (!file.is_open()) {
+        std::cerr << "Error: Cannot write to file: " << filename << std::endl;
+        return false;
+    }
+
+    int width = image.cols;
+    int height = image.rows;
+    int channels = image.channels();
+
+    // Header: width, height, channel count.
+    file.write(reinterpret_cast<const char*>(&width), sizeof(int));
+    file.write(reinterpret_cast<const char*>(&height), sizeof(int));
+    file.write(reinterpret_cast<const char*>(&channels), sizeof(int));
+
+    // Pixel data, one row at a time (Mat rows may be padded, so do not dump .data).
+    for (int y = 0; y < height; ++y) {
+        file.write(reinterpret_cast<const char*>(image.ptr(y)), width * channels);
+    }
+
+    return file.good();
+}
+
+// Read an image back from the binary raw format written by writeRawImage.
+// Returns an empty Mat on failure.
+cv::Mat readRawImage(const std::string& filename) {
+    std::ifstream file(filename, std::ios::binary);
+    if (!file.is_open()) {
+        std::cerr << "Error: Cannot open file: " << filename << std::endl;
+        return cv::Mat();
+    }
+
+    int width, height, channels;
+    file.read(reinterpret_cast<char*>(&width), sizeof(int));
+    file.read(reinterpret_cast<char*>(&height), sizeof(int));
+    file.read(reinterpret_cast<char*>(&channels), sizeof(int));
+
+    // Validate the header before allocating -- a truncated/corrupt file would
+    // otherwise produce a bogus allocation or garbage image.
+    if (!file || width <= 0 || height <= 0 || (channels != 1 && channels != 3)) {
+        std::cerr << "Error: Invalid raw image header in: " << filename << std::endl;
+        return cv::Mat();
+    }
+
+    int type = (channels == 1) ? CV_8UC1 : CV_8UC3;
+    cv::Mat image(height, width, type);
+
+    for (int y = 0; y < height; ++y) {
+        file.read(reinterpret_cast<char*>(image.ptr(y)), width * channels);
+    }
+
+    return image;
+}
+
+// Write the performance log file (parameters, timings, throughput, MAE check).
+void writePerformanceLog(
+    const std::string& filename,
+    const std::string& imagePath,
+    int width,
+    int height,
+    int channels,
+    int radius,
+    float spatialSigma,
+    float colorSigma,
+    double gpuTimeMs,
+    double cpuTimeMs,
+    float mae
+) {
+    std::ofstream file(filename);
+    if (!file.is_open()) {
+        std::cerr << "Error: Cannot write log file: " << filename << std::endl;
+        return;
+    }
+
+    double totalPixels = static_cast<double>(width) * height;
+    double gpuThroughput = totalPixels / (gpuTimeMs / 1000.0) / 1000000.0; // megapixels per second
+    double cpuThroughput = totalPixels / (cpuTimeMs / 1000.0) / 1000000.0;
+    double speedup = cpuTimeMs / gpuTimeMs;
+
+    file << "========== Bilateral Filter Performance Log ==========" << std::endl;
+    file << "Image: " << imagePath << std::endl;
+    file << "Resolution: " << width << "x" << height << std::endl;
+    file << "Channels: " << channels << std::endl;
+    file << "Total Pixels: " << totalPixels << std::endl;
+    file << std::endl;
+    file << "Filter Parameters:" << std::endl;
+    file << " Radius: " << radius << std::endl;
+    file << " Sigma Spatial: " << spatialSigma << std::endl;
+    file << " Sigma Color: " << colorSigma << std::endl;
+    file << std::endl;
+    file << "Performance Metrics:" << std::endl;
+    file << " GPU Time: " << gpuTimeMs << " ms" << std::endl;
+    file << " GPU Throughput: " << gpuThroughput << " MP/s" << std::endl;
+    file << " CPU Time: " << cpuTimeMs << " ms" << std::endl;
+    file << " CPU Throughput: " << cpuThroughput << " MP/s" << std::endl;
+    file << " Speedup (CPU/GPU): " << speedup << "x" << std::endl;
+    file << std::endl;
+    file << "Correctness:" << std::endl;
+    file << " MAE vs OpenCV: " << mae << std::endl;
+    file << " Pass (MAE < 1.0): " << (mae < 1.0 ? "YES" : "NO") << std::endl;
+    file << "======================================================" << std::endl;
+}
+
+// Time the GPU bilateral filter with CUDA events (one warm-up launch, then the
+// average of 10 timed runs). Copies the last result back into resultImage.
+double measureGPUTime(
+    const cv::Mat& inputImage,
+    const cv::Mat& paddedImage,
+    cv::Mat& resultImage,
+    const std::vector<float>& spatialLookupWeights,
+    const std::vector<float>& colorLookupWeights,
+    int filterRadius
+) {
+    uchar* deviceInputImage = nullptr;
+    uchar* deviceOutputImage = nullptr;
+    float* deviceSpatialLookupWeights = nullptr;
+    float* deviceColorLookupWeights = nullptr;
+
+    CUDA_CHECK(cudaMalloc(&deviceInputImage, paddedImage.step * paddedImage.rows));
+    CUDA_CHECK(cudaMalloc(&deviceOutputImage, resultImage.step * resultImage.rows));
+
+    CUDA_CHECK(cudaMalloc(&deviceSpatialLookupWeights, spatialLookupWeights.size() * sizeof(float)));
+    CUDA_CHECK(cudaMalloc(&deviceColorLookupWeights, colorLookupWeights.size() * sizeof(float)));
+
+    CUDA_CHECK(cudaMemcpy(deviceInputImage, paddedImage.data, paddedImage.step * paddedImage.rows, cudaMemcpyHostToDevice));
+
+    CUDA_CHECK(cudaMemcpy(deviceSpatialLookupWeights, spatialLookupWeights.data(), spatialLookupWeights.size() * sizeof(float), cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(deviceColorLookupWeights, colorLookupWeights.data(), colorLookupWeights.size() * sizeof(float), cudaMemcpyHostToDevice));
+
+    dim3 blockSize(blockSizeX, blockSizeY);
+    dim3 blockGroup((inputImage.cols + blockSizeX - 1) / blockSizeX, (inputImage.rows + blockSizeY - 1) / blockSizeY);
+
+    cudaEvent_t start, stop;
+    CUDA_CHECK(cudaEventCreate(&start));
+    CUDA_CHECK(cudaEventCreate(&stop));
+
+    // Warm-up launch (excluded from timing).
+    if (inputImage.channels() == 1) {
+        BilateralFilterGrayKernel<<<blockGroup, blockSize>>>(
+            deviceInputImage, deviceOutputImage, deviceSpatialLookupWeights, deviceColorLookupWeights,
+            inputImage.cols, inputImage.rows, paddedImage.step, resultImage.step, filterRadius);
+    } else {
+        BilateralFilterColorKernel<<<blockGroup, blockSize>>>(
+            deviceInputImage, deviceOutputImage, deviceSpatialLookupWeights, deviceColorLookupWeights,
+            inputImage.cols, inputImage.rows, paddedImage.step, resultImage.step, filterRadius);
+    }
+    // Kernel launches do not return errors directly -- surface launch-config errors here.
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    // Timed runs: average over 10 iterations.
+    const int iterations = 10;
+    CUDA_CHECK(cudaEventRecord(start));
+    for (int i = 0; i < iterations; ++i) {
+        if (inputImage.channels() == 1) {
+            BilateralFilterGrayKernel<<<blockGroup, blockSize>>>(
+                deviceInputImage, deviceOutputImage, deviceSpatialLookupWeights, deviceColorLookupWeights,
+                inputImage.cols, inputImage.rows, paddedImage.step, resultImage.step, filterRadius);
+        } else {
+            BilateralFilterColorKernel<<<blockGroup, blockSize>>>(
+                deviceInputImage, deviceOutputImage, deviceSpatialLookupWeights, deviceColorLookupWeights,
+                inputImage.cols, inputImage.rows, paddedImage.step, resultImage.step, filterRadius);
+        }
+    }
+    CUDA_CHECK(cudaGetLastError());
+
+    CUDA_CHECK(cudaEventRecord(stop));
+    CUDA_CHECK(cudaEventSynchronize(stop));
+
+    float elapsedTime = 0.0f;
+    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
+    double avgTime = elapsedTime / iterations;
+
+    CUDA_CHECK(cudaMemcpy(resultImage.data, deviceOutputImage, resultImage.step * resultImage.rows, cudaMemcpyDeviceToHost));
+
+    CUDA_CHECK(cudaEventDestroy(start));
+    CUDA_CHECK(cudaEventDestroy(stop));
+
+    CUDA_CHECK(cudaFree(deviceInputImage));
+    CUDA_CHECK(cudaFree(deviceOutputImage));
+
+    CUDA_CHECK(cudaFree(deviceSpatialLookupWeights));
+    CUDA_CHECK(cudaFree(deviceColorLookupWeights));
+
+    return avgTime;
+}
+
+// Time OpenCV's CPU bilateralFilter (one warm-up call, then the average of 5 runs).
+// Note cv::bilateralFilter's argument order: diameter, sigmaColor, sigmaSpace.
+double measureCPUTime(const cv::Mat& inputImage, cv::Mat& outputImage,
+                      int diameter, float colorSigma, float spatialSigma) {
+    const int iterations = 5;
+
+    cv::bilateralFilter(inputImage, outputImage, diameter, colorSigma, spatialSigma);
+
+    auto start = std::chrono::high_resolution_clock::now();
+    for (int i = 0; i < iterations; ++i) {
+        cv::bilateralFilter(inputImage, outputImage, diameter, colorSigma, spatialSigma);
+    }
+    auto end = std::chrono::high_resolution_clock::now();
+
+    std::chrono::duration<double, std::milli> elapsed = end - start;
+    return elapsed.count() / iterations;
+}
+
+int main(int argc, char** argv) {
+    std::string imagePath = (argc > 1) ? argv[1] : "test.jpg";
+    std::string paramsFile = (argc > 2) ? argv[2] : "params.txt";
+
+    // Parse the parameter file (falls back to defaults on failure).
+    BilateralParams params;
+    parseParamsFile(paramsFile, params);
+
+    std::cout << "Parameters:" << std::endl;
+    std::cout << " Radius: " << params.radius << std::endl;
+    std::cout << " Sigma Spatial: " << params.sigmaSpatial << std::endl;
+    std::cout << " Sigma Color: " << params.sigmaColor << std::endl;
+    std::cout << " Use Adaptive Radius: " << (params.useAdaptiveRadius ? "true" : "false") << std::endl;
+
+    cv::Mat inputImage = cv::imread(imagePath, cv::IMREAD_UNCHANGED);
+    if (inputImage.empty()) {
+        std::cerr << "Error: Cannot load image: " << imagePath << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    if (inputImage.channels() == 4) {
+        cv::cvtColor(inputImage, inputImage, cv::COLOR_BGRA2BGR);
+    }
+
+    if (inputImage.channels() != 1 && inputImage.channels() != 3) {
+        std::cerr << "Error: Only 1-channel (grayscale) or 3-channel (RGB) images are supported." << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    int filterRadius = params.radius;
+    if (params.useAdaptiveRadius) {
+        filterRadius = selectAdaptiveRadius(inputImage, params.radius, params.noiseLevel);
+        std::cout << "Adaptive radius selected: " << filterRadius << std::endl;
+    }
+
+    int colorTableSize = (inputImage.channels() == 1) ? maxGrayDistance : maxColorDistance;
+
+    std::vector<float> spatialLookupWeights;
+    std::vector<float> colorLookupWeights;
+    lookupWeights(spatialLookupWeights, colorLookupWeights,
+                  filterRadius,
+                  params.sigmaSpatial, params.sigmaColor,
+                  colorTableSize);
+
+    // Pad borders with reflection so the kernel never reads out of bounds.
+    cv::Mat paddedImage;
+    cv::copyMakeBorder(inputImage, paddedImage, filterRadius, filterRadius, filterRadius, filterRadius, cv::BORDER_REFLECT);
+
+    cv::Mat resultImage = cv::Mat::zeros(inputImage.size(), inputImage.type());
+
+    double gpuTimeMs = measureGPUTime(inputImage, paddedImage, resultImage, spatialLookupWeights, colorLookupWeights, filterRadius);
+
+    // CPU OpenCV reference for comparison.
+    cv::Mat referenceImage;
+    double cpuTimeMs = measureCPUTime(inputImage, referenceImage, 2 * filterRadius + 1, params.sigmaColor, params.sigmaSpatial);
+
+    float mae = calculateMeanAbsoluteError(resultImage, referenceImage);
+
+    std::cout << std::endl;
+    std::cout << "========== Results ==========" << std::endl;
+    std::cout << "Image: " << imagePath << " (" << inputImage.cols << "x" << inputImage.rows << ", " << inputImage.channels() << " channels)" << std::endl;
+    std::cout << "Filter Radius: " << filterRadius << std::endl;
+    std::cout << std::endl;
+    std::cout << "Performance:" << std::endl;
+    std::cout << " GPU Time: " << gpuTimeMs << " ms" << std::endl;
+    std::cout << " GPU Throughput: " << (inputImage.cols * inputImage.rows / (gpuTimeMs / 1000.0) / 1000000.0) << " MP/s" << std::endl;
+    std::cout << " CPU Time: " << cpuTimeMs << " ms" << std::endl;
+    std::cout << " CPU Throughput: " << (inputImage.cols * inputImage.rows / (cpuTimeMs / 1000.0) / 1000000.0) << " MP/s" << std::endl;
+    std::cout << " Speedup (CPU/GPU): " << (cpuTimeMs / gpuTimeMs) << "x" << std::endl;
+    std::cout << std::endl;
+    std::cout << "Correctness:" << std::endl;
+    std::cout << " MAE vs OpenCV: " << mae << std::endl;
+    std::cout << " Pass (MAE < 1.0): " << (mae < 1.0 ? "YES" : "NO") << std::endl;
+    std::cout << "=============================" << std::endl;
+
+    std::string outputFilename = "bilateralFilter_result.raw";
+    if (!writeRawImage(outputFilename, resultImage)) {
+        std::cerr << "Warning: Failed to write output raw file." << std::endl;
+    }
+
+    cv::imwrite("bilateralFilter_result.jpg", resultImage);
+
+    writePerformanceLog(
+        "performance_log.txt",
+        imagePath,
+        inputImage.cols,
+        inputImage.rows,
+        inputImage.channels(),
+        filterRadius,
+        params.sigmaSpatial,
+        params.sigmaColor,
+        gpuTimeMs,
+        cpuTimeMs,
+        mae
+    );
+
+    if (mae >= 1.0f) {
+        std::cerr << "Warning: MAE (" << mae << ") >= 1.0, results may not meet accuracy requirements." << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
diff --git "a/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/monarch.jpg" "b/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/monarch.jpg"
new file mode 100644
index 0000000..ee297f7
Binary files /dev/null and "b/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/monarch.jpg" differ
diff --git "a/08_bilateral_filter/\351\241\271\347\233\256\350\246\201\346\261\202.md" "b/08_bilateral_filter/\351\241\271\347\233\256\350\246\201\346\261\202.md"
new file mode 100644
index 0000000..d2e652c
--- /dev/null
+++ "b/08_bilateral_filter/\351\241\271\347\233\256\350\246\201\346\261\202.md"
@@ -0,0 +1,54 @@
+## 八. 
实时图像双边滤波(CUDA) + +### **任务背景** + +凭借你 InfiniTensor 训练营的经历和人工智能与并行计算方面的出色成绩,你受雇于一家领先的虚拟现实公司,负责为下一代 VR 头显开发实时图像增强模块。用户经常抱怨在弱光环境下看到的图像噪点严重,而现有降噪算法要么模糊细节,要么延迟太高导致眩晕。 + +你提出采用双边滤波算法,它能够在去除噪声的同时保持边缘清晰,非常适合VR图像处理。但双边滤波计算量极大,每个像素需要访问整个邻域并计算复杂的权重,难以在移动端实时运行。幸运的是,VR 头显内置了你知道的某单位生产的黑科技 GPU,你需要利用 CUDA 实现一个高度优化的双边滤波核,能够对 4K 分辨率、60 帧/秒的视频流进行实时处理。这将大幅提升用户的沉浸感和舒适度,为公司抢占市场提供核心技术。 + +### **领域知识简介** + +双边滤波是一种非线性的保边滤波器,其输出像素值是邻域内所有像素的加权平均。权重由两部分组成:空间权重(依赖于像素间的空间距离,高斯函数)和颜色权重(依赖于像素值的差异,高斯函数)。因此,在平坦区域颜色相似,权重近似空间高斯,实现平滑;在边缘处颜色差异大,颜色权重小,避免跨边缘平滑。 + +双边滤波的难点在于计算量大:对于半径为 `r`的窗口,每个像素需要计算 `(2r+1)²`个权重,且涉及指数运算。实时性要求(如 4K 60fps)对算法效率提出了极高挑战。 + +### **任务内容** + +开发一个 CUDA 程序,实现每个像素独立计算的并行双边滤波。 + +#### **输入文件定义** + +1. **图像文件** (标准格式,如 PNG、JPG),RGB 三通道或灰度。可使用 OpenCV 或 stb_image 读取,但本项目只关注 GPU 部分,因此可提供已读取的像素数组。为简化,输入可为二进制 raw 格式:宽、高、通道数,然后像素数据按行主序存储(RGBRGB...或灰度)。 +2. **参数文件** (文本格式) + +```Plain +radius = 5 # 窗口半径,实际窗口大小为 (2*radius+1)^2 +sigma_spatial = 3.0 # 空间域标准差 +sigma_color = 30.0 # 颜色域标准差(对于8位图像,范围0-255) +``` + +#### 输出文件定义 + +* **滤波后的图像** :与输入同格式的二进制 raw 文件。 +* **性能日志** :处理时间(ms)、吞吐量(百万像素/秒)、加速比(对比 CPU OpenCV 实现)。 + +### 要求 + +1. **图像格式** :输入图像支持灰度图或 RGB 彩色图,像素值范围为 0-255(整数),处理时转换为浮点。输出为同格式。 +2. **滤波器参数** :窗口半径 `r`至少支持到 5(即 `11×11` 窗口),空间标准差 `σs`(sigma_spatial) 和颜色标准差 `σc`(sigma_color) 可配置。 +3. **正确性验证** :输出图像应与 OpenCV 的 `bilateralFilter` 结果对比,计算平均绝对误差(MAE)或峰值信噪比(PSNR)。平均绝对误差小于 1(对于 8 位图像)。 +4. **进阶优化** :实现自适应半径选择。 +5. **平台适配** :默认需在英伟达平台上支持。每在一款国产平台上支持,额外加 20% 乘算系数。 + + **需提交内容** : + +* 整个完成上述功能的程序(包含测试) + * 提交地址:[Learning-CUDA 2025-winter-project 分支](https://github.com/InfiniTensor/Learning-CUDA/tree/2025-winter-project) + * 提交要求:参照项目代码要求中的第四条 +* 总结报告。报告需包括: + * 详细阐述实现思路以及优化方法,欢迎包含自己的优化历程和记录,开发中发现的问题等(取决于质量可以加分) + * 最终的性能指标和分析 + * 未来可继续提升的地方(可选) + * 包含 ncu 和/或 nsys 使用和分析的加分 + +---