diff --git a/08_bilateral_filter/README.md b/08_bilateral_filter/README.md index c4034f0..cdc12d5 100644 --- a/08_bilateral_filter/README.md +++ b/08_bilateral_filter/README.md @@ -1,16 +1,68 @@ -## 简介 +# 实时双边滤波-CUDA -凭借你 InfiniTensor 训练营的经历和人工智能与并行计算方面的出色成绩,你受雇于一家领先的虚拟现实公司,负责为下一代 VR 头显开发实时图像增强模块。用户经常抱怨在弱光环境下看到的图像噪点严重,而现有降噪算法要么模糊细节,要么延迟太高导致眩晕。 +## 项目结构 -你提出采用双边滤波算法,它能够在去除噪声的同时保持边缘清晰,非常适合VR图像处理。但双边滤波计算量极大,每个像素需要访问整个邻域并计算复杂的权重,难以在移动端实时运行。幸运的是,VR 头显内置了你知道的某单位生产的黑科技 GPU,你需要利用 CUDA 实现一个高度优化的双边滤波核,能够对 4K 分辨率、60 帧/秒的视频流进行实时处理。这将大幅提升用户的沉浸感和舒适度,为公司抢占市场提供核心技术。 +当前项目结构如下: -## 任务内容 -开发一个 CUDA 程序,实现并行双边滤波。 +```txt +实时双边滤波-CUDA/ +├─ 朴素实现代码及结果/ +├─ 实时双边滤波-CUDA论文.pdf +├─ 项目要求.md +├─ baboon.bmp +├─ girl.bmp +├─ main.cu +├─ monarch.bmp +├─ params.txt +└─ README.md +``` -## 📬 有疑问? +其中: -更多详细信息和要求请参考本季度项目文档。 +- `main.cu`:共享内存 + 网格跨步循环优化后代码 +- `params.txt`:双边滤波参数文件 +- `baboon.bmp`、`girl.bmp`、`monarch.bmp`:测试图像 +- `实时双边滤波-CUDA论文.pdf`:最终提交的总结报告 PDF +- `项目要求.md`:课程项目要求 +- `朴素实现代码及结果/`:朴素版本代码及阶段性结果记录 -可以在项目群里直接询问导师和助教! +## 英伟达平台编译与运行 -Good luck and happy coding! 
🚀 +在已经安装 CUDA 和 OpenCV 的英伟达平台上,可以直接使用下面的命令编译并运行: + +```bash +nvcc -std=c++11 main.cu -o bilateral_filter \ +-I/usr/local/include/opencv4 \ +-L/usr/local/lib \ +-lopencv_core \ +-lopencv_imgproc \ +-lopencv_imgcodecs + +./bilateral_filter <测试图像名称> params.txt +``` + +例如: + +```bash +./bilateral_filter monarch.bmp params.txt +``` + +## 参数文件格式 + +`params.txt` 示例: + +```txt +radius = 5 +sigma_spatial = 3.0 +sigma_color = 30.0 +use_adaptive_radius = false +``` + +## 输出内容 + +程序运行后会输出: + +- 滤波后的 raw 文件 +- 滤波后的图像文件 +- 性能日志 +- 终端中的 GPU/CPU 时间、吞吐量、加速比和 MAE diff --git a/08_bilateral_filter/baboon.bmp b/08_bilateral_filter/baboon.bmp new file mode 100644 index 0000000..2aef6c9 Binary files /dev/null and b/08_bilateral_filter/baboon.bmp differ diff --git a/08_bilateral_filter/girl.bmp b/08_bilateral_filter/girl.bmp new file mode 100644 index 0000000..42d5c98 Binary files /dev/null and b/08_bilateral_filter/girl.bmp differ diff --git a/08_bilateral_filter/main.cu b/08_bilateral_filter/main.cu new file mode 100644 index 0000000..ea8806b --- /dev/null +++ b/08_bilateral_filter/main.cu @@ -0,0 +1,607 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// 错误检查宏 +#define CUDA_CHECK(call) do { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) { \ + fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +constexpr int blockSizeX = 32; +constexpr int blockSizeY = 8; +constexpr int maxGrayDistance = 255; +constexpr int maxColorDistance = 255 * 3; + +struct BilateralParams { + int radius = 5; + float sigmaSpatial = 3.0f; + float sigmaColor = 30.0f; + bool useAdaptiveRadius = false; + float noiseLevel = 0.0f; +}; + +// 灰度图双边滤波核函数 +__global__ void BilateralFilterGrayKernel( + const uchar* inputImage, + uchar* outputImage, + const float* spatialLookupWeights, + const float* colorLookupWeights, + int imageWidth, + int imageHeight, 
+ size_t inputStep, + size_t outputStep, + int filterRadius +) { + extern __shared__ uchar sharedTile[]; + + int tileWidth = blockDim.x + 2 * filterRadius; + int tileHeight = blockDim.y + 2 * filterRadius; + int paddedWidth = imageWidth + 2 * filterRadius; + int paddedHeight = imageHeight + 2 * filterRadius; + + for (int tileBlockY = blockIdx.y; tileBlockY * blockDim.y < imageHeight; tileBlockY += gridDim.y) { + for (int tileBlockX = blockIdx.x; tileBlockX * blockDim.x < imageWidth; tileBlockX += gridDim.x) { + int tileStartX = tileBlockX * blockDim.x; + int tileStartY = tileBlockY * blockDim.y; + + for (int localY = threadIdx.y; localY < tileHeight; localY += blockDim.y) { + for (int localX = threadIdx.x; localX < tileWidth; localX += blockDim.x) { + int globalX = tileStartX + localX; + int globalY = tileStartY + localY; + + if (globalX > paddedWidth - 1) globalX = paddedWidth - 1; + if (globalY > paddedHeight - 1) globalY = paddedHeight - 1; + + sharedTile[localY * tileWidth + localX] = inputImage[globalY * inputStep + globalX]; + } + } + __syncthreads(); + + int pixelX = tileStartX + threadIdx.x; + int pixelY = tileStartY + threadIdx.y; + + if (pixelX < imageWidth && pixelY < imageHeight) { + int centerLocalX = threadIdx.x + filterRadius; + int centerLocalY = threadIdx.y + filterRadius; + + float centerValue = static_cast(sharedTile[centerLocalY * tileWidth + centerLocalX]); + + float weightValueSum = 0.0f; + float weightSum = 0.0f; + int spatialIndex = 0; + + for (int offsetY = -filterRadius; offsetY <= filterRadius; ++offsetY) { + for (int offsetX = -filterRadius; offsetX <= filterRadius; ++offsetX, ++spatialIndex) { + float nearValue = static_cast(sharedTile[(centerLocalY + offsetY) * tileWidth + (centerLocalX + offsetX)]); + int colorDistance = static_cast(fabsf(nearValue - centerValue)); + + if (colorDistance > maxGrayDistance) colorDistance = maxGrayDistance; + + float finalWeight = spatialLookupWeights[spatialIndex] * colorLookupWeights[colorDistance]; + 
weightValueSum += finalWeight * nearValue; + weightSum += finalWeight; + } + } + + outputImage[pixelY * outputStep + pixelX] = static_cast(weightValueSum / weightSum); + } + __syncthreads(); + } + } +} + +// 彩色图双边滤波核函数 +__global__ void BilateralFilterColorKernel( + const uchar* inputImage, + uchar* outputImage, + const float* spatialLookupWeights, + const float* colorLookupWeights, + int imageWidth, + int imageHeight, + size_t inputStep, + size_t outputStep, + int filterRadius +) { + extern __shared__ uchar sharedTile[]; + + int tileWidth = blockDim.x + 2 * filterRadius; + int tileHeight = blockDim.y + 2 * filterRadius; + int paddedWidth = imageWidth + 2 * filterRadius; + int paddedHeight = imageHeight + 2 * filterRadius; + + // 网格跨步循环 + for (int tileBlockY = blockIdx.y; tileBlockY * blockDim.y < imageHeight; tileBlockY += gridDim.y) { + for (int tileBlockX = blockIdx.x; tileBlockX * blockDim.x < imageWidth; tileBlockX += gridDim.x) { + int tileStartX = tileBlockX * blockDim.x; + int tileStartY = tileBlockY * blockDim.y; + + for (int localY = threadIdx.y; localY < tileHeight; localY += blockDim.y) { + for (int localX = threadIdx.x; localX < tileWidth; localX += blockDim.x) { + int globalX = tileStartX + localX; + int globalY = tileStartY + localY; + + if (globalX > paddedWidth - 1) globalX = paddedWidth - 1; + if (globalY > paddedHeight - 1) globalY = paddedHeight - 1; + + size_t globalIndex = globalY * inputStep + globalX * 3; + size_t sharedIndex = (localY * tileWidth + localX) * 3; + + sharedTile[sharedIndex + 0] = inputImage[globalIndex + 0]; + sharedTile[sharedIndex + 1] = inputImage[globalIndex + 1]; + sharedTile[sharedIndex + 2] = inputImage[globalIndex + 2]; + } + } + __syncthreads(); + + int pixelX = tileStartX + threadIdx.x; + int pixelY = tileStartY + threadIdx.y; + + if (pixelX < imageWidth && pixelY < imageHeight) { + int centerLocalX = threadIdx.x + filterRadius; + int centerLocalY = threadIdx.y + filterRadius; + + size_t centerIndex = (centerLocalY * 
tileWidth + centerLocalX) * 3; + + float centerBlue = static_cast(sharedTile[centerIndex + 0]); + float centerGreen = static_cast(sharedTile[centerIndex + 1]); + float centerRed = static_cast(sharedTile[centerIndex + 2]); + + float blueSum = 0.0f; + float greenSum = 0.0f; + float redSum = 0.0f; + float weightSum = 0.0f; + int spatialIndex = 0; + + for (int offsetY = -filterRadius; offsetY <= filterRadius; ++offsetY) { + for (int offsetX = -filterRadius; offsetX <= filterRadius; ++offsetX, ++spatialIndex) { + size_t nearIndex = ((centerLocalY + offsetY) * tileWidth + (centerLocalX + offsetX)) * 3; + + float nearBlue = static_cast(sharedTile[nearIndex + 0]); + float nearGreen = static_cast(sharedTile[nearIndex + 1]); + float nearRed = static_cast(sharedTile[nearIndex + 2]); + + int colorDistance = + static_cast(fabsf(nearBlue - centerBlue)) + + static_cast(fabsf(nearGreen - centerGreen)) + + static_cast(fabsf(nearRed - centerRed)); + + if (colorDistance > maxColorDistance) colorDistance = maxColorDistance; + float finalWeight = spatialLookupWeights[spatialIndex] * colorLookupWeights[colorDistance]; + + blueSum += finalWeight * nearBlue; + greenSum += finalWeight * nearGreen; + redSum += finalWeight * nearRed; + weightSum += finalWeight; + } + } + + size_t outputIndex = pixelY * outputStep + pixelX * 3; + + outputImage[outputIndex + 0] = static_cast(blueSum / weightSum); + outputImage[outputIndex + 1] = static_cast(greenSum / weightSum); + outputImage[outputIndex + 2] = static_cast(redSum / weightSum); + } + __syncthreads(); + } + } +} + +// 创建空间权重和颜色权重查找表 +void lookupWeights( + std::vector& spatialLookupWeights, + std::vector& colorLookupWeights, + int filterRadius, + float spatialSigma, + float colorSigma, + int colorTableSize +) { + int windowLength = 2 * filterRadius + 1; + + spatialLookupWeights.resize(windowLength * windowLength); + colorLookupWeights.resize(colorTableSize + 1); + + float spatialFactor = -0.5f / (spatialSigma * spatialSigma); + float 
colorCoefficient = -0.5f / (colorSigma * colorSigma); + int spatialIndex = 0; + + for (int offsetY = -filterRadius; offsetY <= filterRadius; ++offsetY) { + for (int offsetX = -filterRadius; offsetX <= filterRadius; ++offsetX, ++spatialIndex) { + float distanceSquare = static_cast(offsetX * offsetX + offsetY * offsetY); + spatialLookupWeights[spatialIndex] = expf(distanceSquare * spatialFactor); + } + } + + for (int colorDistance = 0; colorDistance <= colorTableSize; ++colorDistance) { + colorLookupWeights[colorDistance] = expf(colorDistance * colorDistance * colorCoefficient); + } +} + +float calculateMeanAbsoluteError(const cv::Mat& image1, const cv::Mat& image2) { + cv::Mat diff; + cv::absdiff(image1, image2, diff); + cv::Scalar meanVal = cv::mean(diff); + float mae = 0.0f; + for (int c = 0; c < image1.channels(); ++c) { + mae += static_cast(meanVal[c]); + } + return mae / image1.channels(); +} + +float estimateNoiseLevel(const cv::Mat& image) { + cv::Mat gray; + if (image.channels() == 3) { + cv::cvtColor(image, gray, cv::COLOR_BGR2GRAY); + } else { + gray = image.clone(); + } + + cv::Mat laplacian; + cv::Laplacian(gray, laplacian, CV_32F); + cv::Mat absLaplacian = cv::abs(laplacian); + + cv::Scalar meanVal, stdVal; + cv::meanStdDev(absLaplacian, meanVal, stdVal); + return static_cast(stdVal[0]); +} + +int selectAdaptiveRadius(const cv::Mat& image, float baseRadius, float noiseLevel) { + if (noiseLevel <= 0) { + noiseLevel = estimateNoiseLevel(image); + } + + float noiseFactor = std::min(noiseLevel / 30.0f, 2.0f); + int adaptiveRadius = static_cast(baseRadius * (1.0f + noiseFactor * 0.5f)); + return std::max(2, std::min(adaptiveRadius, 10)); +} + +bool parseParamsFile(const std::string& filename, BilateralParams& params) { + std::ifstream file(filename); + if (!file.is_open()) { + std::cerr << "Warning: Cannot open params file: " << filename << ", using default values." 
<< std::endl; + return false; + } + + std::string line; + while (std::getline(file, line)) { + if (line.empty() || line[0] == '#') continue; + + std::istringstream iss(line); + std::string key; + if (std::getline(iss, key, '=')) { + key.erase(0, key.find_first_not_of(" \t")); + key.erase(key.find_last_not_of(" \t") + 1); + + std::string value; + if (std::getline(iss, value, '#')) { + value.erase(0, value.find_first_not_of(" \t")); + value.erase(value.find_last_not_of(" \t") + 1); + } + + if (key == "radius") { + params.radius = std::stoi(value); + } else if (key == "sigma_spatial") { + params.sigmaSpatial = std::stof(value); + } else if (key == "sigma_color") { + params.sigmaColor = std::stof(value); + } else if (key == "use_adaptive_radius") { + params.useAdaptiveRadius = (value == "true" || value == "1"); + } + } + } + return true; +} + +bool writeRawImage(const std::string& filename, const cv::Mat& image) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Error: Cannot write to file: " << filename << std::endl; + return false; + } + + int width = image.cols; + int height = image.rows; + int channels = image.channels(); + + file.write(reinterpret_cast(&width), sizeof(int)); + file.write(reinterpret_cast(&height), sizeof(int)); + file.write(reinterpret_cast(&channels), sizeof(int)); + + for (int y = 0; y < height; ++y) { + file.write(reinterpret_cast(image.ptr(y)), width * channels); + } + + return file.good(); +} + +cv::Mat readRawImage(const std::string& filename) { + std::ifstream file(filename, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Error: Cannot open file: " << filename << std::endl; + return cv::Mat(); + } + + int width, height, channels; + file.read(reinterpret_cast(&width), sizeof(int)); + file.read(reinterpret_cast(&height), sizeof(int)); + file.read(reinterpret_cast(&channels), sizeof(int)); + + int type = (channels == 1) ? 
CV_8UC1 : CV_8UC3; + cv::Mat image(height, width, type); + + for (int y = 0; y < height; ++y) { + file.read(reinterpret_cast(image.ptr(y)), width * channels); + } + + return image; +} + +void writePerformanceLog( + const std::string& filename, + const std::string& imagePath, + int width, + int height, + int channels, + int radius, + float spatialSigma, + float colorSigma, + double gpuTimeMs, + double cpuTimeMs, + float mae +) { + std::ofstream file(filename); + if (!file.is_open()) { + std::cerr << "Error: Cannot write log file: " << filename << std::endl; + return; + } + + double totalPixels = static_cast(width) * height; + double gpuThroughput = totalPixels / (gpuTimeMs / 1000.0) / 1000000.0; + double cpuThroughput = totalPixels / (cpuTimeMs / 1000.0) / 1000000.0; + double speedup = cpuTimeMs / gpuTimeMs; + + file << "========== Bilateral Filter Performance Log ==========" << std::endl; + file << "Image: " << imagePath << std::endl; + file << "Resolution: " << width << "x" << height << std::endl; + file << "Channels: " << channels << std::endl; + file << "Total Pixels: " << totalPixels << std::endl; + file << std::endl; + file << "Filter Parameters:" << std::endl; + file << " Radius: " << radius << std::endl; + file << " Sigma Spatial: " << spatialSigma << std::endl; + file << " Sigma Color: " << colorSigma << std::endl; + file << std::endl; + file << "Performance Metrics:" << std::endl; + file << " GPU Time: " << gpuTimeMs << " ms" << std::endl; + file << " GPU Throughput: " << gpuThroughput << " MP/s" << std::endl; + file << " CPU Time: " << cpuTimeMs << " ms" << std::endl; + file << " CPU Throughput: " << cpuThroughput << " MP/s" << std::endl; + file << " Speedup (CPU/GPU): " << speedup << "x" << std::endl; + file << std::endl; + file << "Correctness:" << std::endl; + file << " MAE vs OpenCV: " << mae << std::endl; + file << " Pass (MAE < 1.0): " << (mae < 1.0 ? 
"YES" : "NO") << std::endl; + file << "======================================================" << std::endl; +} + +double measureGPUTime( + const cv::Mat& inputImage, + const cv::Mat& paddedImage, + cv::Mat& resultImage, + const std::vector& spatialLookupWeights, + const std::vector& colorLookupWeights, + int filterRadius +) { + uchar* deviceInputImage = nullptr; + uchar* deviceOutputImage = nullptr; + float* deviceSpatialLookupWeights = nullptr; + float* deviceColorLookupWeights = nullptr; + + CUDA_CHECK(cudaMalloc(&deviceInputImage, paddedImage.step * paddedImage.rows)); + CUDA_CHECK(cudaMalloc(&deviceOutputImage, resultImage.step * resultImage.rows)); + CUDA_CHECK(cudaMalloc(&deviceSpatialLookupWeights, spatialLookupWeights.size() * sizeof(float))); + CUDA_CHECK(cudaMalloc(&deviceColorLookupWeights, colorLookupWeights.size() * sizeof(float))); + + CUDA_CHECK(cudaMemcpy(deviceInputImage, paddedImage.data, paddedImage.step * paddedImage.rows, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(deviceSpatialLookupWeights, spatialLookupWeights.data(), spatialLookupWeights.size() * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(deviceColorLookupWeights, colorLookupWeights.data(), colorLookupWeights.size() * sizeof(float), cudaMemcpyHostToDevice)); + + dim3 blockSize(blockSizeX, blockSizeY); + + //如果 block 数量直接铺满整张图,网格跨步循环往往只会执行一轮;适当限制 grid 后,一个 block 才会继续跨步处理后续 tile。 + cudaDeviceProp deviceInfo{}; + CUDA_CHECK(cudaGetDeviceProperties(&deviceInfo, 0)); + int neededGridX = (inputImage.cols + blockSizeX - 1) / blockSizeX; + int neededGridY = (inputImage.rows + blockSizeY - 1) / blockSizeY; + int launchGridX = std::max(1, std::min(neededGridX, deviceInfo.multiProcessorCount / 4)); + int launchGridY = std::max(1, std::min(neededGridY, 4)); + dim3 blockGroup(launchGridX, launchGridY); + + cudaEvent_t start, stop; + CUDA_CHECK(cudaEventCreate(&start)); + CUDA_CHECK(cudaEventCreate(&stop)); + + //共享内存 tile 的大小取决于线程块大小、滤波半径和图像通道数 + if (inputImage.channels() == 
1) { + size_t sharedMemoryBytes = + static_cast(blockSize.x + 2 * filterRadius) * + static_cast(blockSize.y + 2 * filterRadius) * + sizeof(uchar); + + BilateralFilterGrayKernel<<>>( + deviceInputImage, deviceOutputImage, deviceSpatialLookupWeights, deviceColorLookupWeights, + inputImage.cols, inputImage.rows, paddedImage.step, resultImage.step, filterRadius); + } else { + size_t sharedMemoryBytes = + static_cast(blockSize.x + 2 * filterRadius) * + static_cast(blockSize.y + 2 * filterRadius) * + 3 * sizeof(uchar); + + BilateralFilterColorKernel<<>>( + deviceInputImage, deviceOutputImage, deviceSpatialLookupWeights, deviceColorLookupWeights, + inputImage.cols, inputImage.rows, paddedImage.step, resultImage.step, filterRadius); + } + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); + + const int iterations = 10; + CUDA_CHECK(cudaEventRecord(start)); + for (int i = 0; i < iterations; ++i) { + if (inputImage.channels() == 1) { + size_t sharedMemoryBytes = + static_cast(blockSize.x + 2 * filterRadius) * + static_cast(blockSize.y + 2 * filterRadius) * + sizeof(uchar); + BilateralFilterGrayKernel<<>>( + deviceInputImage, deviceOutputImage, deviceSpatialLookupWeights, deviceColorLookupWeights, + inputImage.cols, inputImage.rows, paddedImage.step, resultImage.step, filterRadius); + } else { + size_t sharedMemoryBytes = + static_cast(blockSize.x + 2 * filterRadius) * + static_cast(blockSize.y + 2 * filterRadius) * + 3 * sizeof(uchar); + BilateralFilterColorKernel<<>>( + deviceInputImage, deviceOutputImage, deviceSpatialLookupWeights, deviceColorLookupWeights, + inputImage.cols, inputImage.rows, paddedImage.step, resultImage.step, filterRadius); + } + } + CUDA_CHECK(cudaEventRecord(stop)); + CUDA_CHECK(cudaEventSynchronize(stop)); + + float elapsedTime = 0.0f; + CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop)); + double avgTime = elapsedTime / iterations; + + CUDA_CHECK(cudaMemcpy(resultImage.data, deviceOutputImage, resultImage.step * 
resultImage.rows, cudaMemcpyDeviceToHost)); + + CUDA_CHECK(cudaEventDestroy(start)); + CUDA_CHECK(cudaEventDestroy(stop)); + CUDA_CHECK(cudaFree(deviceInputImage)); + CUDA_CHECK(cudaFree(deviceOutputImage)); + CUDA_CHECK(cudaFree(deviceSpatialLookupWeights)); + CUDA_CHECK(cudaFree(deviceColorLookupWeights)); + + return avgTime; +} + +double measureCPUTime(const cv::Mat& inputImage, cv::Mat& outputImage, int diameter, float colorSigma, float spatialSigma) { + const int iterations = 5; + cv::bilateralFilter(inputImage, outputImage, diameter, colorSigma, spatialSigma); + + auto start = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < iterations; ++i) { + cv::bilateralFilter(inputImage, outputImage, diameter, colorSigma, spatialSigma); + } + auto end = std::chrono::high_resolution_clock::now(); + + std::chrono::duration elapsed = end - start; + return elapsed.count() / iterations; +} + +int main(int argc, char** argv) { + std::string imagePath = (argc > 1) ? argv[1] : "test.jpg"; + std::string paramsFile = (argc > 2) ? argv[2] : "params.txt"; + + BilateralParams params; + parseParamsFile(paramsFile, params); + + std::cout << "Parameters:" << std::endl; + std::cout << " Radius: " << params.radius << std::endl; + std::cout << " Sigma Spatial: " << params.sigmaSpatial << std::endl; + std::cout << " Sigma Color: " << params.sigmaColor << std::endl; + std::cout << " Use Adaptive Radius: " << (params.useAdaptiveRadius ? "true" : "false") << std::endl; + + cv::Mat inputImage = cv::imread(imagePath, cv::IMREAD_UNCHANGED); + if (inputImage.empty()) { + std::cerr << "Error: Cannot load image: " << imagePath << std::endl; + return EXIT_FAILURE; + } + + if (inputImage.channels() == 4) { + cv::cvtColor(inputImage, inputImage, cv::COLOR_BGRA2BGR); + } + + if (inputImage.channels() != 1 && inputImage.channels() != 3) { + std::cerr << "Error: Only 1-channel (grayscale) or 3-channel (RGB) images are supported." 
<< std::endl; + return EXIT_FAILURE; + } + + int filterRadius = params.radius; + if (params.useAdaptiveRadius) { + filterRadius = selectAdaptiveRadius(inputImage, params.radius, params.noiseLevel); + std::cout << "Adaptive radius selected: " << filterRadius << std::endl; + } + + int colorTableSize = (inputImage.channels() == 1) ? maxGrayDistance : maxColorDistance; + std::vector spatialLookupWeights; + std::vector colorLookupWeights; + lookupWeights(spatialLookupWeights, colorLookupWeights, filterRadius, params.sigmaSpatial, params.sigmaColor, colorTableSize); + + cv::Mat paddedImage; + cv::copyMakeBorder(inputImage, paddedImage, filterRadius, filterRadius, filterRadius, filterRadius, cv::BORDER_REFLECT); + cv::Mat resultImage = cv::Mat::zeros(inputImage.size(), inputImage.type()); + + double gpuTimeMs = measureGPUTime(inputImage, paddedImage, resultImage, spatialLookupWeights, colorLookupWeights, filterRadius); + + cv::Mat referenceImage; + double cpuTimeMs = measureCPUTime(inputImage, referenceImage, 2 * filterRadius + 1, params.sigmaColor, params.sigmaSpatial); + + float mae = calculateMeanAbsoluteError(resultImage, referenceImage); + + std::cout << std::endl; + std::cout << "========== Results ==========" << std::endl; + std::cout << "Image: " << imagePath << " (" << inputImage.cols << "x" << inputImage.rows << ", " << inputImage.channels() << " channels)" << std::endl; + std::cout << "Filter Radius: " << filterRadius << std::endl; + std::cout << std::endl; + std::cout << "Performance:" << std::endl; + std::cout << " GPU Time: " << gpuTimeMs << " ms" << std::endl; + std::cout << " GPU Throughput: " << (inputImage.cols * inputImage.rows / (gpuTimeMs / 1000.0) / 1000000.0) << " MP/s" << std::endl; + std::cout << " CPU Time: " << cpuTimeMs << " ms" << std::endl; + std::cout << " CPU Throughput: " << (inputImage.cols * inputImage.rows / (cpuTimeMs / 1000.0) / 1000000.0) << " MP/s" << std::endl; + std::cout << " Speedup (CPU/GPU): " << (cpuTimeMs / gpuTimeMs) << "x" 
<< std::endl; + std::cout << std::endl; + std::cout << "Correctness:" << std::endl; + std::cout << " MAE vs OpenCV: " << mae << std::endl; + std::cout << " Pass (MAE < 1.0): " << (mae < 1.0 ? "YES" : "NO") << std::endl; + std::cout << "=============================" << std::endl; + + std::string outputFilename = "bilateralFilter_result.raw"; + if (!writeRawImage(outputFilename, resultImage)) { + std::cerr << "Warning: Failed to write output raw file." << std::endl; + } + + cv::imwrite("bilateralFilter_result.jpg", resultImage); + + writePerformanceLog( + "performance_log.txt", + imagePath, + inputImage.cols, + inputImage.rows, + inputImage.channels(), + filterRadius, + params.sigmaSpatial, + params.sigmaColor, + gpuTimeMs, + cpuTimeMs, + mae + ); + + if (mae >= 1.0f) { + std::cerr << "Warning: MAE (" << mae << ") >= 1.0, results may not meet accuracy requirements." << std::endl; + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} diff --git a/08_bilateral_filter/monarch.bmp b/08_bilateral_filter/monarch.bmp new file mode 100644 index 0000000..85b01ef Binary files /dev/null and b/08_bilateral_filter/monarch.bmp differ diff --git a/08_bilateral_filter/params.txt b/08_bilateral_filter/params.txt new file mode 100644 index 0000000..7f62ba4 --- /dev/null +++ b/08_bilateral_filter/params.txt @@ -0,0 +1,14 @@ +# 双边滤波参数配置文件 + +# 窗口半径,实际窗口大小为 (2*radius+1)^2 +radius = 5 + +# 空间域标准差 +sigma_spatial = 3.0 + +# 颜色域标准差(对于8位图像,范围0-255) +sigma_color = 30.0 + +# 是否启用自适应半径选择:根据图像噪声自动调整滤波半径 +# 可选值: true / false +use_adaptive_radius = false diff --git "a/08_bilateral_filter/\345\256\236\346\227\266\345\217\214\350\276\271\346\273\244\346\263\242-CUDA\350\256\272\346\226\207.pdf" "b/08_bilateral_filter/\345\256\236\346\227\266\345\217\214\350\276\271\346\273\244\346\263\242-CUDA\350\256\272\346\226\207.pdf" new file mode 100644 index 0000000..eabcb35 Binary files /dev/null and 
"b/08_bilateral_filter/\345\256\236\346\227\266\345\217\214\350\276\271\346\273\244\346\263\242-CUDA\350\256\272\346\226\207.pdf" differ diff --git "a/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/baboon.jpg" "b/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/baboon.jpg" new file mode 100644 index 0000000..337312a Binary files /dev/null and "b/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/baboon.jpg" differ diff --git "a/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/girl.jpg" "b/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/girl.jpg" new file mode 100644 index 0000000..686edaa Binary files /dev/null and "b/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/girl.jpg" differ diff --git "a/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/main.cu" "b/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/main.cu" new file mode 100644 index 0000000..9175249 --- /dev/null +++ "b/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/main.cu" @@ -0,0 +1,548 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// 错误检查宏 +#define CUDA_CHECK(call) do { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) { \ + fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + 
exit(EXIT_FAILURE); \ + } \ +} while (0) + +constexpr int blockSizeX = 32; +constexpr int blockSizeY = 8; +constexpr int maxGrayDistance = 255; +constexpr int maxColorDistance = 255 * 3; + +// 参数结构体 +struct BilateralParams { + int radius = 5; + float sigmaSpatial = 3.0f; + float sigmaColor = 30.0f; + bool useAdaptiveRadius = false; + float noiseLevel = 0.0f; // 用于自适应半径估计的噪声水平 +}; + +// 灰度图双边滤波核函数 +__global__ void BilateralFilterGrayKernel( + const uchar* inputImage, + uchar* outputImage, + const float* spatialLookupWeights, + const float* colorLookupWeights, + int imageWidth, + int imageHeight, + size_t inputStep, + size_t outputStep, + int filterRadius +) { + int pixelX = blockIdx.x * blockDim.x + threadIdx.x; + int pixelY = blockIdx.y * blockDim.y + threadIdx.y; + + if (pixelX >= imageWidth || pixelY >= imageHeight) return; + + int paddedX = pixelX + filterRadius; + int paddedY = pixelY + filterRadius; + + // 计算中心像素在输入图像中的内存地址 + float centerValue = static_cast(inputImage[paddedY * inputStep + paddedX]); + + float weightValueSum = 0.0f; + float weightSum = 0.0f; + int spatialIndex = 0; + + for (int offsetY = -filterRadius; offsetY <= filterRadius; ++offsetY) { + for (int offsetX = -filterRadius; offsetX <= filterRadius; ++offsetX, ++spatialIndex) { + float nearValue = static_cast(inputImage[(paddedY + offsetY) * inputStep + (paddedX + offsetX)]); + int colorDistance = static_cast(fabsf(nearValue - centerValue)); + + if (colorDistance > maxGrayDistance) colorDistance = maxGrayDistance; + + float finalWeight = spatialLookupWeights[spatialIndex] * colorLookupWeights[colorDistance]; + weightValueSum += finalWeight * nearValue; + weightSum += finalWeight; + } + } + + outputImage[pixelY * outputStep + paddedX] = static_cast(weightValueSum / weightSum); +} + +// 彩色图双边滤波核函数 +__global__ void BilateralFilterColorKernel( + const uchar* inputImage, + uchar* outputImage, + const float* spatialLookupWeights, + const float* colorLookupWeights, + int imageWidth, + int 
imageHeight,
+    size_t inputStep,
+    size_t outputStep,
+    int filterRadius
+) {
+    // One thread per output pixel; expects a 2D launch covering imageWidth x imageHeight.
+    int pixelX = blockIdx.x * blockDim.x + threadIdx.x;
+    int pixelY = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (pixelX >= imageWidth || pixelY >= imageHeight) return;
+
+    // Coordinates inside the border-padded input image.
+    int paddedX = pixelX + filterRadius;
+    int paddedY = pixelY + filterRadius;
+
+    // NOTE(review): the archived patch was garbled here (angle-bracket spans stripped,
+    // and the grayscale body fused with the color kernel). The 1-channel body below is
+    // reconstructed to mirror the color kernel -- verify against the original main.cu.
+    float centerValue = static_cast<float>(inputImage[paddedY * inputStep + paddedX]);
+
+    float valueSum = 0.0f;
+    float weightSum = 0.0f;
+    int spatialIndex = 0;
+
+    for (int offsetY = -filterRadius; offsetY <= filterRadius; ++offsetY) {
+        for (int offsetX = -filterRadius; offsetX <= filterRadius; ++offsetX, ++spatialIndex) {
+            float nearValue = static_cast<float>(inputImage[(paddedY + offsetY) * inputStep + (paddedX + offsetX)]);
+
+            // Intensity distance indexes the precomputed range-weight table.
+            int colorDistance = static_cast<int>(fabsf(nearValue - centerValue));
+            if (colorDistance > maxGrayDistance) colorDistance = maxGrayDistance;
+
+            float finalWeight = spatialLookupWeights[spatialIndex] * colorLookupWeights[colorDistance];
+            valueSum += finalWeight * nearValue;
+            weightSum += finalWeight;
+        }
+    }
+
+    outputImage[pixelY * outputStep + pixelX] = static_cast<uchar>(valueSum / weightSum);
+}
+
+// BGR color bilateral filter kernel. Same launch contract as the grayscale kernel;
+// pixels are interleaved 3-byte BGR, rows are inputStep/outputStep bytes apart.
+__global__ void BilateralFilterColorKernel(
+    const uchar* inputImage,
+    uchar* outputImage,
+    const float* spatialLookupWeights,
+    const float* colorLookupWeights,
+    int imageWidth,
+    int imageHeight,
+    size_t inputStep,
+    size_t outputStep,
+    int filterRadius
+) {
+    int pixelX = blockIdx.x * blockDim.x + threadIdx.x;
+    int pixelY = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (pixelX >= imageWidth || pixelY >= imageHeight) return;
+
+    int paddedX = pixelX + filterRadius;
+    int paddedY = pixelY + filterRadius;
+    size_t centerIndex = paddedY * inputStep + paddedX * 3;
+
+    // Center pixel BGR values.
+    float centerBlue = static_cast<float>(inputImage[centerIndex + 0]);
+    float centerGreen = static_cast<float>(inputImage[centerIndex + 1]);
+    float centerRed = static_cast<float>(inputImage[centerIndex + 2]);
+
+    float blueSum = 0.0f;
+    float greenSum = 0.0f;
+    float redSum = 0.0f;
+    float weightSum = 0.0f;
+    int spatialIndex = 0;
+
+    for (int offsetY = -filterRadius; offsetY <= filterRadius; ++offsetY) {
+        for (int offsetX = -filterRadius; offsetX <= filterRadius; ++offsetX, ++spatialIndex) {
+            size_t nearIndex = (paddedY + offsetY) * inputStep + (paddedX + offsetX) * 3;
+
+            float nearBlue = static_cast<float>(inputImage[nearIndex + 0]);
+            float nearGreen = static_cast<float>(inputImage[nearIndex + 1]);
+            float nearRed = static_cast<float>(inputImage[nearIndex + 2]);
+
+            // L1 distance in BGR space indexes the precomputed range-weight table.
+            int colorDistance =
+                static_cast<int>(fabsf(nearBlue - centerBlue)) +
+                static_cast<int>(fabsf(nearGreen - centerGreen)) +
+                static_cast<int>(fabsf(nearRed - centerRed));
+
+            if (colorDistance > maxColorDistance) colorDistance = maxColorDistance;
+
+            float finalWeight = spatialLookupWeights[spatialIndex] * colorLookupWeights[colorDistance];
+            blueSum += finalWeight * nearBlue;
+            greenSum += finalWeight * nearGreen;
+            redSum += finalWeight * nearRed;
+            weightSum += finalWeight;
+        }
+    }
+
+    size_t outputIndex = pixelY * outputStep + pixelX * 3;
+    outputImage[outputIndex + 0] = static_cast<uchar>(blueSum / weightSum);
+    outputImage[outputIndex + 1] = static_cast<uchar>(greenSum / weightSum);
+    outputImage[outputIndex + 2] = static_cast<uchar>(redSum / weightSum);
+}
+
+// Build the spatial and range (color) Gaussian weight lookup tables.
+// spatialLookupWeights gets (2r+1)^2 entries in row-major window order;
+// colorLookupWeights gets colorTableSize+1 entries indexed by intensity distance.
+void lookupWeights(
+    std::vector<float>& spatialLookupWeights,
+    std::vector<float>& colorLookupWeights,
+    int filterRadius,
+    float spatialSigma,
+    float colorSigma,
+    int colorTableSize
+) {
+    int windowLength = 2 * filterRadius + 1;
+
+    spatialLookupWeights.resize(windowLength * windowLength);
+    colorLookupWeights.resize(colorTableSize + 1);
+
+    float spatialFactor = -0.5f / (spatialSigma * spatialSigma);
+    float colorCoefficient = -0.5f / (colorSigma * colorSigma);
+    int spatialIndex = 0;
+
+    for (int offsetY = -filterRadius; offsetY <= filterRadius; ++offsetY) {
+        for (int offsetX = -filterRadius; offsetX <= filterRadius; ++offsetX, ++spatialIndex) {
+            float distanceSquare = static_cast<float>(offsetX * offsetX + offsetY * offsetY);
+            spatialLookupWeights[spatialIndex] = expf(distanceSquare * spatialFactor);
+        }
+    }
+
+    for (int colorDistance = 0; colorDistance <= colorTableSize; ++colorDistance) {
+        colorLookupWeights[colorDistance] = expf(colorDistance * colorDistance * colorCoefficient);
+    }
+}
+
+// Mean absolute error between two images, averaged over channels.
+float calculateMeanAbsoluteError(const cv::Mat& image1, const cv::Mat& image2) {
+    cv::Mat diff;
+    cv::absdiff(image1, image2, diff);
+    cv::Scalar meanVal = cv::mean(diff);
+    float mae = 0.0f;
+    for (int c = 0; c < image1.channels(); ++c) {
+        mae += static_cast<float>(meanVal[c]);
+    }
+    return mae / image1.channels();
+}
+
+// Estimate image noise level via the standard deviation of |Laplacian| on grayscale.
+float estimateNoiseLevel(const cv::Mat& image) {
+    cv::Mat gray;
+    if (image.channels() == 3) {
+        cv::cvtColor(image, gray, cv::COLOR_BGR2GRAY);
+    } else {
+        gray = image.clone();
+    }
+
+    cv::Mat laplacian;
+    cv::Laplacian(gray, laplacian, CV_32F);
+    cv::Mat absLaplacian = cv::abs(laplacian);
+
+    cv::Scalar meanVal, stdVal;
+    cv::meanStdDev(absLaplacian, meanVal, stdVal);
+
+    return static_cast<float>(stdVal[0]);
+}
+
+// Adaptive radius selection: grow the filter radius with the estimated noise level.
+int selectAdaptiveRadius(const cv::Mat& image, float baseRadius, float noiseLevel) {
+    if (noiseLevel <= 0) {
+        noiseLevel = estimateNoiseLevel(image);
+    }
+
+    // Heavier noise -> larger radius for stronger smoothing; light noise keeps detail.
+    float noiseFactor = std::min(noiseLevel / 30.0f, 2.0f); // normalized noise factor, capped at 2
+    int adaptiveRadius = static_cast<int>(baseRadius * (1.0f + noiseFactor * 0.5f));
+
+    return std::max(2, std::min(adaptiveRadius, 10));
+}
+
+// Parse the "key = value  # comment" parameter file into params.
+// Returns false (keeping defaults) when the file cannot be opened.
+bool parseParamsFile(const std::string& filename, BilateralParams& params) {
+    std::ifstream file(filename);
+    if (!file.is_open()) {
+        std::cerr << "Warning: Cannot open params file: " << filename << ", using default values." << std::endl;
+        return false;
+    }
+
+    std::string line;
+    while (std::getline(file, line)) {
+        if (line.empty() || line[0] == '#') continue;
+
+        std::istringstream iss(line);
+        std::string key;
+        if (std::getline(iss, key, '=')) {
+            key.erase(0, key.find_first_not_of(" \t"));
+            key.erase(key.find_last_not_of(" \t") + 1);
+
+            std::string value;
+            if (std::getline(iss, value, '#')) {
+                value.erase(0, value.find_first_not_of(" \t"));
+                value.erase(value.find_last_not_of(" \t") + 1);
+            }
+            // Guard: a line with no value would make std::stoi/stof throw.
+            if (value.empty()) continue;
+
+            if (key == "radius") {
+                params.radius = std::stoi(value);
+            } else if (key == "sigma_spatial") {
+                params.sigmaSpatial = std::stof(value);
+            } else if (key == "sigma_color") {
+                params.sigmaColor = std::stof(value);
+            } else if (key == "use_adaptive_radius") {
+                params.useAdaptiveRadius = (value == "true" || value == "1");
+            }
+        }
+    }
+    return true;
+}
+
+// Write an image as a binary raw file: int width, int height, int channels header,
+// followed by row-major 8-bit pixel data.
+bool writeRawImage(const std::string& filename, const cv::Mat& image) {
+    std::ofstream file(filename, std::ios::binary);
+    if (!file.is_open()) {
+        std::cerr << "Error: Cannot write to file: " << filename << std::endl;
+        return false;
+    }
+
+    int width = image.cols;
+    int height = image.rows;
+    int channels = image.channels();
+
+    // Header: width, height, channel count.
+    file.write(reinterpret_cast<const char*>(&width), sizeof(int));
+    file.write(reinterpret_cast<const char*>(&height), sizeof(int));
+    file.write(reinterpret_cast<const char*>(&channels), sizeof(int));
+
+    // Pixel data, one row at a time (Mat rows may be padded, so do not dump .data).
+    for (int y = 0; y < height; ++y) {
+        file.write(reinterpret_cast<const char*>(image.ptr(y)), width * channels);
+    }
+
+    return file.good();
+}
+
+// Read an image back from the binary raw format written by writeRawImage.
+// Returns an empty Mat on failure.
+cv::Mat readRawImage(const std::string& filename) {
+    std::ifstream file(filename, std::ios::binary);
+    if (!file.is_open()) {
+        std::cerr << "Error: Cannot open file: " << filename << std::endl;
+        return cv::Mat();
+    }
+
+    int width, height, channels;
+    file.read(reinterpret_cast<char*>(&width), sizeof(int));
+    file.read(reinterpret_cast<char*>(&height), sizeof(int));
+    file.read(reinterpret_cast<char*>(&channels), sizeof(int));
+
+    // Validate the header before allocating -- a truncated/corrupt file would
+    // otherwise produce a bogus allocation or garbage image.
+    if (!file || width <= 0 || height <= 0 || (channels != 1 && channels != 3)) {
+        std::cerr << "Error: Invalid raw image header in: " << filename << std::endl;
+        return cv::Mat();
+    }
+
+    int type = (channels == 1) ? CV_8UC1 : CV_8UC3;
+    cv::Mat image(height, width, type);
+
+    for (int y = 0; y < height; ++y) {
+        file.read(reinterpret_cast<char*>(image.ptr(y)), width * channels);
+    }
+
+    return image;
+}
+
+// Write the performance log file (parameters, timings, throughput, MAE check).
+void writePerformanceLog(
+    const std::string& filename,
+    const std::string& imagePath,
+    int width,
+    int height,
+    int channels,
+    int radius,
+    float spatialSigma,
+    float colorSigma,
+    double gpuTimeMs,
+    double cpuTimeMs,
+    float mae
+) {
+    std::ofstream file(filename);
+    if (!file.is_open()) {
+        std::cerr << "Error: Cannot write log file: " << filename << std::endl;
+        return;
+    }
+
+    double totalPixels = static_cast<double>(width) * height;
+    double gpuThroughput = totalPixels / (gpuTimeMs / 1000.0) / 1000000.0; // megapixels per second
+    double cpuThroughput = totalPixels / (cpuTimeMs / 1000.0) / 1000000.0;
+    double speedup = cpuTimeMs / gpuTimeMs;
+
+    file << "========== Bilateral Filter Performance Log ==========" << std::endl;
+    file << "Image: " << imagePath << std::endl;
+    file << "Resolution: " << width << "x" << height << std::endl;
+    file << "Channels: " << channels << std::endl;
+    file << "Total Pixels: " << totalPixels << std::endl;
+    file << std::endl;
+    file << "Filter Parameters:" << std::endl;
+    file << " Radius: " << radius << std::endl;
+    file << " Sigma Spatial: " << spatialSigma << std::endl;
+    file << " Sigma Color: " << colorSigma << std::endl;
+    file << std::endl;
+    file << "Performance Metrics:" << std::endl;
+    file << " GPU Time: " << gpuTimeMs << " ms" << std::endl;
+    file << " GPU Throughput: " << gpuThroughput << " MP/s" << std::endl;
+    file << " CPU Time: " << cpuTimeMs << " ms" << std::endl;
+    file << " CPU Throughput: " << cpuThroughput << " MP/s" << std::endl;
+    file << " Speedup (CPU/GPU): " << speedup << "x" << std::endl;
+    file << std::endl;
+    file << "Correctness:" << std::endl;
+    file << " MAE vs OpenCV: " << mae << std::endl;
+    file << " Pass (MAE < 1.0): " << (mae < 1.0 ? "YES" : "NO") << std::endl;
+    file << "======================================================" << std::endl;
+}
+
+// Time the GPU bilateral filter with CUDA events (one warm-up launch, then the
+// average of 10 timed runs). Copies the last result back into resultImage.
+double measureGPUTime(
+    const cv::Mat& inputImage,
+    const cv::Mat& paddedImage,
+    cv::Mat& resultImage,
+    const std::vector<float>& spatialLookupWeights,
+    const std::vector<float>& colorLookupWeights,
+    int filterRadius
+) {
+    uchar* deviceInputImage = nullptr;
+    uchar* deviceOutputImage = nullptr;
+    float* deviceSpatialLookupWeights = nullptr;
+    float* deviceColorLookupWeights = nullptr;
+
+    CUDA_CHECK(cudaMalloc(&deviceInputImage, paddedImage.step * paddedImage.rows));
+    CUDA_CHECK(cudaMalloc(&deviceOutputImage, resultImage.step * resultImage.rows));
+
+    CUDA_CHECK(cudaMalloc(&deviceSpatialLookupWeights, spatialLookupWeights.size() * sizeof(float)));
+    CUDA_CHECK(cudaMalloc(&deviceColorLookupWeights, colorLookupWeights.size() * sizeof(float)));
+
+    CUDA_CHECK(cudaMemcpy(deviceInputImage, paddedImage.data, paddedImage.step * paddedImage.rows, cudaMemcpyHostToDevice));
+
+    CUDA_CHECK(cudaMemcpy(deviceSpatialLookupWeights, spatialLookupWeights.data(), spatialLookupWeights.size() * sizeof(float), cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(deviceColorLookupWeights, colorLookupWeights.data(), colorLookupWeights.size() * sizeof(float), cudaMemcpyHostToDevice));
+
+    dim3 blockSize(blockSizeX, blockSizeY);
+    dim3 blockGroup((inputImage.cols + blockSizeX - 1) / blockSizeX, (inputImage.rows + blockSizeY - 1) / blockSizeY);
+
+    cudaEvent_t start, stop;
+    CUDA_CHECK(cudaEventCreate(&start));
+    CUDA_CHECK(cudaEventCreate(&stop));
+
+    // Warm-up launch (excluded from timing).
+    if (inputImage.channels() == 1) {
+        BilateralFilterGrayKernel<<<blockGroup, blockSize>>>(
+            deviceInputImage, deviceOutputImage, deviceSpatialLookupWeights, deviceColorLookupWeights,
+            inputImage.cols, inputImage.rows, paddedImage.step, resultImage.step, filterRadius);
+    } else {
+        BilateralFilterColorKernel<<<blockGroup, blockSize>>>(
+            deviceInputImage, deviceOutputImage, deviceSpatialLookupWeights, deviceColorLookupWeights,
+            inputImage.cols, inputImage.rows, paddedImage.step, resultImage.step, filterRadius);
+    }
+    // Kernel launches do not return errors directly -- surface launch-config errors here.
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    // Timed runs: average over 10 iterations.
+    const int iterations = 10;
+    CUDA_CHECK(cudaEventRecord(start));
+    for (int i = 0; i < iterations; ++i) {
+        if (inputImage.channels() == 1) {
+            BilateralFilterGrayKernel<<<blockGroup, blockSize>>>(
+                deviceInputImage, deviceOutputImage, deviceSpatialLookupWeights, deviceColorLookupWeights,
+                inputImage.cols, inputImage.rows, paddedImage.step, resultImage.step, filterRadius);
+        } else {
+            BilateralFilterColorKernel<<<blockGroup, blockSize>>>(
+                deviceInputImage, deviceOutputImage, deviceSpatialLookupWeights, deviceColorLookupWeights,
+                inputImage.cols, inputImage.rows, paddedImage.step, resultImage.step, filterRadius);
+        }
+    }
+    CUDA_CHECK(cudaGetLastError());
+
+    CUDA_CHECK(cudaEventRecord(stop));
+    CUDA_CHECK(cudaEventSynchronize(stop));
+
+    float elapsedTime = 0.0f;
+    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
+    double avgTime = elapsedTime / iterations;
+
+    CUDA_CHECK(cudaMemcpy(resultImage.data, deviceOutputImage, resultImage.step * resultImage.rows, cudaMemcpyDeviceToHost));
+
+    CUDA_CHECK(cudaEventDestroy(start));
+    CUDA_CHECK(cudaEventDestroy(stop));
+
+    CUDA_CHECK(cudaFree(deviceInputImage));
+    CUDA_CHECK(cudaFree(deviceOutputImage));
+
+    CUDA_CHECK(cudaFree(deviceSpatialLookupWeights));
+    CUDA_CHECK(cudaFree(deviceColorLookupWeights));
+
+    return avgTime;
+}
+
+// Time OpenCV's CPU bilateralFilter (one warm-up call, then the average of 5 runs).
+// Note cv::bilateralFilter's argument order: diameter, sigmaColor, sigmaSpace.
+double measureCPUTime(const cv::Mat& inputImage, cv::Mat& outputImage,
+                      int diameter, float colorSigma, float spatialSigma) {
+    const int iterations = 5;
+
+    cv::bilateralFilter(inputImage, outputImage, diameter, colorSigma, spatialSigma);
+
+    auto start = std::chrono::high_resolution_clock::now();
+    for (int i = 0; i < iterations; ++i) {
+        cv::bilateralFilter(inputImage, outputImage, diameter, colorSigma, spatialSigma);
+    }
+    auto end = std::chrono::high_resolution_clock::now();
+
+    std::chrono::duration<double, std::milli> elapsed = end - start;
+    return elapsed.count() / iterations;
+}
+
+int main(int argc, char** argv) {
+    std::string imagePath = (argc > 1) ? argv[1] : "test.jpg";
+    std::string paramsFile = (argc > 2) ? argv[2] : "params.txt";
+
+    // Parse the parameter file (falls back to defaults on failure).
+    BilateralParams params;
+    parseParamsFile(paramsFile, params);
+
+    std::cout << "Parameters:" << std::endl;
+    std::cout << " Radius: " << params.radius << std::endl;
+    std::cout << " Sigma Spatial: " << params.sigmaSpatial << std::endl;
+    std::cout << " Sigma Color: " << params.sigmaColor << std::endl;
+    std::cout << " Use Adaptive Radius: " << (params.useAdaptiveRadius ? "true" : "false") << std::endl;
+
+    cv::Mat inputImage = cv::imread(imagePath, cv::IMREAD_UNCHANGED);
+    if (inputImage.empty()) {
+        std::cerr << "Error: Cannot load image: " << imagePath << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    if (inputImage.channels() == 4) {
+        cv::cvtColor(inputImage, inputImage, cv::COLOR_BGRA2BGR);
+    }
+
+    if (inputImage.channels() != 1 && inputImage.channels() != 3) {
+        std::cerr << "Error: Only 1-channel (grayscale) or 3-channel (RGB) images are supported." << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    int filterRadius = params.radius;
+    if (params.useAdaptiveRadius) {
+        filterRadius = selectAdaptiveRadius(inputImage, params.radius, params.noiseLevel);
+        std::cout << "Adaptive radius selected: " << filterRadius << std::endl;
+    }
+
+    int colorTableSize = (inputImage.channels() == 1) ? maxGrayDistance : maxColorDistance;
+
+    std::vector<float> spatialLookupWeights;
+    std::vector<float> colorLookupWeights;
+    lookupWeights(spatialLookupWeights, colorLookupWeights,
+                  filterRadius,
+                  params.sigmaSpatial, params.sigmaColor,
+                  colorTableSize);
+
+    // Pad borders with reflection so the kernel never reads out of bounds.
+    cv::Mat paddedImage;
+    cv::copyMakeBorder(inputImage, paddedImage, filterRadius, filterRadius, filterRadius, filterRadius, cv::BORDER_REFLECT);
+
+    cv::Mat resultImage = cv::Mat::zeros(inputImage.size(), inputImage.type());
+
+    double gpuTimeMs = measureGPUTime(inputImage, paddedImage, resultImage, spatialLookupWeights, colorLookupWeights, filterRadius);
+
+    // CPU OpenCV reference for comparison.
+    cv::Mat referenceImage;
+    double cpuTimeMs = measureCPUTime(inputImage, referenceImage, 2 * filterRadius + 1, params.sigmaColor, params.sigmaSpatial);
+
+    float mae = calculateMeanAbsoluteError(resultImage, referenceImage);
+
+    std::cout << std::endl;
+    std::cout << "========== Results ==========" << std::endl;
+    std::cout << "Image: " << imagePath << " (" << inputImage.cols << "x" << inputImage.rows << ", " << inputImage.channels() << " channels)" << std::endl;
+    std::cout << "Filter Radius: " << filterRadius << std::endl;
+    std::cout << std::endl;
+    std::cout << "Performance:" << std::endl;
+    std::cout << " GPU Time: " << gpuTimeMs << " ms" << std::endl;
+    std::cout << " GPU Throughput: " << (inputImage.cols * inputImage.rows / (gpuTimeMs / 1000.0) / 1000000.0) << " MP/s" << std::endl;
+    std::cout << " CPU Time: " << cpuTimeMs << " ms" << std::endl;
+    std::cout << " CPU Throughput: " << (inputImage.cols * inputImage.rows / (cpuTimeMs / 1000.0) / 1000000.0) << " MP/s" << std::endl;
+    std::cout << " Speedup (CPU/GPU): " << (cpuTimeMs / gpuTimeMs) << "x" << std::endl;
+    std::cout << std::endl;
+    std::cout << "Correctness:" << std::endl;
+    std::cout << " MAE vs OpenCV: " << mae << std::endl;
+    std::cout << " Pass (MAE < 1.0): " << (mae < 1.0 ? "YES" : "NO") << std::endl;
+    std::cout << "=============================" << std::endl;
+
+    std::string outputFilename = "bilateralFilter_result.raw";
+    if (!writeRawImage(outputFilename, resultImage)) {
+        std::cerr << "Warning: Failed to write output raw file." << std::endl;
+    }
+
+    cv::imwrite("bilateralFilter_result.jpg", resultImage);
+
+    writePerformanceLog(
+        "performance_log.txt",
+        imagePath,
+        inputImage.cols,
+        inputImage.rows,
+        inputImage.channels(),
+        filterRadius,
+        params.sigmaSpatial,
+        params.sigmaColor,
+        gpuTimeMs,
+        cpuTimeMs,
+        mae
+    );
+
+    if (mae >= 1.0f) {
+        std::cerr << "Warning: MAE (" << mae << ") >= 1.0, results may not meet accuracy requirements." << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
diff --git "a/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/monarch.jpg" "b/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/monarch.jpg"
new file mode 100644
index 0000000..ee297f7
Binary files /dev/null and "b/08_bilateral_filter/\346\234\264\347\264\240\345\256\236\347\216\260\344\273\243\347\240\201\345\217\212\347\273\223\346\236\234/monarch.jpg" differ
diff --git "a/08_bilateral_filter/\351\241\271\347\233\256\350\246\201\346\261\202.md" "b/08_bilateral_filter/\351\241\271\347\233\256\350\246\201\346\261\202.md"
new file mode 100644
index 0000000..d2e652c
--- /dev/null
+++ "b/08_bilateral_filter/\351\241\271\347\233\256\350\246\201\346\261\202.md"
@@ -0,0 +1,54 @@
+## 八. 
实时图像双边滤波(CUDA) + +### **任务背景** + +凭借你 InfiniTensor 训练营的经历和人工智能与并行计算方面的出色成绩,你受雇于一家领先的虚拟现实公司,负责为下一代 VR 头显开发实时图像增强模块。用户经常抱怨在弱光环境下看到的图像噪点严重,而现有降噪算法要么模糊细节,要么延迟太高导致眩晕。 + +你提出采用双边滤波算法,它能够在去除噪声的同时保持边缘清晰,非常适合VR图像处理。但双边滤波计算量极大,每个像素需要访问整个邻域并计算复杂的权重,难以在移动端实时运行。幸运的是,VR 头显内置了你知道的某单位生产的黑科技 GPU,你需要利用 CUDA 实现一个高度优化的双边滤波核,能够对 4K 分辨率、60 帧/秒的视频流进行实时处理。这将大幅提升用户的沉浸感和舒适度,为公司抢占市场提供核心技术。 + +### **领域知识简介** + +双边滤波是一种非线性的保边滤波器,其输出像素值是邻域内所有像素的加权平均。权重由两部分组成:空间权重(依赖于像素间的空间距离,高斯函数)和颜色权重(依赖于像素值的差异,高斯函数)。因此,在平坦区域颜色相似,权重近似空间高斯,实现平滑;在边缘处颜色差异大,颜色权重小,避免跨边缘平滑。 + +双边滤波的难点在于计算量大:对于半径为 `r`的窗口,每个像素需要计算 `(2r+1)²`个权重,且涉及指数运算。实时性要求(如 4K 60fps)对算法效率提出了极高挑战。 + +### **任务内容** + +开发一个 CUDA 程序,实现每个像素独立计算的并行双边滤波。 + +#### **输入文件定义** + +1. **图像文件** (标准格式,如 PNG、JPG),RGB 三通道或灰度。可使用 OpenCV 或 stb_image 读取,但本项目只关注 GPU 部分,因此可提供已读取的像素数组。为简化,输入可为二进制 raw 格式:宽、高、通道数,然后像素数据按行主序存储(RGBRGB...或灰度)。 +2. **参数文件** (文本格式) + +```Plain +radius = 5 # 窗口半径,实际窗口大小为 (2*radius+1)^2 +sigma_spatial = 3.0 # 空间域标准差 +sigma_color = 30.0 # 颜色域标准差(对于8位图像,范围0-255) +``` + +#### 输出文件定义 + +* **滤波后的图像** :与输入同格式的二进制 raw 文件。 +* **性能日志** :处理时间(ms)、吞吐量(百万像素/秒)、加速比(对比 CPU OpenCV 实现)。 + +### 要求 + +1. **图像格式** :输入图像支持灰度图或 RGB 彩色图,像素值范围为 0-255(整数),处理时转换为浮点。输出为同格式。 +2. **滤波器参数** :窗口半径 `r`至少支持到 5(即 `11×11` 窗口),空间标准差 `σs`(sigma_spatial) 和颜色标准差 `σc`(sigma_color) 可配置。 +3. **正确性验证** :输出图像应与 OpenCV 的 `bilateralFilter` 结果对比,计算平均绝对误差(MAE)或峰值信噪比(PSNR)。平均绝对误差小于 1(对于 8 位图像)。 +4. **进阶优化** :实现自适应半径选择。 +5. **平台适配** :默认需在英伟达平台上支持。每在一款国产平台上支持,额外加 20% 乘算系数。 + + **需提交内容** : + +* 整个完成上述功能的程序(包含测试) + * 提交地址:[Learning-CUDA 2025-winter-project 分支](https://github.com/InfiniTensor/Learning-CUDA/tree/2025-winter-project) + * 提交要求:参照项目代码要求中的第四条 +* 总结报告。报告需包括: + * 详细阐述实现思路以及优化方法,欢迎包含自己的优化历程和记录,开发中发现的问题等(取决于质量可以加分) + * 最终的性能指标和分析 + * 未来可继续提升的地方(可选) + * 包含 ncu 和/或 nsys 使用和分析的加分 + +---