Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions 08_bilateral_filter/Snowkyo16/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Build artifacts
build/

# Runtime output
output/

# System files
.DS_Store
116 changes: 116 additions & 0 deletions 08_bilateral_filter/Snowkyo16/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# *********************************************************************
# 双边滤波 CUDA 项目 - Makefile
#
# 平台选择:
# PLATFORM=nvidia make build : NVIDIA A100 GPU(默认)
# PLATFORM=iluvatar make build : Iluvatar BI-V100 GPU
# PLATFORM=metax make build : MetaX C500 GPU
# PLATFORM=moore make build : Moore Threads S5000 GPU
#
# *********************************************************************

# -------------------------------
# Configuration
# -------------------------------
# Target platform. `?=` keeps a value already supplied on the command line
# or through the environment; otherwise nvidia is used.
PLATFORM ?= nvidia

# Flags common to all platforms. Each branch below appends its own
# platform macro to CFLAGS; the iluvatar and moore branches also set LDFLAGS.
CFLAGS  := -std=c++17 -O2
LDFLAGS :=
INCLUDE := -I include

# Pick the compiler driver and platform-specific flags.
# Any PLATFORM value other than the four listed aborts parsing via $(error).
ifeq ($(PLATFORM),nvidia)
  CC     := /usr/local/cuda/bin/nvcc
  CFLAGS += -arch=sm_80 -DPLATFORM_NVIDIA
else ifeq ($(PLATFORM),iluvatar)
  CC      := clang++
  CFLAGS  += -DPLATFORM_ILUVATAR --cuda-path=/usr/local/corex
  LDFLAGS := -L/usr/local/corex/lib -lcudart
else ifeq ($(PLATFORM),metax)
  CC     := /opt/maca/mxgpu_llvm/bin/mxcc
  CFLAGS += -DPLATFORM_METAX
else ifeq ($(PLATFORM),moore)
  # MUSA_HOME may be overridden by the caller; the default install prefix is used otherwise.
  MUSA_HOME ?= /usr/local/musa
  CC        := $(MUSA_HOME)/bin/mcc
  CFLAGS    += -DPLATFORM_MOORE
  LDFLAGS   += -L$(MUSA_HOME)/lib -lmusart -L/usr/lib/gcc/x86_64-linux-gnu/11 -lstdc++
else
  $(error 仅支持 nvidia / iluvatar / metax / moore 平台)
endif

# Directories: BUILD_DIR holds the objects and the executable,
# OUTPUT_DIR is created by `run` and handed to the program.
BUILD_DIR  := build
OUTPUT_DIR := output

# Executable
TARGET := $(BUILD_DIR)/bilateral_filter

# Object files (every one lives directly under BUILD_DIR)
MAIN_OBJ := $(addprefix $(BUILD_DIR)/,main.o)
CU_OBJS  := $(addprefix $(BUILD_DIR)/,kernels.o)
CPP_OBJS := $(addprefix $(BUILD_DIR)/,bilateral_cpu.o image_io.o params.o benchmark.o)
ALL_OBJS := $(MAIN_OBJ) $(CU_OBJS) $(CPP_OBJS)

# User-selectable input image and version to run.
# INPUT is empty by default; `run` then picks an image from test_images/.
INPUT ?=
MODE  ?= all

# -------------------------------
# Phony Targets
# -------------------------------
# These names are commands, not files, so they are always considered out of date.
.PHONY: all build
.PHONY: run clean

# Default goal: compile, then run
all: build run

# Compile only: bring the executable up to date
build: $(TARGET)

# Run target: execute the filter on a single image.
#   INPUT - path of the image to process. When empty, the first entry that
#           `ls` reports for test_images/*.png and test_images/*.jpg is used.
#   MODE  - version selector, forwarded as the last program argument.
# The recipe exits with status 1 when no image can be found.
# The image path is passed in double quotes so that a path containing spaces
# reaches the program as one argument instead of being word-split by the shell.
run: $(TARGET)
	@mkdir -p $(OUTPUT_DIR)/images
	@echo "=== 运行双边滤波 ($(MODE)) ==="
	@if [ -n "$(INPUT)" ]; then \
		IMG="$(INPUT)"; \
	else \
		IMG=$$(ls test_images/*.png test_images/*.jpg 2>/dev/null | head -n 1); \
	fi; \
	if [ -z "$$IMG" ]; then \
		echo "错误: test_images/ 下没有找到 png/jpg 图片"; \
		echo "用法: make run INPUT=test_images/xxx.png"; \
		exit 1; \
	fi; \
	echo "输入图像: $$IMG"; \
	./$(TARGET) "$$IMG" params.txt $(OUTPUT_DIR) $(MODE)

# Clean target: remove the build and output directories.
# $(RM) is make's predefined `rm -f`; `-r` makes it recurse into the directories.
clean:
	@echo "=== 清理临时文件 ==="
	$(RM) -r $(BUILD_DIR) $(OUTPUT_DIR)

# -------------------------------
# Dependency Rules
# -------------------------------

# Link all object files into the executable.
# $(LDFLAGS) carries the -L/-l options and is placed AFTER the objects:
# linkers resolve libraries in command-line order, so a library named before
# the objects that reference it can be skipped (static archives, or shared
# libraries under --as-needed), leaving undefined references.
$(TARGET): $(ALL_OBJS)
	@mkdir -p $(BUILD_DIR)
	@$(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS)

# GPU source compile rule. The source extension follows the platform:
# .maca for metax, .mu for moore, .cu for everything else.
ifeq ($(PLATFORM),metax)
  GPU_SRC_EXT := maca
else ifeq ($(PLATFORM),moore)
  GPU_SRC_EXT := mu
else
  GPU_SRC_EXT := cu
endif

# One pattern rule serves all platforms; only the prerequisite extension differs.
$(BUILD_DIR)/%.o: src/%.$(GPU_SRC_EXT)
	@mkdir -p $(BUILD_DIR)
	@$(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@

# Shared by all platforms: build an object from its .cpp source.
# $(@D) is the directory part of the target; for the objects listed in
# this Makefile that is always $(BUILD_DIR).
$(BUILD_DIR)/%.o: src/%.cpp
	@mkdir -p $(@D)
	@$(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@
190 changes: 190 additions & 0 deletions 08_bilateral_filter/Snowkyo16/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
# 基于CUDA实现实时图像双边滤波

## 项目结构

```
Snowkyo16/
├── src/
│ ├── main.cu # 主程序入口，版本调度器（NVIDIA A100 / Iluvatar BI-V100）
│ ├── main.maca # 主程序入口,版本调度器(MetaX C500)
│ ├── main.mu # 主程序入口,版本调度器(Moore Threads S5000)
│ ├── kernels.cu # V1-V4 GPU kernel 及 wrapper 实现 (NVIDIA A100 / Iluvatar BI-V100)
│ ├── kernels.maca # V5 GPU kernel 及 wrapper 实现(MetaX C500)
│ ├── kernels.mu # V6 GPU kernel 及 wrapper 实现(Moore Threads S5000)
│ ├── bilateral_cpu.cpp # V0 CPU 基线实现
│ ├── image_io.cpp # 图像读写(基于 stb_image)
│ ├── params.cpp # 滤波参数解析
│ └── benchmark.cpp # 计时框架、性能汇总表
├── include/
│ ├── bilateral_filter.cuh # 各版本滤波函数声明
│ ├── image_io.h # Image 结构体 + 读写接口
│ ├── params.h # FilterParams 结构体
│ ├── benchmark.h # 计时框架接口
│ ├── utils.cuh # CUDA 错误检查宏
│ ├── stb_image.h # 第三方:图像解码
│ └── stb_image_write.h # 第三方:图像编码
├── scripts/
│ └── compare_opencv.py # OpenCV 对比验证脚本
├── lab_report.pdf # 实验报告
├── assets/ # 实验截图
├── test_images/ # 测试图像
├── params.txt # 默认滤波参数
├── Makefile # 编译构建
└── README.md
```

## 版本说明

| 版本 | 说明 | MODE 参数 |
|------|------|-----------|
| v0_cpu | CPU 基线实现 | `v0` |
| v1_naive | GPU Naive,一个线程一个像素 | `v1` |
| v2_smem | GPU Shared Memory Tiling | `v2` |
| v3_constmem | GPU Constant Memory LUT | `v3` |
| v4_stream | GPU Pinned Memory Stream Pipeline | `v4` |
| v5_metax | MetaX C500 MACA 移植(基于 V4 流水线) | `v5`(仅 MACA 平台) |
| v6_moore | Moore Threads S5000 MUSA 移植(基于 V4 流水线) | `v6`(仅 MUSA 平台) |
| all | 跑所有版本 + 性能对比表 | `all`(默认) |


## 编译与运行

### NVIDIA A100 平台

```bash
# 清理
make clean

# 编译
make build

# 运行
make run INPUT=test_images/yosemite.jpg
make run INPUT=test_images/yosemite.jpg MODE=v4

# 指定空闲 GPU
CUDA_VISIBLE_DEVICES=6 make run INPUT=test_images/yosemite.jpg
```

### Iluvatar BI-V100 平台

```bash
# 清理
PLATFORM=iluvatar make clean

# 编译
PLATFORM=iluvatar make build

# 运行
PLATFORM=iluvatar make run INPUT=test_images/yosemite.jpg
PLATFORM=iluvatar make run INPUT=test_images/yosemite.jpg MODE=v4
```

### MetaX C500 平台(MACA)

```bash
# 清理
PLATFORM=metax make clean

# 编译
PLATFORM=metax make build

# 运行
PLATFORM=metax make run INPUT=test_images/yosemite.jpg MODE=all
PLATFORM=metax make run INPUT=test_images/yosemite.jpg MODE=v5
```

### Moore Threads S5000 平台(MUSA)

```bash
# 清理
PLATFORM=moore make clean

# 编译
PLATFORM=moore make build

# 运行
PLATFORM=moore make run INPUT=test_images/yosemite.jpg MODE=all
PLATFORM=moore make run INPUT=test_images/yosemite.jpg MODE=v6
```

## OpenCV 对比验证

**通过标准:** MAE < 1,PSNR > 40 dB

```bash
# 验证各版本输出
python3 scripts/compare_opencv.py test_images/yosemite.jpg output/images/yosemite_v0_cpu.png

python3 scripts/compare_opencv.py test_images/yosemite.jpg output/images/yosemite_v1_naive.png

python3 scripts/compare_opencv.py test_images/yosemite.jpg output/images/yosemite_v2_smem.png

python3 scripts/compare_opencv.py test_images/yosemite.jpg output/images/yosemite_v3_constmem.png

python3 scripts/compare_opencv.py test_images/yosemite.jpg output/images/yosemite_v4_stream.png

python3 scripts/compare_opencv.py test_images/yosemite.jpg output/images/yosemite_v5_metax.png

python3 scripts/compare_opencv.py test_images/yosemite.jpg output/images/yosemite_v6_moore.png
```


## 性能分析 (nsys / ncu)
### nsys (时间线分析)
分析 API 调用耗时、内存传输、Kernel执行时间

```bash
# 采集profile
# 以 V4 为例(指定空闲 GPU)
CUDA_VISIBLE_DEVICES=6 nsys profile --trace=cuda -o output/v4_yosemite \
make run INPUT=test_images/yosemite.jpg MODE=v4

# 查看统计报告
nsys stats output/v4_yosemite.nsys-rep
```

### ncu (Kernel 级分析)
分析单个 kernel 的计算吞吐、内存带宽、占用率、缓存命中率

```bash
# 分析 V3 kernel(跳过1次预热,分析1次调用)
CUDA_VISIBLE_DEVICES=6 sudo ncu \
--kernel-name bilateral_filter_kernel_v3 \
--launch-skip 1 --launch-count 1 \
./build/bilateral_filter test_images/yosemite.jpg params.txt output v3

# 分析 V4 的4个 strip kernel
CUDA_VISIBLE_DEVICES=6 sudo ncu \
--kernel-name bilateral_filter_kernel_v3 \
--launch-skip 1 --launch-count 4 \
./build/bilateral_filter test_images/yosemite.jpg params.txt output v4

# 导出报告文件(可用 Nsight Compute GUI 打开)
CUDA_VISIBLE_DEVICES=6 sudo ncu \
--set full \
--kernel-name bilateral_filter_kernel_v3 \
--launch-skip 1 --launch-count 4 \
-o output/ncu_v4_stream_report \
./build/bilateral_filter test_images/yosemite.jpg params.txt output v4
```

#### ncu 性能分析图(V4 Stream,NVIDIA A100)

**GPU Speed Of Light — Throughput Chart**

![GPU Throughput Chart](assets/ncu_ui/ncu_throughput.png)

**GPU Speed Of Light — Roofline**

![Roofline Analysis](assets/ncu_ui/ncu_roofline.png)

**Memory Workload Analysis**

![Memory Workload](assets/ncu_ui/ncu_memory.png)

## 各版本输出效果对比

从图中可以看出，所有版本（V0–V4）的输出图像与 OpenCV 参考结果在视觉上一致，无可察觉差异。双边滤波成功保留了图像的边缘细节（如树木轮廓、岩石纹理），同时有效平滑了噪声区域（如天空的颜色渐变）。这验证了本项目实现的正确性——所有优化版本（V2 Shared Memory、V3 Constant Memory、V4 Stream 流水线）在追求性能的同时，保持了算法的数值精度和视觉质量。

![各版本双边滤波输出效果对比](assets/bf_version.png)
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
29 changes: 29 additions & 0 deletions 08_bilateral_filter/Snowkyo16/include/benchmark.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#pragma once

#include "image_io.h"
#include "params.h"

#include <string>
#include <vector>
#include <functional>

// NOTE(review): a using-directive in a header applies to every file that
// includes it; the declarations below rely on it for unqualified
// string / vector / function.
using namespace std;

// Uniform signature for all filter functions: takes the input image and the
// filter parameters by const reference and returns an Image by value.
using FilterFunc = function<Image(const Image&, const FilterParams&)>;

// Result of running a single version
struct VersionResult {
string name; // e.g. "v0_cpu", "v1_naive"
double time_ms; // presumably the measured time in milliseconds (per the _ms suffix) — confirm in benchmark.cpp
Image output; // presumably the image produced by this version — confirm in benchmark.cpp
};

// Run one version: time it, print, save the image, and return the result.
//   name      - version name (VersionResult::name gives examples such as "v0_cpu")
//   input     - input image
//   params    - filter parameters
//   func      - filter implementation to invoke (see FilterFunc)
//   image_dir, basename, ext - presumably combined into the output file path — confirm in benchmark.cpp
//   num_runs  - defaults to 1; presumably the number of timed runs — confirm in benchmark.cpp
VersionResult run_version(const string& name, const Image& input,
const FilterParams& params, FilterFunc func,
const string& image_dir, const string& basename,
const string& ext, int num_runs = 1);

// Print the summary comparison table and run the consistency check.
//   results       - results of the versions that were run
//   width, height - presumably the image dimensions — confirm in benchmark.cpp
void print_summary(const vector<VersionResult>& results, int width, int height);
33 changes: 33 additions & 0 deletions 08_bilateral_filter/Snowkyo16/include/bilateral_filter.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#pragma once

#include "image_io.h"
#include "params.h"


// V0: CPU baseline implementation
Image bilateral_filter_cpu_v0(const Image& input, const FilterParams& params);

// V1: naive GPU implementation, one thread per pixel, global memory accesses
Image bilateral_filter_gpu_v1(const Image& input, const FilterParams& params);

// V2: GPU shared-memory optimization, tiling + cooperative halo loading
Image bilateral_filter_gpu_v2(const Image& input, const FilterParams& params);

// V3: GPU constant-memory LUT + __expf fast math + loop unrolling
void bilateral_filter_gpu_v3_init(const FilterParams& params); // uploads the LUT
Image bilateral_filter_gpu_v3(const Image& input, const FilterParams& params);

// V4: pinned memory + CUDA streams pipeline, pre-allocated device buffers
void bilateral_filter_gpu_v4_init(const Image& input, const FilterParams& params); // initializes LUT + buffers + pinned memory + streams
Image bilateral_filter_gpu_v4(const Image& input, const FilterParams& params);
void bilateral_filter_gpu_v4_cleanup(); // frees buffers and pinned memory, destroys streams

// V5: MetaX C500 (MACA) platform adaptation, pinned memory + streams pipeline
void bilateral_filter_gpu_v5_init(const Image& input, const FilterParams& params); // initializes LUT + buffers + pinned memory + streams
Image bilateral_filter_gpu_v5(const Image& input, const FilterParams& params);
void bilateral_filter_gpu_v5_cleanup(); // frees buffers and pinned memory, destroys streams

// V6: Moore Threads S5000 (MUSA) platform adaptation, pinned memory + streams pipeline
void bilateral_filter_gpu_v6_init(const Image& input, const FilterParams& params); // initializes LUT + buffers + pinned memory + streams
Image bilateral_filter_gpu_v6(const Image& input, const FilterParams& params);
void bilateral_filter_gpu_v6_cleanup(); // frees buffers and pinned memory, destroys streams
Loading