栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Python

基于CUDA编程二维卷积(Conv2)实现

Python 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

基于CUDA编程二维卷积(Conv2)实现

最近做的CUDA C++的高性能并行作业

本人小菜鸡一个,padding,stride...,全局/共享内存啥的都没做

如果你的输入矩阵不是方针,可自行修改输入大小的宽高,本文默认方针宽高皆为arr_size

1. 二维卷积如下,便不多做解释(图片来自网络,侵权请联系删除)

 2. CPU串行实现代码

void Conv2(float** filter, float** arr, float** result, int filter_size, int arr_size) {
	float temp;

	for (int i = 0; i < arr_size - filter_size + 1; i++) {
		for (int j = 0; j < arr_size - filter_size + 1; j++) {
			temp = 0;
			for (int m = 0; m < filter_size; m++) {
				for (int n = 0; n < filter_size; n++) {
					temp += filter[m][n] * arr[i + m][j + n];
				}
			}
			result[i][j] = temp;
		}
	}
}

3.GPU并行代码

此出采用了一维数组表示二维数组的方法

__global__
void convolution_2D_basic(float* filter, float* arr, float* result, int filter_size, int arr_size)
{
	int Col = blockIdx.x*blockDim.x + threadIdx.x;
	int Row = blockIdx.y*blockDim.y + threadIdx.y;
	if (Row < arr_size - filter_size + 1 && Col < arr_size - filter_size + 1)
	{
		float pixVal = 0;
		//start
		int startCol = Col;
		int startRow = Row;	
		//caculate the res
		for (int i = 0; i < filter_size; i++)
		{
			for (int j = 0; j < filter_size; j++)
			{
				int curRow = startRow + i;
				int curCol = startCol + j;
				if (curRow > -1 && curRow-1 && curCol < arr_size)
				{
					pixVal += filter[i*filter_size + j] * arr[curRow*arr_size + curCol];
				}
			}
		}
		result[Row*arr_size + Col] = pixVal;
	}
}

4. 文件结构和使用

创建一个CUDA runtime 项目(我的项目名:cuda_code)

项目下文件结构如下:

5.完整代码 

 源.cpp完整代码:

#include 
#include 
#include 
#include 
#include"device_launch_parameters.h"
#include"cuda_runtime.h"

#define FILTER_WIDTH 3
int filter_size = FILTER_WIDTH;
int arr_size = 1024;
int result_size = arr_size + FILTER_WIDTH - 1;


extern "C" void Conv2Kernel(float** arr, float** pFilter, int filter_size, int arr_size, int result_size);

void Conv2(float** filter, float** arr, float** result, int filter_size, int arr_size) {
	float temp;

	for (int i = 0; i < arr_size - filter_size + 1; i++) {
		for (int j = 0; j < arr_size - filter_size + 1; j++) {
			temp = 0;
			for (int m = 0; m < filter_size; m++) {
				for (int n = 0; n < filter_size; n++) {
					temp += filter[m][n] * arr[i + m][j + n];
				}
			}
			result[i][j] = temp;
		}
	}
}

int main()
{
	int dev = 0;
	cudaDeviceProp devProp;
	cudaGetDeviceProperties(&devProp, dev);
	std::cout << "使用GPU device " << dev << ": " << devProp.name << std::endl;
	std::cout << "SM的数量:" << devProp.multiProcessorCount << std::endl;
	std::cout << "每个线程块的共享内存大小:" << devProp.sharedMemPerBlock / 1024.0 << " KB" << std::endl;
	std::cout << "每个Grid的Block数:" << devProp.maxGridSize[0] << " x " << devProp.maxGridSize[1] << " x " << devProp.maxGridSize[2] << std::endl;
	std::cout << "每个线程块的最大线程数:" << devProp.maxThreadsPerBlock << std::endl;
	std::cout << "每个EM的最大线程数:" << devProp.maxThreadsPerMultiProcessor << std::endl;
	std::cout << "每个EM的最大线程束数:" << devProp.maxThreadsPerMultiProcessor / 32 << std::endl;


	clock_t CPU_start, CPU_stop;

	// Array, filter, result
	float** pFilter = new float*[filter_size];
	for (int i = 0; i < filter_size; i++)
	{
		pFilter[i] = new float[filter_size];
	}

	float** arr = new float*[arr_size];
	for (int i = 0; i < arr_size; i++)
	{
		arr[i] = new float[arr_size];
	}

	float** res = new float*[result_size];
	for (int i = 0; i < result_size; i++)
	{
		res[i] = new float[result_size];
	}

	//initialization
	for (int i = 0; i < filter_size; i++) {
		for (int j = 0; j < filter_size; j++)
			pFilter[i][j] = rand() % 11;
	}

	for (int i = 0; i < arr_size; i++) {
		for (int j = 0; j < arr_size; j++)
			arr[i][j] = rand() % 11;
	}

	CPU_start = clock();
	Conv2(pFilter, arr, res, filter_size, arr_size);
	CPU_stop = clock();
	float CPU_time = (float)(CPU_stop - CPU_start) / CLOCKS_PER_SEC;
	printf("-------------------CPU version Done!------------------n");
	printf("CPU time:%f n", CPU_time);

	Conv2Kernel(arr, pFilter, filter_size, arr_size, result_size);

}

kernel.cu完整代码:

#include"device_launch_parameters.h"
#include"cuda_runtime.h"
#include 
#include 
#include 


//kernel function
__global__
void convolution_2D_basic(float* filter, float* arr, float* result, int filter_size, int arr_size)
{
	int Col = blockIdx.x*blockDim.x + threadIdx.x;
	int Row = blockIdx.y*blockDim.y + threadIdx.y;
	if (Row < arr_size - filter_size + 1 && Col < arr_size - filter_size + 1)
	{
		float pixVal = 0;
		//start
		int startCol = Col;
		int startRow = Row;	
		//caculate the res
		for (int i = 0; i < filter_size; i++)
		{
			for (int j = 0; j < filter_size; j++)
			{
				int curRow = startRow + i;
				int curCol = startCol + j;
				if (curRow > -1 && curRow-1 && curCol < arr_size)
				{
					pixVal += filter[i*filter_size + j] * arr[curRow*arr_size + curCol];
				}
			}
		}
		result[Row*arr_size + Col] = pixVal;
	}
}


extern "C" void Conv2Kernel(float** arr, float** pFilter, int filter_size, int arr_size, int result_size)
{
	int arr_size_1D = arr_size * arr_size;
	int filter_size_1D = filter_size * filter_size;
	int result_size_1D = result_size * result_size;

	float *arr_1D = (float*)malloc(arr_size_1D * sizeof(float));
	float *result_1D = (float*)malloc(result_size_1D * sizeof(float));
	float *filter_1D = (float*)malloc(filter_size_1D * sizeof(float));

	for (int i = 0; i < arr_size; i++) {
		for (int j = 0; j < arr_size; j++) {
			arr_1D[i*arr_size + j] = arr[i][j] * 1.0;
		}
	}

	for (int i = 0; i < filter_size; i++) {
		for (int j = 0; j < filter_size; j++) {
			filter_1D[i*filter_size + j] = pFilter[i][j] * 1.0;
		}
	}

	float *device_input_arr, *device_output_arr, *device_filter_arr;
	cudaMalloc((void**)&device_input_arr, sizeof(float) * arr_size_1D);
	cudaMalloc((void**)&device_output_arr, sizeof(float) * result_size_1D);
	cudaMalloc((void**)&device_filter_arr, sizeof(float) * filter_size_1D);

	cudaEvent_t start, stop;
	cudaEventCreate(&start);    //创建Event
	cudaEventCreate(&stop);

	cudaMemcpy(device_input_arr, arr_1D, sizeof(float) * arr_size_1D, cudaMemcpyHostToDevice);
	cudaMemcpy(device_output_arr, result_1D, sizeof(float) * result_size_1D, cudaMemcpyHostToDevice);
	cudaMemcpy(device_filter_arr, filter_1D, sizeof(float) * filter_size_1D, cudaMemcpyHostToDevice);

	dim3 ThreadNum = (64, 64);
	dim3 BlockNum  = ((arr_size - 0.5) / ThreadNum.x + 1, (arr_size - 0.5) / ThreadNum.x + 1, 1);

	cudaEventRecord(start, 0);
	convolution_2D_basic << > > (device_input_arr, device_output_arr, device_filter_arr, filter_size, arr_size);
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);

	cudaMemcpy(result_1D, device_output_arr, sizeof(float)*arr_size_1D, cudaMemcpyDeviceToHost);


	float GPU_time;
	cudaEventElapsedTime(&GPU_time, start, stop);

	printf("-------------------GPU version Done!------------------n");
	printf("GPU_Time: %f n", GPU_time);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);

	cudaFree(device_input_arr);
	cudaFree(device_output_arr);
	cudaFree(device_filter_arr);
}

6.如果想读入图片数据,操作如下

  • 为项目配置opencv(网上随便一搜,便能找到)
  • 添加头文件,替换初始化arr部分的代码
    #include   
    #include  
    
    using namespace cv;
    
    
    //读取图像img。0表示转换为灰度图像(单通道)读入    
    Mat img = imread("image.jpg", 0);
    
    int row = img.rows;
    int col = img.cols;
    
    float** arr = new float*[row];
    for (int i = 0; i < row; i++)
    {
    	arr[i] = new float[col];
    }
    
    for (int i = 0; i < row; i++) {
    	for (int j = 0; j < col; j++) {
    		arr[i][j] = img.at(i, j);
    	}
    }
    

转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/587690.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号