利用GPU并行计算的总体思路是:在CPU(Host)中创建数据,将数据传到GPU(Device)中进行计算,再将计算结果传回到CPU中。
最简单的例子:将CPU中的两个数字在GPU中进行相加,并在CPU中输出:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cstdlib>

// Device-side helper: returns a + b. Callable only from device code.
__device__ int add_gpu(int a, int b) {
    return a + b;
}

// Kernel: single-thread addition; writes the sum through the device pointer c.
// Launched as <<<1, 1>>> — one block, one thread.
__global__ void add(int a, int b, int *c) {
    *c = add_gpu(a, b);
}

int main() {
    // Host-side inputs and result.
    int a = 3, b = 5;
    int c = 0;
    int *ptr = nullptr;

    // Allocate device memory for the single-int result.
    cudaMalloc((void **)&ptr, sizeof(int));

    // Compute on the GPU (scalar arguments a and b are passed by value).
    add<<<1, 1>>>(a, b, ptr);

    // Copy the result from device to host. cudaMemcpy is blocking, so it
    // also synchronizes with the kernel launch above.
    cudaMemcpy(&c, ptr, sizeof(int), cudaMemcpyDeviceToHost);
    printf("%d + %d = %d\n", a, b, c);

    // Release device memory.
    cudaFree(ptr);
    return 0;
}
// Output: 3 + 5 = 8
上面的例子只有一个数相加,因此不需要多线程,下面我们加大数据量,引入多线程并行计算。
用10个线程对两个长度为10的数组相加:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cstdlib>

const int N = 10;                 // array length
const int Mem = N * sizeof(int);  // array size in bytes

// Kernel: element-wise vector addition, one thread per element.
// Launched as <<<1, N>>>; threadIdx.x selects the element. The bounds
// check guards against launches with more than N threads.
__global__ void add(int *a, int *b, int *c) {
    int tid = threadIdx.x;
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}

int main() {
    // Host arrays.
    int a[N] = { 1,2,3,4,5,6,7,8,9,10 };
    int b[N] = { 1,3,5,7,9,11,13,15,17,19 };
    int c[N];
    int *dev_a, *dev_b, *dev_c;

    // Allocate device memory.
    cudaMalloc((void **)&dev_a, Mem);
    cudaMalloc((void **)&dev_b, Mem);
    cudaMalloc((void **)&dev_c, Mem);

    // Copy the inputs to the device.
    cudaMemcpy(dev_a, a, Mem, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, Mem, cudaMemcpyHostToDevice);

    // Compute in parallel on the GPU: 1 block of 10 threads.
    add<<<1, 10>>>(dev_a, dev_b, dev_c);

    // Copy the result back to the host (blocking — synchronizes the kernel).
    cudaMemcpy(c, dev_c, Mem, cudaMemcpyDeviceToHost);

    // Print the results.
    for (int i = 0; i < N; i++)
        printf("%d + %d = %d\n", a[i], b[i], c[i]);

    // Release device memory.
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return 0;
}
当然也可以用10个线程块,每个线程块分配一个线程的方式来实现,输出结果是一样的,代码如下:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cstdlib>

// BUG FIX: the original defined N as 1024 while the initializer lists hold
// only 10 values and the launch is <<<10, 1>>>, so elements 10..1023 of c
// were printed uninitialized. N must be 10, matching the 10 thread blocks.
const int N = 10;                 // array length
const int Mem = N * sizeof(int);  // array size in bytes

// Kernel: element-wise vector addition, one BLOCK per element.
// Launched as <<<N, 1>>>; blockIdx.x selects the element. The bounds
// check guards against launches with more than N blocks.
__global__ void add(int *a, int *b, int *c) {
    int bid = blockIdx.x;
    if (bid < N)
        c[bid] = a[bid] + b[bid];
}

int main() {
    // Host arrays.
    int a[N] = { 1,2,3,4,5,6,7,8,9,10 };
    int b[N] = { 1,3,5,7,9,11,13,15,17,19 };
    int c[N];
    int *dev_a, *dev_b, *dev_c;

    // Allocate device memory.
    cudaMalloc((void **)&dev_a, Mem);
    cudaMalloc((void **)&dev_b, Mem);
    cudaMalloc((void **)&dev_c, Mem);

    // Copy the inputs to the device.
    cudaMemcpy(dev_a, a, Mem, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, Mem, cudaMemcpyHostToDevice);

    // Compute in parallel on the GPU: 10 blocks of 1 thread each.
    add<<<10, 1>>>(dev_a, dev_b, dev_c);

    // Copy the result back to the host (blocking — synchronizes the kernel).
    cudaMemcpy(c, dev_c, Mem, cudaMemcpyDeviceToHost);

    // Print the results.
    for (int i = 0; i < N; i++)
        printf("%d + %d = %d\n", a[i], b[i], c[i]);

    // Release device memory.
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return 0;
}
实际情况中,数据量很大,我们不可能用一个线程只处理一个数据,而是用一个线程处理多个数据。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cstdlib>

#define N 1024
const int Mem = N * sizeof(int);  // array size in bytes

// Kernel: element-wise vector addition where each block handles multiple
// elements. Block b processes indices b, b + gridDim.x, b + 2*gridDim.x, ...
// (a grid-stride loop over blocks), so correctness does not depend on the
// grid size dividing N.
__global__ void add(int *a, int *b, int *c) {
    int bid = blockIdx.x;
    while (bid < N) {
        c[bid] = a[bid] + b[bid];
        bid += gridDim.x;  // stride = total number of blocks in the grid
    }
}

int main() {
    // Host arrays.
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;

    // Fill the inputs: a[i] = i + 1, b[i] = 2 * i.
    for (int i = 0; i < N; i++) {
        a[i] = i + 1;
        b[i] = 2 * i;
    }

    // Allocate device memory.
    cudaMalloc((void **)&dev_a, Mem);
    cudaMalloc((void **)&dev_b, Mem);
    cudaMalloc((void **)&dev_c, Mem);

    // Copy the inputs to the device.
    cudaMemcpy(dev_a, a, Mem, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, Mem, cudaMemcpyHostToDevice);

    // Compute on the GPU: only 10 blocks, so each block processes
    // N / 10 elements via the stride loop in the kernel.
    add<<<10, 1>>>(dev_a, dev_b, dev_c);

    // Copy the result back to the host (blocking — synchronizes the kernel).
    cudaMemcpy(c, dev_c, Mem, cudaMemcpyDeviceToHost);

    // Print the results.
    for (int i = 0; i < N; i++)
        printf("%d + %d = %d\n", a[i], b[i], c[i]);

    // Release device memory.
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return 0;
}



