- 目的
- 内容
- 整体框架
- CPU代码
- GPU代码
- 结论
熟悉基本的 CUDA 程序架构以及如何调用相应的 API 进行 CUDA 编程。
实现 2 个矢量(长度 50000)的相加。输入的矢量 A、B 按照以下要求初始化:矢量 A 的初始值全为本人学号的最后 1 位数字,矢量 B 的初始值全为本人学号的倒数第 2 位数字。同时用 CPU 代码实现同样的计算,比较两种代码的运行时间。完成以下三个版本的 CUDA 核函数。
整体框架:用每个线程来计算矢量加法的一个输出元素。
/*
 * Vector addition (CUDA).
 *
 * Computes C[i] = A[i] + B[i] for numElements elements on the GPU,
 * verifies the result on the host, then re-runs the addition on the
 * CPU and reports both wall-clock times.
 */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

/*
 * One output element per thread.  The grid is rounded up with a
 * ceiling division, so tail threads must be bounds-checked.
 */
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements)
    {
        C[i] = A[i] + B[i];
    }
}

/*
 * Print a CUDA error (if any) together with the line number the caller
 * passed via __LINE__.  (The original subtracted 1 from the line number,
 * mis-reporting the call site, and had lost its "\n" escape.)
 */
void checkErr(cudaError_t err, int num)
{
    if (cudaSuccess != err)
    {
        fprintf(stderr, "Cuda error in file '%s' in line %d : %s.\n",
                __FILE__, num, cudaGetErrorString(err));
    }
}

/*
 * Print basic properties of device devNum and select it as the current
 * CUDA device.
 */
void setCudaDevice(int devNum)
{
    cudaError_t err = cudaSuccess;
    printf("\nCUDA Device #%d\n", devNum);
    cudaDeviceProp devProp;
    cudaGetDeviceProperties(&devProp, devNum);
    printf("Name: %s\n", devProp.name);
    // totalGlobalMem is a size_t; %u truncated it on 64-bit builds.
    printf("Total global memory: %zu\n", (size_t)devProp.totalGlobalMem);
    printf("Major revision number: %d\n", devProp.major);
    printf("Minor revision number: %d\n", devProp.minor);
    err = cudaSetDevice(devNum);
    checkErr(err, __LINE__);
}

int main(void)
{
    // Error code to check return values for CUDA calls.
    cudaError_t err = cudaSuccess;

    // Wall-clock timers; the GPU time deliberately includes the
    // host<->device copies, not just the kernel.
    float cpu_time, gpu_time;
    clock_t cpu_start, cpu_stop, gpu_start, gpu_stop;

    // NOTE(review): device #1 assumes a second GPU is installed;
    // use 0 on single-GPU machines.
    int devNum = 1;
    setCudaDevice(devNum);

    // Vector length and byte size.
    int numElements = 50000;
    size_t size = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    // Allocate the host vectors.
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the host input vectors with random values in [0, 1].
    // (The assignment text asks for student-ID digits instead; replace
    // the right-hand sides with those constants if required.)
    for (int i = 0; i < numElements; ++i)
    {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }

    // Allocate the device vectors.
    float *d_A = NULL;
    err = cudaMalloc((void **)&d_A, size);
    checkErr(err, __LINE__);
    float *d_B = NULL;
    err = cudaMalloc((void **)&d_B, size);
    checkErr(err, __LINE__);
    float *d_C = NULL;
    err = cudaMalloc((void **)&d_C, size);
    checkErr(err, __LINE__);

    gpu_start = clock();

    // Copy the host input vectors to the device.
    err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    checkErr(err, __LINE__);
    err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
    checkErr(err, __LINE__);

    // Launch the vector-add kernel with a ceil-div grid.  (The original
    // source had lost the launch configuration: "vectorAdd<< >>(...)".)
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",
                cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the result back; this blocking cudaMemcpy also waits for the
    // kernel, so gpu_stop below covers the full GPU pipeline.
    err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
    checkErr(err, __LINE__);
    gpu_stop = clock();

    // Verify the GPU result against the host inputs.
    for (int i = 0; i < numElements; ++i)
    {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
        {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }
    printf("Test PASSED\n");

    // CPU reference: same addition on the host, timed for comparison.
    cpu_start = clock();
    for (int i = 0; i < numElements; ++i)
    {
        h_C[i] = h_A[i] + h_B[i];
    }
    cpu_stop = clock();

    cpu_time = (float)(cpu_stop - cpu_start) / CLOCKS_PER_SEC;
    gpu_time = (float)(gpu_stop - gpu_start) / CLOCKS_PER_SEC;
    printf("The computation time of CPU method is :%f\n", cpu_time);
    printf("The computation time of GPU method is :%f\n", gpu_time);

    // Free device global memory.
    err = cudaFree(d_A);
    checkErr(err, __LINE__);
    err = cudaFree(d_B);
    checkErr(err, __LINE__);
    err = cudaFree(d_C);
    checkErr(err, __LINE__);

    // Free host memory.
    free(h_A);
    free(h_B);
    free(h_C);

    // Reset the device and exit.
    err = cudaDeviceReset();
    checkErr(err, __LINE__);
    printf("Done\n");
    return 0;
}
-
用每个线程来计算向量加法的两个(相邻)输出元素。同时,线程的索引变量 i 应该是该线程要处理的第一个元素的位置索引。
-
每个线程来计算向量加法的两个输出元素。每个线程块处理 2*blockDim.x 个连续元素:块中的所有线程先处理前 blockDim.x 个连续元素(每个线程一个元素),然后整体移动到下一部分,再处理后 blockDim.x 个连续元素。线程的索引变量 i 应该是该线程要处理的第一个元素的索引。
/*
 * Host driver fragment from the report.  Relies on N and the add kernel
 * defined elsewhere in the document.
 *
 * Fixes vs. the original fragment: the function was truncated (missing
 * closing brace), never printed the measured time, and never returned.
 */
int main()
{
    // NOTE(review): three N-element int arrays on the stack is ~600 KB
    // for N = 50000 — consider malloc if the stack limit is small.
    int a[N], b[N], c[N];
    a[0] = 0;
    b[0] = 2;

    int *dev_a, *dev_b, *dev_c;
    cudaMalloc((void **)&dev_a, sizeof(int) * N);
    cudaMalloc((void **)&dev_b, sizeof(int) * N);
    cudaMalloc((void **)&dev_c, sizeof(int) * N);

    clock_t start, end;
    start = clock();

    // Host initialization plus a CPU reference addition.  (c[0] is never
    // computed here; it is overwritten by the device copy below anyway.)
    for (int i = 1; i < N; i++)
    {
        a[i] = i;
        b[i] = i * i;
        c[i] = a[i] + b[i];
    }

    cudaMemcpy(dev_a, a, sizeof(int) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, sizeof(int) * N, cudaMemcpyHostToDevice);
    add<<<256, 256>>>(dev_a, dev_b, dev_c);
    // The blocking device-to-host copy also synchronizes with the kernel,
    // so the timing below includes kernel execution.
    cudaMemcpy(c, dev_c, sizeof(int) * N, cudaMemcpyDeviceToHost);

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    end = clock();
    float time2 = (float)(end - start) / CLOCKS_PER_SEC;
    printf("执行时间为:%f\n", time2);
    return 0;
}
GPU代码
/*
 * Second GPU program from the report: three kernel versions of
 * c = a + b over N int elements, plus a host driver.
 *
 * Fixes vs. the original: garbled #include lines repaired (prose text
 * had fused into them), the kernel's missing closing brace restored,
 * and the single kernel that mixed all three strategies at once —
 * racing on c[] and writing past c[N-1] via c[j+1] and
 * c[i+blockDim.x] — split into the three bounds-checked versions the
 * report describes.  add keeps its original signature.
 */
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <time.h>

#define N 50000

// Version 1: one output element per thread.
__global__ void add(int *a, int *b, int *c)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < N)
    {
        c[i] = a[i] + b[i];
    }
}

// Version 2: two adjacent output elements per thread; i is the index
// of the first element this thread handles.
__global__ void add2(int *a, int *b, int *c)
{
    int i = 2 * (threadIdx.x + blockIdx.x * blockDim.x);
    if (i < N)
        c[i] = a[i] + b[i];
    if (i + 1 < N)
        c[i + 1] = a[i + 1] + b[i + 1];
}

// Version 3: each block handles 2 * blockDim.x contiguous elements in
// two passes (all threads do one element, then all shift by blockDim.x);
// i is the first element this thread handles.
__global__ void add3(int *a, int *b, int *c)
{
    int i = 2 * blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        c[i] = a[i] + b[i];
    if (i + blockDim.x < N)
        c[i + blockDim.x] = a[i + blockDim.x] + b[i + blockDim.x];
}

int main()
{
    // NOTE(review): ~600 KB of stack arrays for N = 50000 — consider
    // malloc if the stack limit is small.
    int a[N], b[N], c[N];
    a[0] = 0;
    b[0] = 2;

    int *dev_a, *dev_b, *dev_c;
    cudaMalloc((void **)&dev_a, sizeof(int) * N);
    cudaMalloc((void **)&dev_b, sizeof(int) * N);
    cudaMalloc((void **)&dev_c, sizeof(int) * N);

    clock_t start, end;
    start = clock();

    // Host initialization plus a CPU reference addition.  (c[0] is never
    // computed here; it is overwritten by the device copy below anyway.)
    for (int i = 1; i < N; i++)
    {
        a[i] = i;
        b[i] = i * i;
        c[i] = a[i] + b[i];
    }

    cudaMemcpy(dev_a, a, sizeof(int) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, sizeof(int) * N, cudaMemcpyHostToDevice);
    // 256 x 256 = 65536 threads >= N, so version 1 covers every element.
    // For the two-elements-per-thread versions halve the grid, e.g.
    // add2<<<128, 256>>>(dev_a, dev_b, dev_c).
    add<<<256, 256>>>(dev_a, dev_b, dev_c);
    // The blocking device-to-host copy also synchronizes with the kernel.
    cudaMemcpy(c, dev_c, sizeof(int) * N, cudaMemcpyDeviceToHost);

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    end = clock();
    float time2 = (float)(end - start) / CLOCKS_PER_SEC;
    printf("执行时间为:%f\n", time2);
    return 0;
}
在计算量较小的情况下,CPU 的计算时间小于 GPU:本实验中 GPU 计时包含了主机与设备之间的数据拷贝,对于仅 50000 个元素的加法,传输开销占据主导,掩盖了 GPU 的并行计算优势。



