3.1 线程与存储
tid=blockIdx.x(当前块的ID)*blockDim.x(当前块里面的线程数量)+threadIdx.x(当前线程在块中的ID)gridDim.x*blockDim.x来计算,前者代表了本次启动的块的数量,而后者代表了每个块里面的线程数量,然后每次while循环,tid变量加上这个值,向后偏移以得到下个任务的索引所有线程都有一个寄存器堆,它是最快的。共享内存只能被块中的线程访问,但比全局内存块。全局内存是最慢的,但可以被所有的块访问。常量和纹理内存用于特殊用途所有通过cudaMalloc分配的存储器都是全局内存本地内存和寄存器堆对每个线程都是唯一的。寄存器是每个线程可用的最快存储器。当内核中使用的变量在寄存器堆中装不下的时候,将会使用本地内存存储它们,这叫寄存器溢出“读取旧值-累加-回写新值”操作是不可被其他线程扰乱的原子性的整体完成的。使用atomicAdd进行原子累加的内核函数使用原子操作后程序具有更大的执行代价。可以通过使用共享内存来加速这些原子累加操作
GPU卡从逻辑上对用户提供了64KB的常量内存空间,可以用来存储内核执行期间所需要的恒定数据常量内存有助于节省全局内存的访问带宽warp整体进行一次常量内存的读取,结果广播给warp里的32个线程。同时,常量内存具有cache缓冲。当后续的在邻近位置上访问,将不会发生额外的从显存过来的传输。每个warp里的32个线程,进行一致性的相同常量内存位置读取的时候,这种广播效果和cache命中效果可以节省执行时间当程序进行具有很大程度上的空间邻近性的访存的时候,纹理变得非常高效。空间邻近性的意思是,每个线程的读取位置都和其他线程的读取位置邻近。请一定要确保纹理引用被定义成全局静态变量,同时还要确保它不能作为参数传递给任何其他函数。原子操作-求和
#include#define NUM_THREADS 10000 #define SIZE 10 #define BLOCK_WIDTH 100 __global__ void gpu_increment_without_atomic(int *d_a) { // Calculate thread id for current thread int tid = blockIdx.x * blockDim.x + threadIdx.x; // each thread increments elements wrapping at SIZE variable tid = tid % SIZE; d_a[tid] += 1; } __global__ void gpu_increment_atomic(int *d_a) { // Calculate thread id for current thread int tid = blockIdx.x * blockDim.x + threadIdx.x; // each thread increments elements wrapping at SIZE variable tid = tid % SIZE; atomicAdd(&d_a[tid], 1); } int main(int argc, char **argv) { printf("%d total threads in %d blocks writing into %d array elementsn", NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, SIZE); // declare and allocate host memory int h_a[SIZE]; const int ARRAY_BYTES = SIZE * sizeof(int); // declare and allocate GPU memory int *d_a, *d_aA; cudaMalloc((void **)&d_a, ARRAY_BYTES); //Initialize GPU memory to zero cudaMemset((void *)d_a, 0, ARRAY_BYTES); gpu_increment_without_atomic << > >(d_a); // copy back the array to host memory cudaMemcpy(h_a, d_a, ARRAY_BYTES, cudaMemcpyDeviceToHost); printf("Number of times a particular Array index has been incremented without atomic add is: n"); for (int i = 0; i < SIZE; i++) { printf("index: %d --> %d timesn ", i, h_a[i]); } cudaFree(d_a); cudaMalloc((void **)&d_aA, ARRAY_BYTES); //Initialize GPU memory to zero cudaMemset((void *)d_aA, 0, ARRAY_BYTES); gpu_increment_atomic << > >(d_aA); // copy back the array to host memory cudaMemcpy(h_a, d_aA, ARRAY_BYTES, cudaMemcpyDeviceToHost); printf("Number of times a particular Array index has been incremented is: n"); for (int i = 0; i < SIZE; i++) { printf("index: %d --> %d timesn ", i, h_a[i]); } cudaFree(d_aA); return 0; }
常量内存与纹理内存
#include "stdio.h" #include#include #include //Defining two constants __constant__ int constant_f; __constant__ int constant_g; #define N 5 //Kernel function for using constant memory __global__ void gpu_constant_memory(float *d_in, float *d_out) { //Thread index for current kernel int tid = threadIdx.x; d_out[tid] = constant_f*d_in[tid] + constant_g; } #define NUM_THREADS 5 texture textureRef; __global__ void gpu_texture_memory(int n, float *d_out) { int idx = blockIdx.x*blockDim.x + threadIdx.x; if (idx < n) { float temp = tex1D(textureRef, float(idx)); d_out[idx] = temp; } } int main(void) { //Defining Arrays for host float h_in[N], h_out[N]; //Defining Pointers for device float *d_in, *d_out; // 常量内存 int h_f = 2; int h_g = 20; // allocate the memory on the cpu cudaMalloc((void**)&d_in, N * sizeof(float)); cudaMalloc((void**)&d_out, N * sizeof(float)); //Initializing Array for (int i = 0; i < N; i++) { h_in[i] = i; } //Copy Array from host to device cudaMemcpy(d_in, h_in, N * sizeof(float), cudaMemcpyHostToDevice); //Copy constants to constant memory cudaMemcpyToSymbol(constant_f, &h_f, sizeof(int),0,cudaMemcpyHostToDevice); cudaMemcpyToSymbol(constant_g, &h_g, sizeof(int)); //Calling kernel with one block and N threads per block gpu_constant_memory << <1, N >> >(d_in, d_out); //Coping result back to host from device memory cudaMemcpy(h_out, d_out, N * sizeof(float), cudaMemcpyDeviceToHost); //Printing result on console printf("Use of Constant memory on GPU n"); for (int i = 0; i < N; i++) { printf("The expression for input %f is %fn", h_in[i], h_out[i]); } //Free up memory cudaFree(d_in); cudaFree(d_out); // 纹理内存 //Calculate number of blocks to launch int num_blocks = N / NUM_THREADS + ((N % NUM_THREADS) ? 1 : 0); //Declare device pointer float *d_outM; // allocate space on the device for the result cudaMalloc((void**)&d_outM, sizeof(float) * N); // allocate space on the host for the results float *h_outM = (float*)malloc(sizeof(float)*N); //Declare and initialize host array float h_inM[N]; for (int i = 0; i < N; i++) { h_inM[i] = float(i); } //Define CUDA Array cudaArray *cu_Array; cudaMallocArray(&cu_Array, &textureRef.channelDesc, N, 1); //Copy data to CUDA Array cudaMemcpyToArray(cu_Array, 0, 0, h_inM, sizeof(float)*N, cudaMemcpyHostToDevice); // bind a texture to the CUDA array cudaBindTextureToArray(textureRef, cu_Array); //Call Kernel gpu_texture_memory << > >(N, d_outM); // copy result back to host cudaMemcpy(h_outM, d_outM, sizeof(float)*N, cudaMemcpyDeviceToHost); printf("Use of Texture memory on GPU: n"); for (int i = 0; i < N; i++) { printf("Texture element at %d is : %fn",i, h_outM[i]); } free(h_outM); cudaFree(d_outM); cudaFreeArray(cu_Array); cudaUnbindTexture(textureRef); return 0; }
全局、局部、共享内存
#include#define N 5 __global__ void gpu_global_memory(float *d_a) { // "array" is a pointer into global memory on the device d_a[threadIdx.x] = threadIdx.x; } __global__ void gpu_local_memory(float d_in) { int t_local; t_local = d_in * threadIdx.x; printf("Value of Local variable in current thread is: %d n", t_local); } __global__ void gpu_shared_memory(float *d_a) { // Defining local variables which are private to each thread int i, index = threadIdx.x; float average, sum = 0.0f; //Define shared memory __shared__ float sh_arr[5]; sh_arr[index] = d_a[index]; __syncthreads(); // This ensures all the writes to shared memory have completed for (i = 0; i<= index; i++) { sum += sh_arr[i]; } average = sum / (index + 1.0f); d_a[index] = average; sh_arr[index] = average; } int main(int argc, char **argv) { // Define Host Array float h_a[N]; //Define device pointer float *d_a; // 全局内存 cudaMalloc((void **)&d_a, sizeof(float) *N); // now copy data from host memory to device memory cudaMemcpy((void *)d_a, (void *)h_a, sizeof(float) *N, cudaMemcpyHostToDevice); // launch the kernel gpu_global_memory << <1, N >> >(d_a); // copy the modified array back to the host memory cudaMemcpy((void *)h_a, (void *)d_a, sizeof(float) *N, cudaMemcpyDeviceToHost); printf("Array in Global Memory is: n"); //Printing result on console for (int i = 0; i < N; i++) { printf("At Index: %d --> %f n", i, h_a[i]); } // 本地内存 printf("Use of Local Memory on GPU:n"); gpu_local_memory << <1, N >> >(5); cudaDeviceSynchronize(); // 共享内存 for (int i = 0; i < 5; i++) { h_a[i] = i; } // allocate global memory on the device cudaMalloc((void **)&d_a, sizeof(float) * 5); // now copy data from host memory to device memory cudaMemcpy((void *)d_a, (void *)h_a, sizeof(float) * 5, cudaMemcpyHostToDevice); gpu_shared_memory << <1, 5 >> >(d_a); // copy the modified array back to the host memory cudaMemcpy((void *)h_a, (void *)d_a, sizeof(float) * 5, cudaMemcpyDeviceToHost); printf("Use of Shared Memory on GPU: n"); //Printing result on console for (int i = 0; i < 5; i++) { printf("The running average after %d element is %f n", i, h_a[i]); } return 0; }
多线程
#include "stdio.h" #include#include #include //Defining number of elements in Array #define N 50000 //Defining Kernel function for vector addition __global__ void gpuAdd(int *d_a, int *d_b, int *d_c) { //Getting block index of current kernel int tid = threadIdx.x + blockIdx.x * blockDim.x; while (tid < N) { d_c[tid] = d_a[tid] + d_b[tid]; tid += blockDim.x * gridDim.x; } } int main(void) { //Defining host arrays int h_a[N], h_b[N], h_c[N]; //Defining device pointers int *d_a, *d_b, *d_c; // allocate the memory cudaMalloc((void**)&d_a, N * sizeof(int)); cudaMalloc((void**)&d_b, N * sizeof(int)); cudaMalloc((void**)&d_c, N * sizeof(int)); //Initializing Arrays for (int i = 0; i < N; i++) { h_a[i] = 2 * i*i; h_b[i] = i; } // Copy input arrays from host to device memory cudaMemcpy(d_a, h_a, N * sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(d_b, h_b, N * sizeof(int), cudaMemcpyHostToDevice); //Calling kernels with N blocks and one thread per block, passing device pointers as parameters gpuAdd << <512, 512 >> >(d_a, d_b, d_c); //Copy result back to host memory from device memory cudaMemcpy(h_c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); int Correct = 1; printf("Vector addition on GPU n"); //Printing result on console for (int i = 0; i < N; i++) { if ((h_a[i] + h_b[i] != h_c[i])) { Correct = 0; } } if (Correct == 1) { printf("GPU has computed Sum Correctlyn"); } else { printf("There is an Error in GPU Computationn"); } //Free up memory cudaFree(d_a); cudaFree(d_b); cudaFree(d_c); return 0; }
3.2 向量点乘与矩阵乘法例子
向量点乘矩阵乘法
#include "stdio.h" #include#include #include #include #define TILE_SIZE 2 #define N 1024 #define threadsPerBlock 512 __global__ void gpu_dot(float *d_a, float *d_b, float *d_c) { //Declare shared memory __shared__ float partial_sum[threadsPerBlock]; int tid = threadIdx.x + blockIdx.x * blockDim.x; //Calculate index for shared memory int index = threadIdx.x; //Calculate Partial Sum float sum = 0; while (tid < N) { sum += d_a[tid] * d_b[tid]; tid += blockDim.x * gridDim.x; // 前者代表了本次启动的块的数量,而后者代表了每个块里面的线程数量 } // Store partial sum in shared memory partial_sum[index] = sum; // synchronize threads __syncthreads(); // Calculating partial sum for whole block in reduce operation int i = blockDim.x / 2; while (i != 0) { if (index < i) partial_sum[index] += partial_sum[index + i]; __syncthreads(); i /= 2; } //Store block partial sum in global memory if (index == 0) d_c[blockIdx.x] = partial_sum[0]; } //Matrix multiplication using non shared kernel __global__ void gpu_Matrix_Mul_nonshared(float *d_a, float *d_b, float *d_c, const int size) { int row, col; col = TILE_SIZE * blockIdx.x + threadIdx.x; row = TILE_SIZE * blockIdx.y + threadIdx.y; for (int k = 0; k< size; k++) { d_c[row*size + col] += d_a[row * size + k] * d_b[k * size + col]; } } // Matrix multiplication using shared kernel __global__ void gpu_Matrix_Mul_shared(float *d_a, float *d_b, float *d_c, const int size) { int row, col; //Defining Shared Memory __shared__ float shared_a[TILE_SIZE][TILE_SIZE]; __shared__ float shared_b[TILE_SIZE][TILE_SIZE]; col = TILE_SIZE * blockIdx.x + threadIdx.x; row = TILE_SIZE * blockIdx.y + threadIdx.y; for (int i = 0; i< size / TILE_SIZE; i++) { shared_a[threadIdx.y][threadIdx.x] = d_a[row* size + (i*TILE_SIZE + threadIdx.x)]; shared_b[threadIdx.y][threadIdx.x] = d_b[(i*TILE_SIZE + threadIdx.y) * size + col]; __syncthreads(); for (int j = 0; j > >(d_a, d_b, d_partial_sum); // copy the array back to host memory cudaMemcpy(partial_sum, d_partial_sum, blocksPerGrid * sizeof(float), cudaMemcpyDeviceToHost); // Calculate final dot product on host h_c = 0; for (int i = 0; i > > (d_a, d_b, d_result, size); gpu_Matrix_Mul_shared << > > (d_a, d_b, d_result, size); cudaMemcpy(h_result, d_result, size*size * sizeof(int), cudaMemcpyDeviceToHost); printf("The result of Matrix multiplication is: n"); for (int i = 0; i< size; i++) { for (int j = 0; j < size; j++) { printf("%f ", h_result[i][j]); } printf("n"); } cudaFree(d_a); cudaFree(d_b); cudaFree(d_result); return 0; } int main(){ main_matrix(); }



