【高性能计算】CUDA编程之线程存储与原子操作（教程与代码-2）

3 线程、同步、存储器

3.1 线程与存储

tid=blockIdx.x（当前块的ID）*blockDim.x（当前块里面的线程数量）+threadIdx.x（当前线程在块中的ID）gridDim.x*blockDim.x来计算，前者代表了本次启动的块的数量，而后者代表了每个块里面的线程数量，然后每次while循环，tid变量加上这个值，向后偏移以得到下个任务的索引所有线程都有一个寄存器堆，它是最快的。共享内存只能被块中的线程访问，但比全局内存块。全局内存是最慢的，但可以被所有的块访问。常量和纹理内存用于特殊用途所有通过cudaMalloc分配的存储器都是全局内存本地内存和寄存器堆对每个线程都是唯一的。寄存器是每个线程可用的最快存储器。当内核中使用的变量在寄存器堆中装不下的时候，将会使用本地内存存储它们，这叫寄存器溢出“读取旧值-累加-回写新值”操作是不可被其他线程扰乱的原子性的整体完成的。使用atomicAdd进行原子累加的内核函数使用原子操作后程序具有更大的执行代价。可以通过使用共享内存来加速这些原子累加操作
GPU卡从逻辑上对用户提供了64KB的常量内存空间，可以用来存储内核执行期间所需要的恒定数据常量内存有助于节省全局内存的访问带宽warp整体进行一次常量内存的读取，结果广播给warp里的32个线程。同时，常量内存具有cache缓冲。当后续的在邻近位置上访问，将不会发生额外的从显存过来的传输。每个warp里的32个线程，进行一致性的相同常量内存位置读取的时候，这种广播效果和cache命中效果可以节省执行时间当程序进行具有很大程度上的空间邻近性的访存的时候，纹理变得非常高效。空间邻近性的意思是，每个线程的读取位置都和其他线程的读取位置邻近。请一定要确保纹理引用被定义成全局静态变量，同时还要确保它不能作为参数传递给任何其他函数。原子操作-求和

#include 
#define NUM_THREADS 10000
#define SIZE  10
#define BLOCK_WIDTH 100
__global__ void gpu_increment_without_atomic(int *d_a)
{
    // Calculate thread id for current thread
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    // each thread increments elements wrapping at SIZE variable
    tid = tid % SIZE;
    d_a[tid] += 1;
}
__global__ void gpu_increment_atomic(int *d_a)
{
    // Calculate thread id for current thread
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    // each thread increments elements wrapping at SIZE variable
    tid = tid % SIZE;
    atomicAdd(&d_a[tid], 1);
}
int main(int argc, char **argv)
{
    printf("%d total threads in %d blocks writing into %d array elementsn",
        NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, SIZE);
    // declare and allocate host memory
    int h_a[SIZE];
    const int ARRAY_BYTES = SIZE * sizeof(int);
    // declare and allocate GPU memory
    int *d_a, *d_aA;
    cudaMalloc((void **)&d_a, ARRAY_BYTES);
    //Initialize GPU memory to zero
    cudaMemset((void *)d_a, 0, ARRAY_BYTES);
    gpu_increment_without_atomic << > >(d_a);
    // copy back the array to host memory
    cudaMemcpy(h_a, d_a, ARRAY_BYTES, cudaMemcpyDeviceToHost);
    printf("Number of times a particular Array index has been incremented without atomic add is: n");
    for (int i = 0; i < SIZE; i++)
    {
        printf("index: %d --> %d timesn ", i, h_a[i]);
    }
    cudaFree(d_a);
    
    cudaMalloc((void **)&d_aA, ARRAY_BYTES);
    //Initialize GPU memory to zero
    cudaMemset((void *)d_aA, 0, ARRAY_BYTES);
    gpu_increment_atomic << > >(d_aA);
    // copy back the array to host memory
    cudaMemcpy(h_a, d_aA, ARRAY_BYTES, cudaMemcpyDeviceToHost);
    printf("Number of times a particular Array index has been incremented is: n");
    for (int i = 0; i < SIZE; i++) 
    { 
        printf("index: %d --> %d timesn ", i, h_a[i]); 
    }
    cudaFree(d_aA);
    return 0;
}

常量内存与纹理内存

#include "stdio.h"
#include
#include 
#include 
//Defining two constants
__constant__ int constant_f;
__constant__ int constant_g;
#define N   5
//Kernel function for using constant memory
__global__ void gpu_constant_memory(float *d_in, float *d_out) {
    //Thread index for current kernel
    int tid = threadIdx.x;  
    d_out[tid] = constant_f*d_in[tid] + constant_g;
}
#define NUM_THREADS 5
texture  textureRef;
__global__ void gpu_texture_memory(int n, float *d_out)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx < n) {
        float temp = tex1D(textureRef, float(idx));
        d_out[idx] = temp;
    }
}
int main(void) {
    //Defining Arrays for host
    float h_in[N], h_out[N];
    //Defining Pointers for device
    float *d_in, *d_out;
    // 常量内存
    int h_f = 2;
    int h_g = 20;
    // allocate the memory on the cpu
    cudaMalloc((void**)&d_in, N * sizeof(float));
    cudaMalloc((void**)&d_out, N * sizeof(float));
    //Initializing Array
    for (int i = 0; i < N; i++) {
        h_in[i] = i;
    }
    //Copy Array from host to device
    cudaMemcpy(d_in, h_in, N * sizeof(float), cudaMemcpyHostToDevice);
    //Copy constants to constant memory
    cudaMemcpyToSymbol(constant_f, &h_f, sizeof(int),0,cudaMemcpyHostToDevice);
    cudaMemcpyToSymbol(constant_g, &h_g, sizeof(int));
    //Calling kernel with one block and N threads per block
    gpu_constant_memory << <1, N >> >(d_in, d_out);
    //Coping result back to host from device memory
    cudaMemcpy(h_out, d_out, N * sizeof(float), cudaMemcpyDeviceToHost);
    //Printing result on console
    printf("Use of Constant memory on GPU n");
    for (int i = 0; i < N; i++) {
        printf("The expression for input %f is %fn", h_in[i], h_out[i]);
    }
    //Free up memory
    cudaFree(d_in);
    cudaFree(d_out);
    // 纹理内存
    //Calculate number of blocks to launch
    int num_blocks = N / NUM_THREADS + ((N % NUM_THREADS) ? 1 : 0);
    //Declare device pointer
    float *d_outM;
    // allocate space on the device for the result
    cudaMalloc((void**)&d_outM, sizeof(float) * N);
    // allocate space on the host for the results
    float *h_outM = (float*)malloc(sizeof(float)*N);
    //Declare and initialize host array
    float h_inM[N];
    for (int i = 0; i < N; i++) {
        h_inM[i] = float(i);
    }
    //Define CUDA Array
    cudaArray *cu_Array;
    cudaMallocArray(&cu_Array, &textureRef.channelDesc, N, 1);
    //Copy data to CUDA Array
    cudaMemcpyToArray(cu_Array, 0, 0, h_inM, sizeof(float)*N, cudaMemcpyHostToDevice);
    // bind a texture to the CUDA array
    cudaBindTextureToArray(textureRef, cu_Array);
    //Call Kernel   
    gpu_texture_memory << > >(N, d_outM);
    
    // copy result back to host
    cudaMemcpy(h_outM, d_outM, sizeof(float)*N, cudaMemcpyDeviceToHost);
    printf("Use of Texture memory on GPU: n");
    for (int i = 0; i < N; i++) {
        printf("Texture element at %d is : %fn",i, h_outM[i]);
    }
    free(h_outM);
    cudaFree(d_outM);
    cudaFreeArray(cu_Array);
    cudaUnbindTexture(textureRef);
    return 0;
}

全局、局部、共享内存

#include 
#define N 5
__global__ void gpu_global_memory(float *d_a)
{
    // "array" is a pointer into global memory on the device
    d_a[threadIdx.x] = threadIdx.x;
}
__global__ void gpu_local_memory(float d_in)
{
    int t_local;    
    t_local = d_in * threadIdx.x;     
    printf("Value of Local variable in current thread is: %d n", t_local);
}
__global__ void gpu_shared_memory(float *d_a)
{
    // Defining local variables which are private to each thread
    int i, index = threadIdx.x;
    float average, sum = 0.0f;
    //Define shared memory
    __shared__ float sh_arr[5];
    sh_arr[index] = d_a[index];
    __syncthreads();    // This ensures all the writes to shared memory have completed
    for (i = 0; i<= index; i++) 
    { 
        sum += sh_arr[i]; 
    }
    average = sum / (index + 1.0f);
    d_a[index] = average; 
    sh_arr[index] = average;
}
int main(int argc, char **argv)
{
    // Define Host Array
    float h_a[N];
    //Define device pointer 
    float *d_a;       
    // 全局内存
    cudaMalloc((void **)&d_a, sizeof(float) *N);
    // now copy data from host memory to device memory 
    cudaMemcpy((void *)d_a, (void *)h_a, sizeof(float) *N, cudaMemcpyHostToDevice);
    // launch the kernel 
    gpu_global_memory << <1, N >> >(d_a);  
    // copy the modified array back to the host memory
    cudaMemcpy((void *)h_a, (void *)d_a, sizeof(float) *N, cudaMemcpyDeviceToHost);
    printf("Array in Global Memory is: n");
    //Printing result on console
    for (int i = 0; i < N; i++) {
        printf("At Index: %d --> %f n", i, h_a[i]);
    }
    // 本地内存
    printf("Use of Local Memory on GPU:n");
    gpu_local_memory << <1, N >> >(5);  
    cudaDeviceSynchronize();
      
    // 共享内存 
    for (int i = 0; i < 5; i++) {
        h_a[i] = i;
    }
    // allocate global memory on the device
    cudaMalloc((void **)&d_a, sizeof(float) * 5);
    // now copy data from host memory  to device memory 
    cudaMemcpy((void *)d_a, (void *)h_a, sizeof(float) * 5, cudaMemcpyHostToDevice);
    
    gpu_shared_memory << <1, 5 >> >(d_a);
    // copy the modified array back to the host memory
    cudaMemcpy((void *)h_a, (void *)d_a, sizeof(float) * 5, cudaMemcpyDeviceToHost);
    printf("Use of Shared Memory on GPU:  n");
    //Printing result on console
    for (int i = 0; i < 5; i++) {
        printf("The running average after %d element is %f n", i, h_a[i]);
    }
    
    return 0;
}

多线程

#include "stdio.h"
#include
#include 
#include 
//Defining number of elements in Array
#define N   50000
//Defining Kernel function for vector addition
__global__ void gpuAdd(int *d_a, int *d_b, int *d_c) {
    //Getting block index of current kernel
    int tid = threadIdx.x + blockIdx.x * blockDim.x;    
    while (tid < N)
    {
        d_c[tid] = d_a[tid] + d_b[tid];
        tid += blockDim.x * gridDim.x;
    }
        
}
int main(void) {
    //Defining host arrays
    int h_a[N], h_b[N], h_c[N];
    //Defining device pointers
    int *d_a, *d_b, *d_c;
    // allocate the memory
    cudaMalloc((void**)&d_a, N * sizeof(int));
    cudaMalloc((void**)&d_b, N * sizeof(int));
    cudaMalloc((void**)&d_c, N * sizeof(int));
    //Initializing Arrays
    for (int i = 0; i < N; i++) {
        h_a[i] = 2 * i*i;
        h_b[i] = i;
    }
    // Copy input arrays from host to device memory
    cudaMemcpy(d_a, h_a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, N * sizeof(int), cudaMemcpyHostToDevice);
    //Calling kernels with N blocks and one thread per block, passing device pointers as parameters
    gpuAdd << <512, 512 >> >(d_a, d_b, d_c);
    //Copy result back to host memory from device memory
    cudaMemcpy(h_c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    int Correct = 1;
    printf("Vector addition on GPU n");
    //Printing result on console
    for (int i = 0; i < N; i++) {
        if ((h_a[i] + h_b[i] != h_c[i]))
        {
            Correct = 0;
        }
        
    }
    if (Correct == 1)
    {
        printf("GPU has computed Sum Correctlyn");
    }
    else
    {
        printf("There is an Error in GPU Computationn");
    }
    //Free up memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}

3.2 向量点乘与矩阵乘法例子

向量点乘矩阵乘法

#include "stdio.h"
#include 
#include 
#include 
#include 
#define TILE_SIZE 2
#define N 1024
#define threadsPerBlock 512
__global__ void gpu_dot(float *d_a, float *d_b, float *d_c) {
    //Declare shared memory
    __shared__ float partial_sum[threadsPerBlock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    //Calculate index for shared memory 
    int index = threadIdx.x;
    //Calculate Partial Sum
    float sum = 0;
    while (tid < N) 
    {
        sum += d_a[tid] * d_b[tid];
        tid += blockDim.x * gridDim.x; // 前者代表了本次启动的块的数量，而后者代表了每个块里面的线程数量
    }
    // Store partial sum in shared memory
    partial_sum[index] = sum;
    // synchronize threads 
    __syncthreads();
    // Calculating partial sum for whole block in reduce operation  
    int i = blockDim.x / 2;
    while (i != 0) {
        if (index < i)
            partial_sum[index] += partial_sum[index + i];
        __syncthreads();
        i /= 2;
    }
    //Store block partial sum in global memory
    if (index == 0)
        d_c[blockIdx.x] = partial_sum[0];
}
//Matrix multiplication using non shared kernel
__global__ void gpu_Matrix_Mul_nonshared(float *d_a, float *d_b, float *d_c, const int size)
{
    int row, col;
    col = TILE_SIZE * blockIdx.x + threadIdx.x;
    row = TILE_SIZE * blockIdx.y + threadIdx.y;
    for (int k = 0; k< size; k++)
    {
        d_c[row*size + col] += d_a[row * size + k] * d_b[k * size + col];
    }
}
// Matrix multiplication using shared kernel
__global__ void gpu_Matrix_Mul_shared(float *d_a, float *d_b, float *d_c, const int size)
{
    int row, col;
    //Defining Shared Memory
    __shared__ float shared_a[TILE_SIZE][TILE_SIZE];
    __shared__ float shared_b[TILE_SIZE][TILE_SIZE];
    col = TILE_SIZE * blockIdx.x + threadIdx.x;
    row = TILE_SIZE * blockIdx.y + threadIdx.y;
    for (int i = 0; i< size / TILE_SIZE; i++) 
    {
        shared_a[threadIdx.y][threadIdx.x] = d_a[row* size + (i*TILE_SIZE + threadIdx.x)];
        shared_b[threadIdx.y][threadIdx.x] = d_b[(i*TILE_SIZE + threadIdx.y) * size + col];
        __syncthreads(); 
        for (int j = 0; j> >(d_a, d_b, d_partial_sum);
    // copy the array back to host memory
    cudaMemcpy(partial_sum, d_partial_sum, blocksPerGrid * sizeof(float), cudaMemcpyDeviceToHost);
    // Calculate final dot product on host
    h_c = 0;
    for (int i = 0; i> > (d_a, d_b, d_result, size);
    gpu_Matrix_Mul_shared << > > (d_a, d_b, d_result, size);
    cudaMemcpy(h_result, d_result, size*size * sizeof(int), cudaMemcpyDeviceToHost);
    printf("The result of Matrix multiplication is: n");
    
    for (int i = 0; i< size; i++)
    {
        for (int j = 0; j < size; j++)
        {
            printf("%f   ", h_result[i][j]);
        }
        printf("n");
    }
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_result);
    return 0;
}
int main(){
    main_matrix();
}

【高性能计算】CUDA编程之线程存储与原子操作（教程与代码-2）

C/C++/C#相关栏目本月热门文章