Sample: cppOverload
Minimum spec: SM 3.0
This sample demonstrates how to use C++ function overloading on the GPU.
Key concepts:
C++ Function Overloading
CUDA Streams and Events
cppOverload_kernel.cuh
// Overload 1: pOut[i] = pIn[i] * a + i.
// Expects a 1-D launch with blockDim.x == THREAD_N and a grid that covers the
// input exactly (there is no tail guard, so N must be a multiple of THREAD_N).
__global__ void simple_kernel(const int *pIn, int *pOut, int a)
{
    // Staging through shared memory is for demonstration only: each thread
    // reads back just its own slot, so no data is actually exchanged.
    __shared__ int sData[THREAD_N];
    int tid = threadIdx.x + blockDim.x*blockIdx.x;

    sData[threadIdx.x] = pIn[tid];
    __syncthreads();

    pOut[tid] = sData[threadIdx.x]*a + tid;  // fixed: removed stray empty statement (';;')
}
// Overload 2: pOut[i] = (pIn[i].x + pIn[i].y) * a + i.
// Takes int2 input (vectorized 8-byte loads). Expects a 1-D launch with
// blockDim.x == THREAD_N and no partial tail block.
__global__ void simple_kernel(const int2 *pIn, int *pOut, int a)
{
    // Demonstrative shared-memory staging; each thread touches only its own slot.
    __shared__ int2 sData[THREAD_N];
    int tid = threadIdx.x + blockDim.x*blockIdx.x;

    sData[threadIdx.x] = pIn[tid];
    __syncthreads();

    pOut[tid] = (sData[threadIdx.x].x + sData[threadIdx.x].y)*a + tid;  // fixed: removed stray ';;'
}
// Overload 3: pOut[i] = (pIn1[i] + pIn2[i]) * a + i.
// Assumes a 1-D launch with blockDim.x == THREAD_N and a grid that exactly
// covers the input (no bounds guard for a partial tail block).
__global__ void simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a)
{
    __shared__ int bufA[THREAD_N];
    __shared__ int bufB[THREAD_N];

    const int lane = threadIdx.x;
    const int gid  = blockIdx.x * blockDim.x + lane;

    // Stage both inputs through shared memory before combining them.
    bufA[lane] = pIn1[gid];
    bufB[lane] = pIn2[gid];
    __syncthreads();

    pOut[gid] = (bufA[lane] + bufB[lane]) * a + gid;
}
cppOverload.cu
#define THREAD_N 256
#define N 1024
#define DIV_UP(a, b) (((a) + (b) - 1) / (b))

// Includes, system
#include <stdio.h>
#include <helper_cuda.h>
#include <helper_string.h>
#include <helper_math.h>

#include "cppOverload_kernel.cuh"

const char *sampleName = "C++ Function Overloading";

// Dump the cudaFuncAttributes of a kernel (shared/constant/local memory
// footprint, register count, PTX/binary versions). Restored the '\n'
// escapes and backslash line continuations lost in the mangled original.
#define OUTPUT_ATTR(attr)                                               \
    printf("Shared Size:   %d\n", (int)attr.sharedSizeBytes);           \
    printf("Constant Size: %d\n", (int)attr.constSizeBytes);            \
    printf("Local Size:    %d\n", (int)attr.localSizeBytes);            \
    printf("Max Threads Per Block: %d\n", attr.maxThreadsPerBlock);     \
    printf("Number of Registers: %d\n", attr.numRegs);                  \
    printf("PTX Version: %d\n", attr.ptxVersion);                       \
    printf("Binary Version: %d\n", attr.binaryVersion);

// CPU reference for overload 1: pOut[i] == pIn[i]*a + i.
bool check_func1(int *hInput, int *hOutput, int a)
{
    for (int i = 0; i < N; ++i)
    {
        int cpuRes = hInput[i]*a + i;

        if (hOutput[i] != cpuRes)
        {
            return false;
        }
    }

    return true;
}

// CPU reference for overload 2: pOut[i] == (pIn[i].x + pIn[i].y)*a + i.
bool check_func2(int2 *hInput, int *hOutput, int a)
{
    for (int i = 0; i < N; i++)
    {
        int cpuRes = (hInput[i].x + hInput[i].y)*a + i;

        if (hOutput[i] != cpuRes)
        {
            return false;
        }
    }

    return true;
}

// CPU reference for overload 3: pOut[i] == (pIn1[i] + pIn2[i])*a + i.
bool check_func3(int *hInput1, int *hInput2, int *hOutput, int a)
{
    for (int i = 0; i < N; i++)
    {
        if (hOutput[i] != (hInput1[i] + hInput2[i])*a + i)
        {
            return false;
        }
    }

    return true;
}

// Exercises all three simple_kernel overloads through typed host function
// pointers, printing each kernel's attributes and validating results against
// a CPU reference. Exits EXIT_SUCCESS only if every overload passes.
int main(int argc, const char *argv[])
{
    int *hInput  = NULL;
    int *hOutput = NULL;
    int *dInput  = NULL;
    int *dOutput = NULL;

    printf("%s starting...\n", sampleName);

    int deviceCount;
    checkCudaErrors(cudaGetDeviceCount(&deviceCount));
    printf("Device Count: %d\n", deviceCount);

    int deviceID = findCudaDevice(argc, argv);
    cudaDeviceProp prop;
    checkCudaErrors(cudaGetDeviceProperties(&prop, deviceID));

    if (prop.major < 2)
    {
        printf("ERROR: cppOverload requires GPU devices with compute SM 2.0 or higher.\n");
        printf("Current GPU device has compute SM%d.%d, Exiting...", prop.major, prop.minor);
        exit(EXIT_WAIVED);
    }

    checkCudaErrors(cudaSetDevice(deviceID));

    // Allocate device memory (2*N ints of input so overload 3 gets two halves)
    checkCudaErrors(cudaMalloc(&dInput , sizeof(int)*N*2));
    checkCudaErrors(cudaMalloc(&dOutput, sizeof(int)*N));

    // Allocate pinned host memory for fast transfers
    checkCudaErrors(cudaMallocHost(&hInput , sizeof(int)*N*2));
    checkCudaErrors(cudaMallocHost(&hOutput, sizeof(int)*N));

    for (int i = 0; i < N*2; i++)
    {
        hInput[i] = i;
    }

    // Copy data from host to device
    checkCudaErrors(cudaMemcpy(dInput, hInput, sizeof(int)*N*2, cudaMemcpyHostToDevice));

    // Test C++ overloading: assigning simple_kernel to each typed pointer
    // selects a different overload at compile time.
    bool testResult = true;
    bool funcResult = true;
    int a = 1;

    void (*func1)(const int *, int *, int);
    void (*func2)(const int2 *, int *, int);
    void (*func3)(const int *, const int *, int *, int);
    struct cudaFuncAttributes attr;

    // overload function 1
    func1 = simple_kernel;
    memset(&attr, 0, sizeof(attr));
    checkCudaErrors(cudaFuncSetCacheConfig(*func1, cudaFuncCachePreferShared));
    checkCudaErrors(cudaFuncGetAttributes(&attr, *func1));
    OUTPUT_ATTR(attr);
    // Restored launch configuration lost in the mangled original ('<<>>')
    (*func1)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dOutput, a);
    checkCudaErrors(cudaMemcpy(hOutput, dOutput, sizeof(int)*N, cudaMemcpyDeviceToHost));
    funcResult = check_func1(hInput, hOutput, a);
    printf("simple_kernel(const int *pIn, int *pOut, int a) %s\n\n",
           funcResult ? "PASSED" : "FAILED");
    testResult &= funcResult;

    // overload function 2
    func2 = simple_kernel;
    memset(&attr, 0, sizeof(attr));
    checkCudaErrors(cudaFuncSetCacheConfig(*func2, cudaFuncCachePreferShared));
    checkCudaErrors(cudaFuncGetAttributes(&attr, *func2));
    OUTPUT_ATTR(attr);
    // N int2 elements == N/2 output elements per int2, so halve the grid
    (*func2)<<<DIV_UP(N, THREAD_N), THREAD_N>>>((int2 *)dInput, dOutput, a);
    checkCudaErrors(cudaMemcpy(hOutput, dOutput, sizeof(int)*N, cudaMemcpyDeviceToHost));
    funcResult = check_func2(reinterpret_cast<int2 *>(hInput), hOutput, a);
    printf("simple_kernel(const int2 *pIn, int *pOut, int a) %s\n\n",
           funcResult ? "PASSED" : "FAILED");
    testResult &= funcResult;

    // overload function 3
    func3 = simple_kernel;
    memset(&attr, 0, sizeof(attr));
    checkCudaErrors(cudaFuncSetCacheConfig(*func3, cudaFuncCachePreferShared));
    checkCudaErrors(cudaFuncGetAttributes(&attr, *func3));
    OUTPUT_ATTR(attr);
    (*func3)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dInput+N, dOutput, a);
    checkCudaErrors(cudaMemcpy(hOutput, dOutput, sizeof(int)*N, cudaMemcpyDeviceToHost));
    funcResult = check_func3(&hInput[0], &hInput[N], hOutput, a);
    printf("simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) %s\n\n",
           funcResult ? "PASSED" : "FAILED");
    testResult &= funcResult;

    checkCudaErrors(cudaFree(dInput));
    checkCudaErrors(cudaFree(dOutput));
    checkCudaErrors(cudaFreeHost(hOutput));
    checkCudaErrors(cudaFreeHost(hInput));

    checkCudaErrors(cudaDeviceSynchronize());

    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}



