Sample: cppIntegration
Minimum spec: SM 3.0
This example demonstrates how to integrate CUDA into an existing C++ application: the host-side CUDA entry point is a single function called from the C++ code, and only the file containing that function is compiled with nvcc. It also demonstrates that CUDA vector types can be used from .cpp files.
cppIntegration_gold.cpp
// Required header to support CUDA vector types (int2)
#include <vector_types.h>

// export C interface
extern "C" void computeGold(char *reference, char *idata, const unsigned int len);
extern "C" void computeGold2(int2 *reference, int2 *idata, const unsigned int len);

////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set: every byte is decremented by 10 — the CPU
//! counterpart of the device kernel's per-byte arithmetic.
//! @param reference  reference data, computed here but preallocated by caller
//! @param idata      input data as provided to the device
//! @param len        number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
void computeGold(char *reference, char *idata, const unsigned int len) {
  for (unsigned int i = 0; i < len; ++i) {
    reference[i] = idata[i] - 10;
  }
}

////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set for the int2 version:
//! reference.x = x - y, reference.y passed through unchanged — the CPU
//! counterpart of kernel2.
//! @param reference  reference data, computed here but preallocated by caller
//! @param idata      input data as provided to the device
//! @param len        number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
void computeGold2(int2 *reference, int2 *idata, const unsigned int len) {
  for (unsigned int i = 0; i < len; ++i) {
    reference[i].x = idata[i].x - idata[i].y;
    reference[i].y = idata[i].y;
  }
}
cppIntegration.cu
////////////////////////////////////////////////////////////////////////////////
//
// Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
//
// Please refer to the NVIDIA end user license agreement (EULA) associated
// with this source code for terms and conditions that govern your use of
// this software. Any use, reproduction, disclosure, or distribution of
// this software and related documentation outside the terms of the EULA
// is strictly prohibited.
//
////////////////////////////////////////////////////////////////////////////////

// System includes
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>

#ifndef MAX
// Arguments fully parenthesized so expressions like MAX(a + 1, b) expand safely.
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif

// declaration, forward
extern "C" void computeGold(char *reference, char *idata, const unsigned int len);
extern "C" void computeGold2(int2 *reference, int2 *idata, const unsigned int len);

////////////////////////////////////////////////////////////////////////////////
//! Simple test kernel for device functionality.
//! Launched with one block of len/4 threads; each thread processes four
//! consecutive bytes packed into one int, subtracting 10 from every byte.
//! @param g_data  memory to process (in and out)
////////////////////////////////////////////////////////////////////////////////
__global__ void kernel(int *g_data) {
  // write data to global memory
  const unsigned int tid = threadIdx.x;
  int data = g_data[tid];

  // Use integer arithmetic to process all four bytes with one thread.
  // This serializes the execution, but is the simplest solution to avoid
  // bank conflicts for this very low number of threads.
  // In general it is more efficient to process each byte by a separate thread;
  // to avoid bank conflicts the access pattern should then be
  // g_data[4 * wtid + wid], where wtid is the thread id within the half warp
  // and wid is the warp id. See the programming guide for a more in-depth
  // discussion.
  // NOTE(review): (data << k) >> 24 relies on arithmetic right shift of a
  // signed int to sign-extend each byte; this is implementation-defined in
  // C++ but holds on CUDA-supported compilers.
  g_data[tid] =
      ((((data << 0) >> 24) - 10) << 24) | ((((data << 8) >> 24) - 10) << 16) |
      ((((data << 16) >> 24) - 10) << 8) | ((((data << 24) >> 24) - 10) << 0);
}

////////////////////////////////////////////////////////////////////////////////
//! Demonstration that int2 data can be used in the cpp code.
//! Launched with one block of len threads; each thread computes x - y.
//! @param g_data  memory to process (in and out)
////////////////////////////////////////////////////////////////////////////////
__global__ void kernel2(int2 *g_data) {
  // write data to global memory
  const unsigned int tid = threadIdx.x;
  int2 data = g_data[tid];

  // same serialization rationale as kernel() above
  g_data[tid].x = data.x - data.y;
}

////////////////////////////////////////////////////////////////////////////////
//! Entry point for CUDA functionality on host side.
//! @param argc       command line argument count
//! @param argv       command line arguments
//! @param data       data to process on the device (len bytes, len % 4 == 0)
//! @param data_int2  int2 data to process on the device (len elements)
//! @param len        number of elements in data (and in data_int2)
//! @return true if both device results match the CPU reference
////////////////////////////////////////////////////////////////////////////////
extern "C" bool runTest(const int argc, const char **argv, char *data,
                        int2 *data_int2, unsigned int len) {
  // use command-line specified CUDA device, otherwise use device with
  // highest Gflops/s
  findCudaDevice(argc, (const char **)argv);

  // each thread of kernel() handles 4 bytes, so len must be a multiple of 4
  const unsigned int num_threads = len / 4;
  assert(0 == (len % 4));
  const unsigned int mem_size = sizeof(char) * len;
  const unsigned int mem_size_int2 = sizeof(int2) * len;

  // allocate device memory
  char *d_data;
  checkCudaErrors(cudaMalloc((void **)&d_data, mem_size));
  // copy host memory to device
  checkCudaErrors(cudaMemcpy(d_data, data, mem_size, cudaMemcpyHostToDevice));

  // allocate device memory for int2 version
  int2 *d_data_int2;
  checkCudaErrors(cudaMalloc((void **)&d_data_int2, mem_size_int2));
  // copy host memory to device
  checkCudaErrors(cudaMemcpy(d_data_int2, data_int2, mem_size_int2,
                             cudaMemcpyHostToDevice));

  // setup execution parameters
  dim3 grid(1, 1, 1);
  dim3 threads(num_threads, 1, 1);
  dim3 threads2(len, 1, 1);  // more threads needed for the separate int2 version

  // execute the kernels
  kernel<<<grid, threads>>>((int *)d_data);
  kernel2<<<grid, threads2>>>(d_data_int2);

  // check if kernel execution generated an error
  getLastCudaError("Kernel execution failed");

  // compute reference solutions on the CPU
  char *reference = (char *)malloc(mem_size);
  computeGold(reference, data, len);
  int2 *reference2 = (int2 *)malloc(mem_size_int2);
  computeGold2(reference2, data_int2, len);

  // copy results from device to host (cudaMemcpy blocks until the kernels
  // have finished, so no explicit synchronization is needed)
  checkCudaErrors(cudaMemcpy(data, d_data, mem_size, cudaMemcpyDeviceToHost));
  checkCudaErrors(cudaMemcpy(data_int2, d_data_int2, mem_size_int2,
                             cudaMemcpyDeviceToHost));

  // check result
  bool success = true;

  for (unsigned int i = 0; i < len; i++) {
    if (reference[i] != data[i] || reference2[i].x != data_int2[i].x ||
        reference2[i].y != data_int2[i].y) {
      success = false;
    }
  }

  // cleanup memory
  checkCudaErrors(cudaFree(d_data));
  checkCudaErrors(cudaFree(d_data_int2));
  free(reference);
  free(reference2);

  return success;
}
main.cpp
// includes, system #include#include // Required to include CUDA vector types #include #include #include // declaration, forward extern "C" bool runTest(const int argc, const char **argv, char *data, int2 *data_int2, unsigned int len); // Program main int main(int argc, char **argv) { // input data int len = 16; // the data has some zero padding at the end so that the size is a multiple of // four, this simplifies the processing as each thread can process four // elements (which is necessary to avoid bank conflicts) but no branching is // necessary to avoid out of bounds reads char str[] = { 82, 111, 118, 118, 121, 42, 97, 121, 124, 118, 110, 56, 10, 10, 10, 10 }; // Use int2 showing that CUDA vector types can be used in cpp code int2 i2[16]; for (int i = 0; i < len; i++) { i2[i].x = str[i]; i2[i].y = 10; } bool bTestResult; // run the device part of the program bTestResult = runTest(argc, (const char **)argv, str, i2, len); std::cout << str << std::endl; char str_device[16]; for (int i = 0; i < len; i++) { str_device[i] = (char)(i2[i].x); } std::cout << str_device << std::endl; exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); }



