CUDA Deep Dive


Posted on 28-Nov-2014


Category: Technology



DESCRIPTION

A look at the more advanced CUDA APIs and their use.

TRANSCRIPT

  • 1. CUDA Deep Dive, Kashif Rasul (@krasul)
  • 2. Hello, my name is Kashif
  • 3. Objective: deeper understanding
  • 4. Prerequisites 1
  • 5. blocks & threads

    #include <cutil_inline.h>

    int main( void )
    {
        int N = 50000;
        size_t size = N * sizeof(float);
        cudaSetDevice( cutGetMaxGflopsDeviceId() );
        ...
        cutilSafeCall( cudaMalloc((void**)&d_A, size) );
        cutilSafeCall( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) );
        ...
        int threadsPerBlock = 256;
        int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
        add<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
        ...
        cutilSafeCall( cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost) );
        cudaFree(d_A);
        ...
        cutilDeviceReset();
    }
  • 6. shared memory

    __global__ void dot( float *a, float *b, float *c )
    {
        __shared__ float cache[threadsPerBlock];
        int cacheIndex = threadIdx.x;
        ...
        // set the cache values
        cache[cacheIndex] = temp;
        // synchronize threads in this block
        __syncthreads();
        ...
    }

    int main( void )
    {
        ...
        dot<<<blocksPerGrid, threadsPerBlock>>>( d_a, d_b, d_c );
        ...
    }
  • 7. keep in mind: thread cooperation & shared memory
    - useful for reduction algorithms (see the sketch below)
    - avoid race conditions by using __syncthreads()
    - avoid bank conflicts
    - every thread in the block needs to call __syncthreads()
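    For concreteness, here is a minimal sketch of the block-wide sum reduction that the
    dot-product slide elides; the kernel name blockSum and the THREADS_PER_BLOCK macro are
    illustrative assumptions, not names from the deck.

    #define THREADS_PER_BLOCK 256   // assumed power-of-two block size

    __global__ void blockSum(const float *in, float *out, int n)
    {
        __shared__ float cache[THREADS_PER_BLOCK];
        int tid = blockIdx.x * blockDim.x + threadIdx.x;

        // each thread loads one element (0 if out of range)
        cache[threadIdx.x] = (tid < n) ? in[tid] : 0.0f;
        __syncthreads();

        // tree reduction: halve the number of active threads each step
        for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
            if (threadIdx.x < stride)
                cache[threadIdx.x] += cache[threadIdx.x + stride];
            __syncthreads();   // every thread in the block must reach this
        }

        // thread 0 writes this block's partial sum
        if (threadIdx.x == 0)
            out[blockIdx.x] = cache[0];
    }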
  • 8. Memory 2
  • 9. constant mem.

    __constant__ float constFloat;

    __device__ float getConstFloat() { return constFloat; }

    __global__ void addConstant(float *vec, int N)
    {
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < N) vec[i] += getConstFloat();
    }

    int main( int argc, char** argv)
    {
        float constValue = 4.0f;
        cutilSafeCall( cudaMemcpyToSymbol(constFloat, &constValue, sizeof(float), 0,
                                          cudaMemcpyHostToDevice) );
        ...
    }
  • 10. keep in mind: constant memory
    - read-only, but conserves memory bandwidth
    - a single read can be broadcast and cached for additional reads
    - painfully slow when each thread reads a different address from constant memory
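    A minimal sketch of the broadcast case described above: a small coefficient table in
    constant memory that every thread reads at the same index. The names coeffs and
    scaleByCoeff are hypothetical, not from the deck.

    __constant__ float coeffs[16];

    __global__ void scaleByCoeff(float *vec, int n, int k)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            vec[i] *= coeffs[k];   // uniform index: one broadcast read, then cached
    }

    // host side: fill the constant table before launching, e.g.
    //   float h_coeffs[16] = { ... };
    //   cudaMemcpyToSymbol(coeffs, h_coeffs, sizeof(h_coeffs));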
  • 11. texture mem.

    // textures containing look-up tables
    texture<uint, 1, cudaReadModeElementType> edgeTex;
    texture<uint, 1, cudaReadModeElementType> edge2dTex;

    int main(int argc, char** argv)
    {
        ...
        cutilSafeCall( cudaMalloc((void**) d_edgeTable, 256*sizeof(uint)) );
        cutilSafeCall( cudaMemcpy((void *)*d_edgeTable, (void *)edgeTable,
                                  256*sizeof(uint), cudaMemcpyHostToDevice) );
        cutilSafeCall( cudaBindTexture(0, edgeTex, *d_edgeTable, 256*sizeof(uint)) );
        // run kernel
        kernel<<< ... >>>(...);
        // cleanup
        cutilSafeCall( cudaUnbindTexture(edgeTex) );
    }

    __global__ void kernel(...)
    {
        ...
        uint edge = tex1Dfetch(edgeTex, index*16 + i);
        ...
    }
  • 12. keep in mind: texture memory
    - read-only, like constant memory
    - great when memory access exhibits spatial locality, i.e. each thread reads a
      location near where the next or previous thread reads
    - comes in 1-D, 2-D and 3-D versions & is typically used in finite difference apps
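    A minimal sketch of the 2-D case, where tex2D() exploits the spatial locality just
    described; the names inTex and copyKernel are assumptions for illustration.

    texture<float, 2, cudaReadModeElementType> inTex;

    __global__ void copyKernel(float *out, int width, int height)
    {
        unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
        unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
        if (x < width && y < height)
            out[y * width + x] = tex2D(inTex, x, y);   // cached read with 2-D locality
    }

    // host side: bind a pitched device allocation to the texture reference, e.g.
    //   cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    //   cudaBindTexture2D(0, inTex, d_in, desc, width, height, pitch);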
  • 13. surface mem.

    surface<void, 2> output_surface;

    __global__ void surfaceWrite(float* g_idata, int width, int height)
    {
        // calculate surface coordinates
        unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
        unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
        // read from global memory and write to cuarray (via surface reference)
        surf2Dwrite(g_idata[y*width+x], output_surface, x*4, y, cudaBoundaryModeTrap);
    }

    int main( int argc, char** argv)
    {
        ...
        cudaChannelFormatDesc channelDesc =
            cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
        cudaArray* cu_array;
        cutilSafeCall( cudaMallocArray(&cu_array, &channelDesc, width, height,
                                       cudaArraySurfaceLoadStore) );
        cutilSafeCall( cudaMemcpy( d_data, h_data, size, cudaMemcpyHostToDevice) );
        cutilSafeCall( cudaBindSurfaceToArray(output_surface, cu_array) );
        surfaceWrite<<< ... >>>(d_data, width, height);
        ...
        cutilSafeCall( cudaFree(d_data) );
        cutilSafeCall( cudaFreeArray(cu_array) );
    }
  • 14. InterOp. 3
  • 15. set device

    // OpenGL Graphics includes
    #include <GL/glew.h>
    #if defined (__APPLE__) || defined(MACOSX)
    #include <GLUT/glut.h>
    #else
    #include <GL/freeglut.h>
    #endif

    int main(int argc, char **argv)
    {
        // Initialize GL
        glutInit(&argc, argv);
        glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB);
        glutInitWindowSize(1000, 1000);
        // Create a window with rendering context and all else we need
        glutCreateWindow("CUDA Interop.");
        // initialize necessary OpenGL extensions
        glewInit();
        // Select CUDA device with OpenGL interoperability
        if (cutCheckCmdLineFlag(argc, (const char**)argv, "device")) {
            cutilGLDeviceInit(argc, argv);
        } else {
            cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
        }
    }
  • 16. register data with CUDA

    // vbo variables
    GLuint vbo;
    struct cudaGraphicsResource *cuda_vbo_resource;
    void *d_vbo_buffer = NULL;

    // create buffer object
    glGenBuffers(1, &vbo);
    glBindBuffer(GL_ARRAY_BUFFER, vbo);

    // initialize buffer object
    unsigned int size = mesh_width * mesh_height * 4 * sizeof(float);
    glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);
    glBindBuffer(GL_ARRAY_BUFFER, 0);

    // register this buffer object with CUDA
    cutilSafeCall( cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo,
                                                cudaGraphicsMapFlagsWriteDiscard) );
  • 17. pass data via shared buffers

    // map OpenGL buffer object for writing from CUDA
    float4 *dptr;
    cutilSafeCall( cudaGraphicsMapResources(1, &cuda_vbo_resource, 0) );
    size_t num_bytes;
    cutilSafeCall( cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes,
                                                        cuda_vbo_resource) );

    // run kernel
    kernel<<< ... >>>(dptr, ...);

    // unmap buffer object
    cutilSafeCall( cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0) );
  • 18. keep in mind: interop.
    - need to tell the CUDA runtime which device we intend to use for both CUDA and OpenGL
    - initialize OpenGL first and then call cudaGLSetGLDevice()
    - DirectX interop. is nearly identical
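    As a sketch of what the kernel launched in slide 17 might do, here is a hypothetical
    kernel (fillVBO and its animated height field are assumptions, not from the deck) that
    writes vertex positions into the mapped VBO pointer so OpenGL can draw them directly.

    __global__ void fillVBO(float4 *pos, unsigned int width, unsigned int height, float time)
    {
        unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
        unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
        if (x >= width || y >= height) return;

        // normalized grid coordinates in [-1, 1]
        float u = 2.0f * x / (float)width  - 1.0f;
        float v = 2.0f * y / (float)height - 1.0f;

        // simple animated height field
        float w = 0.5f * sinf(u * 10.0f + time) * cosf(v * 10.0f + time);

        // write the vertex; OpenGL reads it straight from the shared buffer
        pos[y * width + x] = make_float4(u, w, v, 1.0f);
    }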
  • 19. Pro Tip 4
  • 20. install CMake, glut & glew

    $ git clone https://github.com/kashif/cuda-workshop.git
    Cloning into cuda-workshop...
    ...
    $ cd cuda-workshop
    $ cmake CMakeLists.txt
    -- The C compiler identification is GNU
    ...
    $ make
    Scanning dependencies of target cutil
    [  5%] Building CXX object cutil/CMakeFiles/cutil.dir/src/bank_checker.cpp.o
    ...
    [100%] Built target matrixMul
    $ ./bin/matrixMul
    [ matrixMul ]
    bin/matrixMul Starting (CUDA and CUBLAS tests)...
    Device 0: "GeForce GTX 480" with Compute 2.0 capability
    ...
  • 21.