Download - CUDA Deep Dive
CUDA Deep Dive
Kashif Rasul@krasul
Hello, my name is Kashif
objective: Deeper understanding
1 Prerequisites
blocks & threads
#include <cutil_inline.h>
int main( void ) { int N = 50000; size_t size = N * sizeof(float);
cudaSetDevice( cutGetMaxGflopsDeviceId() ); ... cutilSafeCall( cudaMalloc((void**)&d_A, size) ); cutilSafeCall( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) ); ...
int threadsPerBlock = 256; int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; add<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N); ... cutilSafeCall( cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost) ); cudaFree(d_A); ... cutilDeviceReset();}
shared memory
__global__ void dot( float *a, float *b, float *c ){ __shared__ float cache[threadsPerBlock]; int cacheIndex = threadIdx.x; ... // set the cache values cache[cacheIndex] = temp; // synchronize threads in this block __syncthreads(); ...}
int main( void ) { ... dot<<<blocksPerGrid,threadsPerBlock>>>( d_a, d_b, d_c ); ...}
• thread coop. & shared mem. useful for reduction algorithms
• avoid race conditions by using __syncthreads()
• avoid bank conflicts
• every thread in the block needs to call __syncthreads()
keep in mind
2 Memory
__constant__ float constFloat;
/**
 * Device-side accessor that returns the current value of the
 * `constFloat` symbol declared in constant memory.
 */
__device__ float getConstFloat()
{
    return constFloat;
}

/**
 * Adds the value returned by getConstFloat() to the first N elements of vec.
 *
 * Each thread handles at most one element, selected by its flat 1-D index
 * (blockDim.x * blockIdx.x + threadIdx.x). Threads whose index is >= N
 * leave memory untouched.
 *
 * @param vec  array to update in place
 * @param N    number of elements of vec to update
 */
__global__ void addConstant(float *vec, int N)
{
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;

    // Threads past the end of the vector have nothing to update.
    if (idx >= N)
        return;

    vec[idx] += getConstFloat();
}
#include <cutil_inline.h>
int main( int argc, char** argv) { float constValue = 4.0f; cutilSafeCall( cudaMemcpyToSymbol(constFloat, &constValue, sizeof(float), 0, cudaMemcpyHostToDevice) ); ...}
constant mem.
• read-only, but conserves mem. bandwidth
• a single read can be broadcasted and cached for additional reads
• painfully slow when each thread reads a different address from constant memory
keep in mind
// textures containing look-up tablestexture<uint> edgeTex;texture<uint, 2> edge2dTex;
int main(int argc, char** argv){ ... cutilSafeCall( cudaMalloc((void**) d_edgeTable, 256*sizeof(uint)) ); cutilSafeCall( cudaMemcpy((void *)*d_edgeTable, (void *)edgeTable, 256*sizeof(uint), cudaMemcpyHostToDevice) ); cutilSafeCall( cudaBindTexture(0, edgeTex, *d_edgeTable, 256*sizeof(uint)) ); // run kernel kernel<<<blocks, threads>>>(...) //cleanup cutilSafeCall( cudaUnbindTexture(edgeTex) );}
__global__ void kernel(...){ ... uint edge = tex1Dfetch(edgeTex, index*16 + i); ...} texture mem.
• read-only, like for const. mem.
• great when memory access exhibits spatial locality, i.e. each thread reads a loc. near where the next or previous thread reads
• comes in 1-D, 2-D and 3-D versions & typically used in finite diff. apps
keep in mind
surface mem.
surface<void, 2> output_surface;
/**
 * Copies a float image from linear global memory into the CUDA array
 * reached through the `output_surface` surface reference.
 *
 * Each thread handles one (x, y) element computed from its 2-D block and
 * thread indices. Threads that fall outside width x height return without
 * touching memory, so the grid may be rounded up past the image size.
 *
 * @param g_idata  source image, row-major, indexed as g_idata[y*width + x]
 * @param width    image width in elements
 * @param height   image height in elements
 */
__global__ void surfaceWrite(float* g_idata, int width, int height)
{
    // calculate surface coordinates
    const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

    // Guard the global-memory read: without it, threads of a partially
    // filled edge block would index g_idata out of range before
    // cudaBoundaryModeTrap could ever reject the surface write.
    if (x >= (unsigned int)width || y >= (unsigned int)height)
        return;

    // read from global memory and write to cuarray (via surface reference);
    // the surface x coordinate is a byte offset, hence the sizeof(float) scale
    surf2Dwrite(g_idata[y*width+x], output_surface, (int)(x*sizeof(float)), y, cudaBoundaryModeTrap);
}
int main( int argc, char** argv) { ... cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); cudaArray* cu_array; cutilSafeCall( cudaMallocArray(&cu_array, &channelDesc, width, height, cudaArraySurfaceLoadStore) ); cutilSafeCall( cudaMemcpy( d_data, h_data, size, cudaMemcpyHostToDevice) ); cutilSafeCall( cudaBindSurfaceToArray(output_surface, cu_array) );
surfaceWrite<<<dimGrid, dimBlock>>>(d_data, width, height); ... cutilSafeCall( cudaFree(d_data) ); cutilSafeCall( cudaFreeArray(cu_array) ); }
3 InterOp.
// OpenGL Graphics includes#include <GL/glew.h>#if defined (__APPLE__) || defined(MACOSX)#include <GLUT/glut.h>#else#include <GL/freeglut.h>#endif
int main(int argc, char **argv) { // Initialize GL glutInit(&argc, argv); glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB); glutInitWindowSize(1000, 1000);
// Create a window with rendering context and all else we need glutCreateWindow("CUDA Interop.");
// initialize necessary OpenGL extensions glewInit();
// Select CUDA device with OpenGL interoperability if (cutCheckCmdLineFlag(argc, (const char**)argv, "device")) { cutilGLDeviceInit(argc, argv); } else { cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ); }} set device
register data with CUDA
// vbo variables
GLuint vbo;                                      // OpenGL buffer object name
struct cudaGraphicsResource *cuda_vbo_resource;  // CUDA handle for the registered buffer
void *d_vbo_buffer = NULL;

// create buffer object
// (vbo is a plain GLuint here, so GL needs its address to store the new name)
glGenBuffers(1, &vbo);
glBindBuffer(GL_ARRAY_BUFFER, vbo);

// initialize buffer object: 4 floats per mesh vertex, no initial data
unsigned int size = mesh_width * mesh_height * 4 * sizeof(float);
glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);

glBindBuffer(GL_ARRAY_BUFFER, 0);

// register this buffer object with CUDA
// (the call writes the resource handle through its first argument, so pass its address)
cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsWriteDiscard));
pass data via shared buffers
// map OpenGL buffer object for writing from CUDA
// (map/unmap take an array of resource handles, hence the address-of)
float4 *dptr;
cutilSafeCall( cudaGraphicsMapResources(1, &cuda_vbo_resource, 0) );

// fetch the device pointer and size of the mapped buffer;
// this call takes the resource handle itself, not a dereferenced struct
size_t num_bytes;
cutilSafeCall( cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes, cuda_vbo_resource) );

// run kernel
kernel<<<blocks,threads>>>(dptr,...);

// unmap buffer object
cutilSafeCall( cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0) );
• need to tell the CUDA runtime the device we intend to use for CUDA and OpenGL
• initialize OpenGL first and then use the cudaGLSetGLDevice() method
• DirectX interop. is nearly identical
keep in mind
4 Pro Tip
install CMake, glut & glew
➜ git clone https://github.com/kashif/cuda-workshop.gitCloning into cuda-workshop......
➜ cd cuda-workshop
➜ cmake CMakeLists.txt-- The C compiler identification is GNU...
➜ makeScanning dependencies of target cutil[ 5%] Building CXX object cutil/CMakeFiles/cutil.dir/src/bank_checker.cpp.o...[100%] Built target matrixMul
➜ ./bin/matrixMul[ matrixMul ]bin/matrixMul Starting (CUDA and CUBLAS tests)...
Device 0: "GeForce GTX 480" with Compute 2.0 capability...
great for experimenting
➜ ls src/matrixMulCMakeLists.txt matrixMul.cu matrixMul.h matrixMul_gold.cpp matrixMul_kernel.cu
➜ cat src/matrixMul/CMakeLists.txtCUDA_ADD_EXECUTABLE( matrixMul matrixMul.cu matrixMul_gold.cpp) TARGET_LINK_LIBRARIES( matrixMul cutil shrutil ${CUDA_CUBLAS_LIBRARIES})
➜ cmake -G "Visual Studio 10 Win64" CMakeLists.txt...
5 Events & Timers
events: GPU timestamp
cudaEvent_t start, stop;
float time;  // elapsed time between the two events, in milliseconds

// initialize events
cutilSafeCall( cudaEventCreate(&start) );
cutilSafeCall( cudaEventCreate(&stop) );

// warmup to avoid timing startup
kernel<<<grid, threads>>>(d_odata, d_idata, size_x, size_y, 1);
// Ensure no launch failure
cutilSafeCall( cudaGetLastError() );

// take measurements for loop over kernel launches
cutilSafeCall( cudaEventRecord(start, 0) );
for (int i=0; i < NUM_REPS; i++) {
    kernel<<<grid, threads>>>(d_odata, d_idata, size_x, size_y, 1);
    // Ensure no launch failure
    cutilSafeCall( cudaGetLastError() );
}
cutilSafeCall( cudaEventRecord(stop, 0) );
// launches are asynchronous: block the host until `stop` has actually been reached
cutilSafeCall( cudaEventSynchronize(stop) );
cutilSafeCall( cudaEventElapsedTime(&time, start, stop) );

// report effective bandwidth in GB/s (2.0f due to read + write);
// the 1000.0f factor converts the per-launch time from milliseconds to seconds
float bandwidth = 2.0f * 1000.0f * mem_size/(1024*1024*1024)/(time/NUM_REPS);

cutilSafeCall( cudaEventDestroy(stop) );
cutilSafeCall( cudaEventDestroy(start) );
os timers
#include <cutil_inline.h>...unsigned int timer_matrixMul = 0;
// start timingcutilCheckError( cutStartTimer(timer_matrixMul) );
// do some workkernel<<<grid, threads, mem_size>>>(d_idata, d_odata);cutilDeviceSynchronize();
// stop timercutilCheckError( cutStopTimer(timer_matrixMul) );
double dSeconds = cutGetTimerValue(timer_matrixMul)/((double)nIter * 1000.0);double dNumOps = 2.0 * (double)uiWA * (double)uiHA * (double)uiWB;double gflops = 1.0e-9 * dNumOps/dSeconds;
// destroy timercutilCheckError( cutDeleteTimer(timer_matrixMul) );
• creating and recording events is tricky since some CUDA calls are asynch.
• all kernel launches are asynch.
• instruct the CPU to synch. on an event via cudaEventSynchronize()
keep in mind
6 Bindings
pycuda
➜ cat hello_gpu.pyimport pycuda.driver as drvimport pycuda.toolsimport pycuda.autoinitimport numpyimport numpy.linalg as lafrom pycuda.compiler import SourceModule
mod = SourceModule("""__global__ void multiply_them(float *dest, float *a, float *b){ const int i = threadIdx.x; dest[i] = a[i] * b[i];}""")
multiply_them = mod.get_function("multiply_them")
a = numpy.random.randn(400).astype(numpy.float32)b = numpy.random.randn(400).astype(numpy.float32)
dest = numpy.zeros_like(a)multiply_them( drv.Out(dest), drv.In(a), drv.In(b), block=(400,1,1))
print dest-a*b
➜ python hello_gpu.py[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
jcuda
// Initialize the driver and create a context for the first device.cuInit(0);CUdevice device = new CUdevice(); cuDeviceGet(device, 0);CUcontext context = new CUcontext(); cuCtxCreate(context, 0, device);
// Create the PTX file by calling the NVCC and load itString ptxFileName = preparePtxFile("JCudaVectorAddKernel.cu");CUmodule module = new CUmodule(); cuModuleLoad(module, ptxFileName);
// Obtain a function pointer to the "add" function.CUfunction function = new CUfunction(); cuModuleGetFunction(function, module, "add");
// Allocate the device input datafloat hostInputA[] = new float[numElements]; CUdeviceptr deviceInputA = new CUdeviceptr();cuMemAlloc(deviceInputA, numElements * Sizeof.FLOAT);cuMemcpyHtoD(deviceInputA, Pointer.to(hostInputA), numElements * Sizeof.FLOAT);...// Set up the kernel parametersPointer kernelParameters = Pointer.to(Pointer.to(deviceInputA),...);
// Call the kernel functionint blockSizeX = 256; int gridSizeX = (int)Math.ceil((double)numElements / blockSizeX);cuLaunchKernel(function, gridSizeX, 1, 1, // Grid dimension blockSizeX, 1, 1, // Block dimension 0, null, // Shared memory size and stream kernelParameters, null); // Kernel- and extra parameterscuCtxSynchronize();
➜ lsLicense.txt jcuda-0.4.0-beta1.jar jcurand-0.4.0-beta1.jar libJCublas-apple-x86_64.dylib libJCudaRuntime-apple-x86_64.dylib libJCurand-apple-x86_64.dylibjcublas-0.4.0-beta1.jar jcufft-0.4.0-beta1.jar jcusparse-0.4.0-beta1.jar libJCudaDriver-apple-x86_64.dylib libJCufft-apple-x86_64.dylib libJCusparse-apple-x86_64.dylibJCudaVectorAdd.java JCudaVectorAddKernel.cu
➜ cat JCudaVectorAddKernel.cuextern "C"__global__ void add(float *a, float *b, float *sum, int n){ int i = blockIdx.x * blockDim.x + threadIdx.x; if (i<n) { sum[i] = a[i] + b[i]; }}
➜ javac -classpath jcuda-0.4.0-beta1.jar JCudaVectorAdd.java
➜ java -classpath jcuda-0.4.0-beta1.jar:. JCudaVectorAddExecutingnvcc -m64 -ptx JCudaVectorAddKernel.cu -o JCudaVectorAddKernel.ptxFinished creating PTX fileTest PASSED
ruby-cuda
➜ gem install sgc-ruby-cudaSuccessfully installed sgc-ruby-cuda-0.1.11 gem installed
➜ cat vector_add.rb...# Prepare and load vadd kernel.kernel_lib_file = compile(vadd_kernel_src)CudaFunction.load_lib_file(kernel_lib_file.path)
# Copy input buffers from host memory to device memory.memcpy_htod(da, ha, nbytes)memcpy_htod(db, hb, nbytes)
# Invoke vadd kernel.nthreads_per_block = 256block_dim = Dim3.new(nthreads_per_block, 1, 1)grid_dim = Dim3.new((N + nthreads_per_block - 1) / nthreads_per_block, 1, 1)CudaFunction.configure(block_dim, grid_dim)CudaFunction.setup(da, db, dc, N)f = CudaFunction.new("vadd")f.launch
# Copy output buffer from device memory to host memory.memcpy_dtoh(hc, dc, nbytes)...
➜ ruby vector_add.rbVector AdditionVerification completed. All matches? YES
7 Libraries
cublas
cublasHandle_t handle;cublasStatus_t status = cublasCreate(&handle);
float* h_A = (float*)malloc(N * N * sizeof(h_A[0]));.../* Fill the matrices with test data */.../* Allocate device memory for the matrices */cudaMalloc((void**)&d_A, N * N * sizeof(d_A[0]));.../* Initialize the device matrices with the host matrices */status = cublasSetVector(N * N, sizeof(h_A[0]), h_A, 1, d_A, 1);.../* Performs Sgemm: C <- alphaAB + betaC */status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N);
/* Allocate host mem & read back the result from device mem */h_C = (float*)malloc(N * N * sizeof(h_C[0]));status = cublasGetVector(N * N, sizeof(h_C[0]), d_C, 1, h_C, 1);
/* Memory clean up */cudaFree(d_A);.../* Shutdown */status = cublasDestroy(handle);
cufft
cudaSetDevice( cutGetMaxGflopsDeviceId() );
// Allocate & init. host memory for the signalComplex* h_signal = (Complex*)malloc(sizeof(Complex) * SIGNAL_SIZE);...// Pad signalComplex* h_padded_signal;...// Allocate device memory for signalComplex* d_signal;cutilSafeCall( cudaMalloc((void**)&d_signal, mem_size) );// Copy host memory to devicecutilSafeCall( cudaMemcpy(d_signal, h_padded_signal, mem_size, cudaMemcpyHostToDevice) );
// CUFFT plancufftHandle plan;cufftSafeCall( cufftPlan1d(&plan, new_size, CUFFT_C2C, 1) );
// Transform signalcufftSafeCall( cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD) );
// Destroy CUFFT contextcufftSafeCall( cufftDestroy(plan) );
// Cleanup memorycutilSafeCall( cudaFree(d_signal) );...cutilDeviceReset();
cusparse
cusparseHandle_t handle = 0;cusparseStatus_t status = cusparseCreate(&handle);
// create a matrix description for the matrix McusparseMatDescr_t descrM = 0; status = cusparseCreateMatDescr(&descrM);cusparseSetMatType ( descrM, CUSPARSE_MATRIX_TYPE_TRIANGULAR );cusparseSetMatIndexBase ( descrM, CUSPARSE_INDEX_BASE_ZERO );cusparseSetMatDiagType ( descrM, CUSPARSE_DIAG_TYPE_NON_UNIT );cusparseSetMatFillMode ( descrM, CUSPARSE_FILL_MODE_LOWER );
// create & perform analysis info for the non-trans & trans casecusparseSolveAnalysisInfo_t info = 0, infoTrans = 0;cusparseCreateSolveAnalysisInfo(&info);cusparseCreateSolveAnalysisInfo(&infoTrans);
cusparseScsrsv_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, descrM, d_valsICP, d_rowPtrsICP, d_colIndsICP, info);cusparseScsrsv_analysis(handle, CUSPARSE_OPERATION_TRANSPOSE, N, descrM, d_valsICP, d_rowPtrsICP, d_colIndsICP, infoTrans);...// Solve M z = H H^T z = r by first doing a forward solve: H y = rcusparseScsrsv_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, 1.0, descrM, d_valsICP, d_rowPtrsICP, d_colIndsICP, info, d_r, d_y);// and then a back substitution: H^T z = ycusparseScsrsv_solve(handle, CUSPARSE_OPERATION_TRANSPOSE, N, 1.0, descrM, d_valsICP, d_rowPtrsICP, d_colIndsICP, infoTrans, d_y, d_z); ...cusparseDestroy(handle);
curand
cudaError_t cudaResult = cudaSuccess;
// Allocate memory for pointsfloat *d_points = 0;cudaResult = cudaMalloc((void **)&d_points, 2 * m_numSims * sizeof(float));
// Generate random points in unit squarecurandStatus_t curandResult;curandGenerator_t qrng;
curandResult = curandCreateGenerator(&qrng, CURAND_RNG_QUASI_SOBOL32);curandResult = curandSetQuasiRandomGeneratorDimensions(qrng, 2);curandResult = curandSetGeneratorOrdering(qrng, CURAND_ORDERING_QUASI_DEFAULT);curandResult = curandGenerateUniform(qrng, (float *)d_points, 2 * m_numSims);
// CleanupcurandResult = curandDestroyGenerator(qrng);cudaFree(d_points);
npp
// declare a host image object for an 8-bit grayscale imagenpp::ImageCPU_8u_C1 oHostSrc;// load gray-scale image from disknpp::loadImage(sFilename, oHostSrc);// declare a device image and copy from the host image to the devicenpp::ImageNPP_8u_C1 oDeviceSrc(oHostSrc);
// create struct with box-filter mask sizeNppiSize oMaskSize = {5, 5};// create struct with ROI size given the current maskNppiSize oSizeROI = {oDeviceSrc.width() - oMaskSize.width + 1, oDeviceSrc.height() - oMaskSize.height + 1};
// allocate device image of appropriately reduced sizenpp::ImageNPP_8u_C1 oDeviceDst(oSizeROI.width, oSizeROI.height);
// set anchor point inside the mask to (0, 0)NppiPoint oAnchor = {0, 0};// run box filternppiFilterBox_8u_C1R(oDeviceSrc.data(), oDeviceSrc.pitch(), oDeviceDst.data(), oDeviceDst.pitch(), oSizeROI, oMaskSize, oAnchor);
// declare a host image for the resultnpp::ImageCPU_8u_C1 oHostDst(oDeviceDst.size());// and copy the device result data into itoDeviceDst.copyTo(oHostDst.data(), oHostDst.pitch());
8 Streams
pinned memory
cudaStream_t stream;

cutilSafeCall( cudaStreamCreate(&stream) );

// allocate page locked memory (required for the async copies below)
cutilSafeCall( cudaMallocHost((void**)&a, nbytes, cudaHostAllocDefault) );

// allocate device memory and queue the host-to-device copy on the stream
cutilSafeCall( cudaMalloc((void**)&d_a, nbytes) );
cutilSafeCall( cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, stream) );

// run kernel and copy result back
cutilSafeCall( cudaEventRecord(start, stream) );
// pass the device pointer itself: &d_a would be the host address of the
// pointer variable, which the kernel cannot dereference
kernel<<<N,M,0,stream>>>(d_a, ... );
cutilSafeCall( cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, stream) );

// everything above was only queued; wait for the stream to drain before
// the buffers are released
cutilSafeCall( cudaStreamSynchronize(stream) );

// free
cudaStreamDestroy(stream);
cudaFreeHost(a);
cudaFree(d_a);
chunked computation
// walk the full data set one N-element chunk at a time; every operation
// is queued on `stream`, so nothing here waits for the GPU
for (int offset = 0; offset < FULL_DATA_SIZE; offset += N) {
    // upload this chunk of both inputs from the locked host buffers, async
    cutilSafeCall( cudaMemcpyAsync(dev_a, &host_a[offset], N * sizeof(int),
                                   cudaMemcpyHostToDevice, stream) );
    cutilSafeCall( cudaMemcpyAsync(dev_b, &host_b[offset], N * sizeof(int),
                                   cudaMemcpyHostToDevice, stream) );

    // process the chunk with N/256 blocks of 256 threads
    kernel<<<N/256,256,0,stream>>>(dev_a, dev_b, dev_c);

    // download the chunk's result into the locked host buffer, async
    cutilSafeCall( cudaMemcpyAsync(&host_c[offset], dev_c, N * sizeof(int),
                                   cudaMemcpyDeviceToHost, stream) );
}

// block the host until every queued copy and kernel has completed
cutilSafeCall( cudaStreamSynchronize(stream) );
batched computation
// one stream per batch entry; size the allocation by the element type
// (cudaStream_t), not by a pointer to it
cudaStream_t *streamArray = 0;
streamArray = (cudaStream_t *)malloc(N * sizeof(streamArray[0]));

...
// create the streams
for ( int i = 0; i < N ; i++) {
    cudaStreamCreate(&streamArray[i]);
    ...
}

...
// upload the input matrices
for ( int i = 0; i < N ; i++) {
    cublasSetMatrix (..., devPtrA[i], ...);
    ...
}

...
// issue each Sgemm on its own stream
for ( int i = 0; i < N ; i++) {
    cublasSetStream(handle, streamArray[i]);
    cublasSgemm(handle, ..., devPtrA[i], devPtrB[i], devPtrC[i], ...);
}
// wait for all streams to finish; cudaDeviceSynchronize() replaces the
// deprecated cudaThreadSynchronize()
cudaDeviceSynchronize();
• use it to specify in which order operations get executed async.
• idea is to use more than 1 stream
• requires a new kind of mem. copy (cudaMemcpyAsync) which in turn requires pinned, i.e. page-locked, mem.
• free pinned mem. when not needed
keep in mind
overlap kernel exec. & memcpy
// Allocate resourcesfor( int i =0; i<STREAM_COUNT; ++i ) { cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault); cudaMalloc(&d_data_in[i], memsize); ...}
int current_stream = 0;// Do processing in a loop...{ int next_stream = (current_stream + 1 ) % STREAM_COUNT; // Ensure that processing and copying of the last cycle has finished cudaEventSynchronize(cycleDone[next_stream]); // Process current frame kernel<<<grid, block, 0, stream[current_stream]>>>(d_data_out[current_stream], d_data_in[current_stream], N, ...); // Upload next frame cudaMemcpyAsync(d_data_in[next_stream], ..., cudaMemcpyHostToDevice, stream[next_stream]);
// Download current frame cudaMemcpyAsync(h_data_out[current_stream], ..., cudaMemcpyDeviceToHost, stream[current_stream]);
cudaEventRecord(cycleDone[current_stream], stream[current_stream]); current_stream = next_stream;}
• devices with CC 1.1 and above can overlap a kernel exec & memcpy as long as they are issued from different streams
• kernels are serialized
• queue in a way that independent streams can execute in parallel
keep in mind
9 Multi-GPU
zero-copy host memory
float *a, *d_a;...
/* Allocate mapped CPU memory. */cutilSafeCall( cudaHostAlloc((void **)&a, bytes, cudaHostAllocMapped) );...
/* Initialize the vectors. */for(n = 0; n < nelem; n++) { a[n] = rand() / (float)RAND_MAX; ... }
/* Get the device pointers for the pinned CPU memory mapped into the GPU memory space. */cutilSafeCall( cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0) );...
/* Call the GPU kernel using the device pointers for the mapped memory. */ ...kernel<<<grid, block>>>(d_a, d_b, d_c, nelem);...
/* Memory clean up */cutilSafeCall( cudaFreeHost(a) );...
streams
//Create streams for issuing GPU command asynchronously and allocate memory
for(int i = 0; i < GPU_N; i++) {
    // Select GPU i first: a stream and a device allocation belong to the
    // device that is current when they are created, and the later loops
    // use stream[i] / d_Data[i] after cudaSetDevice(i)
    cutilSafeCall( cudaSetDevice(i) );
    cutilSafeCall( cudaStreamCreate(&stream[i]) );
    cutilSafeCall( cudaMalloc((void**)&d_Data[i], dataN * sizeof(float)) );
    // pinned host buffer so the copies can be issued with cudaMemcpyAsync
    cutilSafeCall( cudaMallocHost((void**)&h_Data[i], dataN * sizeof(float)) );
    //init h_Data
}
//Copy data to GPU, launch the kernel and copy data back. All asynchronouslyfor(int i = 0; i < GPU_N; i++) { //Set device cutilSafeCall( cudaSetDevice(i) ); // Copy input data from CPU cutilSafeCall( cudaMemcpyAsync(d_Data[i], h_Data[i], dataN * sizeof(float), cudaMemcpyHostToDevice, stream[i]) );
// Perform GPU computations kernel<<<blocks, threads, 0, stream[i]>>>(...) // Copy back the result cutilSafeCall( cudaMemcpyAsync(h_Sum_from_device[i], d_Sum[i], ACCUM_N * sizeof(float), cudaMemcpyDeviceToHost, stream[i]) );}
process the result
// Process GPU resultsfor(i = 0; i < GPU_N; i++) { // Set device cutilSafeCall( cudaSetDevice(i) ); // Wait for all operations to finish cudaStreamSynchronize(stream[i]); // Shut down this GPU cutilSafeCall( cudaFreeHost(h_Data[i]) ); cutilSafeCall( cudaFree(d_Data[i]) ); cutilSafeCall( cudaStreamDestroy(stream[i]) );}
// shutdownfor(int i = 0; i < GPU_N; i++) { cutilSafeCall( cudaSetDevice(i) ); cutilDeviceReset(); }
• can also control each GPU by a separate CPU thread
• need to assign portable pinned memory if a different thread needs access to one thread’s memory
• use the flag cudaHostAllocPortable to cudaHostAlloc()
keep in mind
mpi + cuda
// Initialize MPI stateMPI_CHECK( MPI_Init(&argc, &argv) );
// Get our MPI node number and node countint commSize, commRank;MPI_CHECK( MPI_Comm_size(MPI_COMM_WORLD, &commSize) );MPI_CHECK( MPI_Comm_rank(MPI_COMM_WORLD, &commRank) );
if(commRank == 0) {// Are we the root node? //initialize dataRoot...}
// Allocate a buffer on each nodefloat * dataNode = new float[dataSizePerNode];
// Dispatch a portion of the input data to each nodeMPI_CHECK( MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT, dataNode, dataSizePerNode, MPI_FLOAT, 0, MPI_COMM_WORLD) );
// if commRank == 0 then free dataRoot...
kernel<<<gridSize, blockSize>>>(dataNode, ...);
// Reduction to the root nodefloat sumNode = sum(dataNode, dataSizePerNode);float sumRoot;MPI_CHECK( MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD) );
MPI_CHECK( MPI_Finalize() );
P2P & unified virtual address space
// Enable peer access
cutilSafeCall(cudaSetDevice(gpuid_tesla[0]));
// the second argument is a flags word that must be 0, not a device id
cutilSafeCall(cudaDeviceEnablePeerAccess(gpuid_tesla[1], 0));
...

// Allocate buffers
cudaSetDevice(gpuid_tesla[0]);
cudaMalloc(&g0, buf_size);
cudaSetDevice(gpuid_tesla[1]);
cudaMalloc(&g1, buf_size);

// Ping-pong copy between GPUs
cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault);

// Prepare host buffer and copy to GPU 0
cudaSetDevice(gpuid_tesla[0]);
cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault);

// Run kernel on GPU 1, reading input from the GPU 0 buffer, writing
// output to the GPU 1 buffer: dst[idx] = src[idx] * 2.0f
cudaSetDevice(gpuid_tesla[1]);
kernel<<<blocks, threads>>>(g0, g1);
cutilDeviceSynchronize();

// Disable peer access (also unregisters memory for non-UVA cases)
cudaSetDevice(gpuid_tesla[0]);
cudaDeviceDisablePeerAccess(gpuid_tesla[1]);
cudaSetDevice(gpuid_tesla[1]);
cudaDeviceDisablePeerAccess(gpuid_tesla[0]);

cudaFree(g0);
...
References
download slides (2MB pdf) from http://bit.ly/cuda-deep-dive
Thank you