TRANSCRIPT
GMAC: Global Memory for Accelerators
Isaac Gelado, John E. Stone, Javier Cabezas, Nacho Navarro and Wen-mei W. Hwu
GTC 2010
GMAC in a nutshell
• GMAC: Unified Virtual Address Space for CUDA
– Simplifies the CPU code
– Exploits advanced CUDA features for free
• Vector addition example
– Really simple kernel code
– But, what about the CPU code?
__global__ void vector(float *c, float *a, float *b, size_t size)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if(idx < size) c[idx] = a[idx] + b[idx];
}
9/23/10 GTC 2010
CPU CUDA code (I)
• Read from disk, transfer to GPU and compute

int main(int argc, char *argv[]) {
    float *h_a, *h_b, *h_c, *d_a, *d_b, *d_c;
    size_t size = LENGTH * sizeof(float);

    assert((h_a = malloc(size)) != NULL);
    assert((h_b = malloc(size)) != NULL);
    assert((h_c = malloc(size)) != NULL);

    assert(cudaMalloc((void **)&d_a, size) == cudaSuccess);
    assert(cudaMalloc((void **)&d_b, size) == cudaSuccess);
    assert(cudaMalloc((void **)&d_c, size) == cudaSuccess);

    read_file(argv[A], h_a);
    read_file(argv[B], h_b);

    assert(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice) == cudaSuccess);
    assert(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice) == cudaSuccess);
CPU CUDA code (and II)
• Read from disk, transfer to GPU and compute

    dim3 Db(BLOCK_SIZE);
    dim3 Dg(LENGTH / BLOCK_SIZE);
    if(LENGTH % BLOCK_SIZE) Dg.x++;
    vector<<<Dg, Db>>>(d_c, d_a, d_b, LENGTH);
    assert(cudaThreadSynchronize() == cudaSuccess);

    assert(cudaMemcpy(h_c, d_c, LENGTH * sizeof(float),
                      cudaMemcpyDeviceToHost) == cudaSuccess);

    save_file(argv[C], h_c);

    free(h_a); cudaFree(d_a);
    free(h_b); cudaFree(d_b);
    free(h_c); cudaFree(d_c);

    return 0;
}
CPU GMAC code
int main(int argc, char *argv[]) {
    float *a, *b, *c;
    size_t size = LENGTH * sizeof(float);

    assert(gmacMalloc((void **)&a, size) == gmacSuccess);
    assert(gmacMalloc((void **)&b, size) == gmacSuccess);
    assert(gmacMalloc((void **)&c, size) == gmacSuccess);

    read_file(argv[A], a);
    read_file(argv[B], b);

    dim3 Db(BLOCK_SIZE);
    dim3 Dg(LENGTH / BLOCK_SIZE);
    if(LENGTH % BLOCK_SIZE) Dg.x++;
    vector<<<Dg, Db>>>(c, a, b, LENGTH);
    assert(gmacThreadSynchronize() == gmacSuccess);

    save_file(argv[C], c);

    gmacFree(a); gmacFree(b); gmacFree(c);

    return 0;
}
There is no memory copy, neither for the input nor for the output.
Getting GMAC
• GMAC is at http://adsm.googlecode.com/
• Debian / Ubuntu binary and development .deb files
• UNIX (also Mac OS X) source code package
• Experimental versions from the Mercurial repository
Outline
• Introduction
• GMAC Memory Model
– Asymmetric Memory
– Global Memory
• GMAC Execution Model
– Multi-threading
– Inter-thread communication
• Conclusions
GMAC Memory Model
• Unified CPU / GPU virtual address space
• Asymmetric address space accessibility

[Figure: CPU memory holding CPU Data plus a Shared Data region that the GPU can also access]
GMAC Consistency Model
• Implicit acquire / release primitives at accelerator call / return boundaries

[Figure: CPU and accelerator timelines, with ownership of shared data passing at kernel call and return]
GMAC Memory API
• Allocate shared memory
gmacError_t gmacMalloc(void **ptr, size_t size)
– ptr: allocated memory address (returned by reference)
– size: size of the data to be allocated
– Returns an error code, gmacSuccess if no error
• Example usage

#include <gmac.h>

int main(int argc, char *argv[]) {
    float *foo = NULL;
    gmacError_t error;
    if((error = gmacMalloc((void **)&foo, FOO_SIZE)) != gmacSuccess)
        FATAL("Error allocating memory %s", gmacErrorString(error));
    . . .
}
GMAC Memory API
• Release shared memory
gmacError_t gmacFree(void *ptr)
– ptr: memory address to be released
– Returns an error code, gmacSuccess if no error
• Example usage

#include <gmac.h>

int main(int argc, char *argv[]) {
    float *foo = NULL;
    gmacError_t error;
    if((error = gmacMalloc((void **)&foo, FOO_SIZE)) != gmacSuccess)
        FATAL("Error allocating memory %s", gmacErrorString(error));
    . . .
    gmacFree(foo);
}
GMAC Unified Address Space
• Use fixed-size segments to map accelerator memory
• Implement and export Accelerator Virtual Memory

[Figure: the same virtual address (0x00100000) maps a segment in both system memory and accelerator memory]
GMAC Memory API
• Translate shared memory addresses (multi-GPU)
void *gmacPtr(void *ptr)
template<typename T> T *gmacPtr(T *ptr)
– Receives a CPU memory address
– Returns the corresponding GPU memory address
• Example usage

#include <gmac.h>

int main(int argc, char *argv[]) {
    . . .
    kernel<<<Dg, Db>>>(gmacPtr(buffer), size);
    . . .
}
GMAC Example Code (I)

int fdtd(FILE *fpMat, FILE *fpMed, int N) {
    /* Read and create data structures */
    MaterialList materials;
    if(readMaterials(fpMat, materials) == 0) return -1;
    Media media;
    if(readMedia(fpMed, media) == 0) return -1;
    Field field;
    if(createField(media.dim, field) == 0) return -1;

    for(int n = 0; n < N; n++) {
        . . .
        updateElectric<<<Dg, Db>>>(materials, media, field);
        . . .
        n++;
        updateMagnetic<<<Dg, Db>>>(materials, media, field);
        . . .
    }
}
GMAC Example Code (II)

typedef struct { float ke[3][3], km[3][3]; } Material;
typedef struct { size_t n; Material *data; } MaterialList;

/* Read materials from disk */
size_t readMaterials(FILE *fp, MaterialList &list) {
    uint16_t n = 0;
    fread(&n, sizeof(n), 1, fp);
    gmacError_t ret = gmacMalloc((void **)&list.data, n * sizeof(Material));
    if(ret != gmacSuccess) return 0;
    fread(list.data, sizeof(Material), n, fp);
    return n;
}

/* Read media description from file */
typedef struct { dim3 dim; uint16_t *data; } Media;
void readMedia(FILE *fp, Media &media);

/* Allocate an electromagnetic field */
typedef struct { dim3 dim; float3 *e; float3 *h; float3 *p; float3 *m; } Field;
void allocateField(Field &f, dim3 dim);
GMAC I/O Handling
• Functions overridden (interposition) by GMAC:
– Memory: memset(), memcpy()
– I/O: fread(), fwrite(), read(), write()
– MPI: MPI_Send(), MPI_Recv()
• Get advanced CUDA features for free
– Asynchronous copies to device memory
– Pinned memory for I/O transfers
GMAC Example Code (III)

__global__ void updateElectric(MaterialList mats, Media media, Field f) {
    int Idx = threadIdx.x + blockDim.x * blockIdx.x;
    int Idy = threadIdx.y + blockDim.y * blockIdx.y;

    for(int Idz = 0; Idz < f.dim.z; Idz++) {
        int pos = Idx + Idy * f.dim.x + Idz * f.dim.x * f.dim.y;
        float3 E = f.e[pos];
        Material m = mats.data[media.data[pos]];
        float3 P;
        P.x = E.x * m.ke[0][0] + E.y * m.ke[0][1] + E.z * m.ke[0][2];
        P.y = E.x * m.ke[1][0] + E.y * m.ke[1][1] + E.z * m.ke[1][2];
        P.z = E.x * m.ke[2][0] + E.y * m.ke[2][1] + E.z * m.ke[2][2];
        f.p[pos] = P;
    }
}
GMAC Global Memory
• For multi-GPU systems
• Data accessible by all accelerators, but owned by the CPU

[Figure: CPU memory holding global shared data that both GPUs can access]
GMAC Global Memory API
• Allocate global shared memory
gmacError_t gmacGlobalMalloc(void **ptr, size_t size)
– ptr: allocated memory address (returned by reference)
– size: size of the data to be allocated
– Returns an error code, gmacSuccess if no error
• Example usage

#include <gmac.h>

int main(int argc, char *argv[]) {
    float *foo = NULL;
    gmacError_t error;
    if((error = gmacGlobalMalloc((void **)&foo, FOO_SIZE)) != gmacSuccess)
        FATAL("Error allocating memory %s", gmacErrorString(error));
    . . .
}
GMAC Example Code (I)

typedef struct { float ke[3][3], km[3][3]; } Material;
typedef struct { size_t n; Material *data; } MaterialList;

/* Read materials from disk */
size_t readMaterials(FILE *fp, MaterialList &list) {
    uint16_t n = 0;
    fread(&n, sizeof(n), 1, fp);
    gmacError_t ret = gmacGlobalMalloc((void **)&list.data, n * sizeof(Material));
    if(ret != gmacSuccess) return 0;
    fread(list.data, sizeof(Material), n, fp);
    return n;
}

/* Read media description from file */
typedef struct { dim3 dim; uint16_t *data; } Media;
void readMedia(FILE *fp, Media &media);

/* Allocate an electromagnetic field */
typedef struct { dim3 dim; float3 *e; float3 *h; float3 *p; float3 *m; } Field;
void allocateField(Field &f, dim3 dim);
GMAC and Multi-threading
• In the past, one host thread had one CPU
• In GMAC, each host thread has:
– One CPU
– One GPU
• A GMAC thread runs on the GPU or on the CPU, but not on both at the same time
• Create threads using what you already know
– pthread_create(...)
GMAC and Multi-threading
• Virtual memory accessibility:
– Complete address space in CPU mode
– Partial address space in GPU mode

[Figure: each CPU thread sees the whole address space; each GPU sees only its own shared portion]
Getting Full-duplex PCIe
• Use multi-threading to fully utilize the PCIe bus
– One CPU thread launches kernels
– One CPU thread writes to shared memory
– One CPU thread reads from shared memory

[Figure: system memory and GPU memory linked by PCIe, with transfers flowing in both directions at once]
GPU Handoff and Copying
• GPU handoff:
– Send the thread's virtual GPU to another thread
– Do not move data, move computation
• API calls
– Virtual GPU sending
gmacError_t gmacSend(thread_id dest)
– Virtual GPU receiving
gmacError_t gmacReceive()
– Virtual GPU copying
gmacError_t gmacCopy(thread_id dest)
Virtual GPUs Use Case
• Exploit data locality in the CPU and GPU
• Example: MPEG-4 Encoder
– Each GMAC thread executes one stage
– Then, moves to the GPU where the input data is

[Figure: four GPUs, one per pipeline stage: DCT and Quantization, Motion Compensation, Dequantization and IDCT, Motion Estimation]
GMAC Performance
GMAC on Actual Applications (I)
• Reverse Time Migration (BSC / Repsol)
– Six months, one programmer
– Currently in use by Repsol
• Single-GPU using CUDA Run-time
– Can live with it: double allocations, memory consistency
– Nightmare: overlapping GPU computation and data transfers (CUDA streams and double-buffering with pinned memory)
• Multi-GPU using CUDA Run-time (cancelled)
– Can live with it: lack of IDE for Linux
– Nightmare: everything else
GMAC on Actual Applications (II)
• Multi-GPU using GMAC:
– Double-buffering and pinned memory for free
• Disk transfers
• GPU to GPU (inter-domain) communication
• MPI communication
– Clean threading model
• One task per CPU thread
• Well-known synchronization primitives
• It took less time than the single-GPU version
Conclusions
• Single virtual address space for CPUs and GPUs
• Use advanced CUDA features
– Automatic overlap of data communication and computation
– Access to any GPU from any CPU thread
• Get more performance from your application more easily
• Go: http://adsm.googlecode.com
Future Features
• OpenCL and Windows 7 support coming soon
• Data-dependence tracking:
– Avoid transferring data to the GPU when not used by kernels
– Avoid transferring data to the CPU when not modified by kernels
• Global shared memory partitioning between multiple GPUs
Backup Slides
GMAC Advanced Free Features
• Get advanced CUDA features for free
– Asynchronous copies to device memory
– Pinned memory for I/O transfers
GMAC Unified Address Space
• When allocating memory:
1. Allocate accelerator memory
2. Allocate CPU memory at the same virtual address

[Figure: the allocation appears at the same virtual address in both system memory and accelerator memory]
Lazy Update Data Transfers
• Avoid unnecessary data copies
• Lazy-update:
– Call: transfer modified data
– Return: transfer when needed

[Figure: only modified data moves from system memory to accelerator memory at the call boundary]
Rolling Update Data Transfers
• Overlap CPU execution and data transfers
• Minimal on-demand transfers
• Rolling-update:
– Memory-block size granularity

[Figure: blocks stream from system memory to accelerator memory while the CPU keeps computing]