using cuda within mathematica

Using CUDA within Mathematica

Kashif Rasul and Raqibul Hassan

l a b s

Overview

• Intro to Mathematica and its API

• CUDA + Mathematica

• Some examples

Mathematica intro

• Mathematica is a modular computational system in which the kernel is separate from the front end, which handles the interaction with the user.

• The most common way to work is to use interactive documents called notebooks which mix text input and output as well as graphics and other material.

Structure of Mathematica

• An important aspect of Mathematica is that it can also interact with other applications.

• This is achieved through MathLink, a standardised API for two-way communication with the kernel.

MathLink

• MathLink allows external programs both to call Mathematica, and to be called by Mathematica.

• We will use MathLink to let Mathematica call CUDA functions inside an external program.

Simple example

:Begin:
:Function: addtwo
:Pattern: AddTwo[i_Integer,j_Integer]
:Arguments: { i, j }
:ArgumentTypes: {Integer,Integer}
:ReturnType: Integer
:End:

addtwo.tm

addtwo.c

#include <mathlink.h>

/* Return the sum of the two machine integers i and j. */
int addtwo(int i, int j)
{
    int sum = i + j;
    return sum;
}

/* Program entry point: pass the command-line arguments to MLMain and
   return whatever it returns. */
int main(int argc, char* argv[])
{
    int status = MLMain(argc, argv);
    return status;
}

mprep & gcc

$ mprep addtwo.tm -o addtwotm.c

$ gcc -I${INCDIR} addtwotm.c addtwo.c -L${LIBDIR} -lMLi3 -lstdc++ -o addtwo

In[3]:= SetDirectory["/Applications/Mathematica.app/SystemFiles/Links/MathLink/DeveloperKit/PrebuiltExamples"]
Out[3]= /Applications/Mathematica.app/SystemFiles/Links/MathLink/DeveloperKit/PrebuiltExamples

In[4]:= link = Install["./addtwo"]
Out[4]= LinkObject[/Applications/Mathematica.app/SystemFiles/Links/MathLink/DeveloperKit/PrebuiltExamples/addtwo, 524, 8]

In[5]:= LinkPatterns[link]
Out[5]= {AddTwo[i_Integer, j_Integer]}

In[6]:= ?AddTwo

AddTwo[x, y] gives the sum of two machine integers x and y.

In[7]:= AddTwo[2, 3]
Out[7]= 5

In[8]:= AddTwo[2^31 - 1, 1]
Out[8]= -2147483648

In[9]:= Uninstall[link]
Out[9]= /Applications/Mathematica.app/SystemFiles/Links/MathLink/DeveloperKit/PrebuiltExamples/addtwo

MathLink Template file

• When a MathLink template file is processed, two basic things are done:

• :Pattern: & :Arguments: specifications are used to generate a Mathematica definition

• :Function:, :ArgumentTypes: & :ReturnType: specifications are used to generate C source code

:ArgumentTypes:

Mathematica specification C specification

Integer      int
Real         double
IntegerList  int*, long
RealList     double*, long
String       char*
Symbol       char*
Manual       void

Handling Lists & Arrays

:Begin:
:Function: sumList
:Pattern: SumList[a_List]
:Arguments: {a}
:ArgumentTypes: {IntegerList}
:ReturnType: Integer
:End:

/* Add up the alen integers stored in a and return the total. */
int sumList(int *a, long alen)
{
    int total = 0;
    int k = 0;

    while (k < alen) {
        total += a[k];
        ++k;
    }

    return total;
}

Manual ArgumentTypes:Begin::Function: sumList:Pattern: SumList[a:{___Integer}]:Arguments: {a}:ArgumentTypes:{Manual}:ReturnType: Integer:End:

int sumList(void) { int n, i; int a[MAX];

MLCheckFunction(stdlink, "List", &n);

for (i=0; i<n; i++) MLGetInteger32(stdlink, a+i);...}

int sumList(void) { int n; int *a;

MLGetInteger32List(stdlink, &a, &n); ... MLReleaseInteger32List(stdlink, a, n); ...}

Array of arbitrary depth

#include <mathlink.h>

/* read an array of double-precision floating-point numbers from a link */
void f(MLINK lp)
{
    double *values;
    int    *extents;
    char  **headNames;
    int     rank;      /* stores the rank of the array */

    if (MLGetRealArray(lp, &values, &extents, &headNames, &rank)) {
        /* ... */
        MLReleaseRealArray(lp, values, extents, headNames, rank);
    }
    /* otherwise: unable to read the array from lp, nothing to release */
}

Handling Complex numbers

In[1]:= Head[2 + 3 I]
Out[1]= Complex

If you pass a list of complex numbers to your external program, then MLGetReal64Array() will create a two-dimensional array containing a sequence of pairs of real and imaginary parts. In this case, heads[0] will be "List" while heads[1] will be "Complex".

//get an array of floating-point numbers of any depthMLGetReal64Array(stdlink,double**a,int**dims,char***heads,int*d);

http://reference.wolfram.com/mathematica/ref/format/List.html

http://reference.wolfram.com/mathematica/ref/format/List.html

Summary of API//get a list of integers, allocating the memory needed to store itMLGetInteger32List(stdlink,int**a,int*n);//get a list of floating-point numbersMLGetReal64List(stdlink,double**a,int*n); //release the memory associated with a list of integersMLReleaseInteger32List(stdlink,int*a,int n); //release the memory associated with a list of floating-point numbersMLReleaseReal64List(stdlink,double*a,int n);

//get an array of integers of any depthMLGetInteger32Array(stdlink,int**a,int**dims,char***heads,int*d);//get an array of floating-point numbers of any depthMLGetReal32Array(stdlink,float**a,int**dims,char***heads,int*d);//release memory associated with an integer arrayMLReleaseInteger32Array(stdlink,int*a,int*dims,char**heads,int d);//release memory associated with a floating-point arrayMLReleaseReal32Array(stdlink,float*a,int*dims,char**heads,int d);

Manual ReturnType

:Begin:
:Function: bits
:Pattern: ToBits[i_Integer]
:Arguments: {i}
:ArgumentTypes: {Integer}
:ReturnType: Manual
:End:

/*
 * Send the binary digits of i to Mathematica as a list of integers, least
 * significant digit first, stopping after the highest set bit (a single 0
 * is sent when i == 0).
 *
 * The digits are taken from the unsigned bit pattern of i so that every
 * element is 0 or 1. Working on the signed value directly gives -1 for
 * i % 2 on odd negative inputs, and right-shifting a negative int is
 * implementation-defined and never reaches 0 with an arithmetic shift.
 */
void bits(int i)
{
    int a[32], k;
    unsigned int u = (unsigned int)i;

    for (k = 0; k < 32; k++) {
        a[k] = (int)(u & 1u);
        u >>= 1;
        if (u == 0) break;
    }

    /* After a break, k is the index of the last digit written; turn it
       into a count. If the loop ran to completion k is already 32. */
    if (k < 32) k++;

    MLPutInteger32List(stdlink, a, k);
    return;
}

General arrayint a[8][16][100];int dims[] = {8, 16, 100};

MLPutInteger32Array(stdlink, a, dims, NULL, 3);

int ***a;

MLPutFunction(stdlink, "List", n1);for (i=0; i<n1; i++) { MLPutFunction(stdlink, "List", n2); for (j=0; j<n2; j++) { MLPutInteger32List(stdlink, a[i][j], n3); }}

or

In[10]:= {Sequence[1, Sequence[4, Sequence[]]]}
Out[10]= {1, 4}

MLPutFunction(stdlink, "List", 1);

while( condition ) { /* generate an element */ MLPutFunction(stdlink, "Sequence", 2); MLPutInteger32(stdlink, i );}

MLPutFunction(stdlink, "Sequence", 0);

Unknown length

Return Complex numbers

// Complex data typetypedef float2 Complex;

Complex* h_convolved_signal;

// Return transformed signal to Mathematica as a Complex ListMLPutFunction(stdlink,"List",n);for (long i = 0; i < n; i++) { MLPutFunction(stdlink,"Complex",2); MLPutFloat(stdlink,h_convolved_signal[i].x*norm); MLPutFloat(stdlink,h_convolved_signal[i].y*norm);}

Return Complex numbers

In[4]:= list ! Table!RandomReal!", #12$"Out[4]= !0.389421, 0.222396, 0.434636, 0.0886136, 0.233102, 0.941771,

0.928712, 0.764119, 0.791473, 0.381426, 0.757661, 0.44273"In[5]:= Map!Function!#x$, Apply!Complex, x"", Partition!list, 2""Out[5]= !0.389421 ! 0.222396 ", 0.434636 ! 0.0886136 ", 0.233102 ! 0.941771 ",

0.928712 ! 0.764119 ", 0.791473 ! 0.381426 ", 0.757661 ! 0.44273 "" // Return transformed signal to Mathematica as a Complex List MLPutFunction(stdlink, "Map", 2); MLPutFunction(stdlink, "Function", 2); MLPutFunction(stdlink, "List", 1); MLPutSymbol(stdlink, "x"); MLPutFunction(stdlink, "Apply", 2); MLPutSymbol(stdlink, "Complex"); MLPutSymbol(stdlink, "x"); MLPutFunction(stdlink, "Partition", 2); MLPutFunction(stdlink, "Times", 2); MLPutReal(stdlink, norm); MLPutReal32List(stdlink, (float*)h_convolved_signal, 2*n); MLPutInteger(stdlink, 2);

Error & Interruptif(! MLPutInteger(stdlink, 10)){ /* check the possible errors */ switch(MLError(stdlink)) { case MLEDEAD: /* the link died unexpectedly */ break; case MLECLOSED: /* the other side closed the link */ break; case MLEOK: /* no error occurred */ break; default: /* ... */ }}

if(! MLPutReal64(stdlink, 3.22)){ /* unable to send 3.22 to lp */ printf("MathLink Error: %s\n", MLErrorMessage(stdlink)); MLClearError(stdlink);}

while(len--){ sum += *list++; /* check for the abort */ if(MLAbort) return (double)0;}

Running on remote computers

$ ./addtwo -linkcreate -linkprotocol TCPIP
Link created on: 63166@192.168.1.107,63167@192.168.1.107

In[5]:= Install[LinkConnect["63166@192.168.1.107,63167@192.168.1.107", LinkProtocol -> "TCPIP"]]

Out[5]= LinkObject[63166@192.168.1.107,63167@192.168.1.107, 1110, 8]

In[6]:= AddTwo[2, 3]
Out[6]= 5

mailto:[email protected]




Mathematica + CUDA#include <cutil_inline.h>

// Program entry point: select a CUDA device, then pass the command-line
// arguments to MLMain and return its result.
int main(int argc, char **argv)
{
    // use command-line specified CUDA device,
    // otherwise use device with highest Gflops/s
    if (cutCheckCmdLineFlag(argc, (const char**)argv, "device")) {
        cutilDeviceInit(argc, argv);
    } else {
        cudaSetDevice( cutGetMaxGflopsDeviceId() );
    }

    return MLMain(argc, argv);
}

mathematica_cuda# Add source files hereEXECUTABLE := cuFourier# CUDA source files (compiled with cudacc)CUFILES := cuFourier.cu# CUDA dependency files# CU_DEPS := # C/C++ source files (compiled with gcc / c++)# CCFILES := # Additional libraries needed by the projectUSECUFFT := 1# MathLink Template filesTMFILES := cuFourier.tm

#################################################### Rules and targets

include ../../common/common.mk

• CMake http://www.cmake.org/

• FindCUDA https://gforge.sci.utah.edu/gf/project/findcuda/

• FindMathLink http://github.com/kashif/FindMathLink/tree

FindCUDA + FindMathLink via CMake

http://www.cmake.org

http://www.cmake.org

https://gforge.sci.utah.edu/gf/project/findcuda/




http://github.com/kashif/FindMathLink/tree




CMakeLists.txtset(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)set(source_files test_bin.cu)CUDA_COMPILE(CUDA_FILES test_bin.cu)

MathLink_ADD_TM(test.tm)

INCLUDE_DIRECTORIES( ${MathLink_INCLUDE_DIR} )LINK_DIRECTORIES( ${MathLink_LIBRARY_DIR} )

ADD_EXECUTABLE(cuda_compile_example ${CUDA_FILES} ${source_files} test.tm.c main.cc external_dependency.h )TARGET_LINK_LIBRARIES(cuda_compile_example ${MathLink_LIBRARIES} ${CUDA_LIBRARIES} )

double to float conversion

#include <cutil_inline.h>// General check for CUDA GPU SM Capabilities//inline bool cutilDrvCudaCapabilities(int major_version, int minor_version);

char **heads;int *dims;int rank;float *h_float;double *h_double;

if (cutilDrvCudaCapabilities( 1,3 )){ MLGetReal64Array(stdlink, &h_double, &dims, &heads, &rank);}else{ MLGetReal32Array(stdlink, &h_float, &dims, &heads, &rank);}

CUBLAS & CUFFT

• Follow the usual routine of sending data to the MathLink app

• Use CUBLAS or CUFFT

• Return result back to Mathematica

cuFourier

In[1]:= ListLinePlot[Abs[Fourier[RandomReal[1, 200]]]^2]

Out[1]=

50 100 150 200

0.05

0.10

0.15

0.20

0.25

0.30

Clone mathematica_cuda

$ git clone git://github.com/kashif/mathematica_cuda.git

$ cd mathematica_cuda/src

$ mkdir cuFourier

$ mate cuFourier

cuFourier.tm

:Begin:
:Function: cuFourier1D
:Pattern: CUFourier1D[ a:{__?NumericQ} ]
:Arguments: { a }
:ArgumentTypes: { RealList }
:ReturnType: Manual
:End:

cuFourier.cu// includes system#include <stdlib.h>#include <stdio.h>#include <string.h>#include <math.h>

// includes cuda#include <cufft.h>#include <cutil_inline.h>

// includes mathlink#include <mathlink.h>

// Complex data typetypedef float2 Complex;

///////////////////////////////////////////////////////////////// Showing the use of CUFFT for fast convolution using FFT.///////////////////////////////////////////////////////////////extern "C" void cuFourier1D(double*, long);

////////////////////////////////////////////////////////////////////// Main program////////////////////////////////////////////////////////////////////int main(int argc, char *argv[]){ // use command-line specified CUDA device, otherwise use device // with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) cutilDeviceInit(argc, argv); else cudaSetDevice( cutGetMaxGflopsDeviceId() );

return MLMain(argc, argv);}

// Transforms a real-valued list on the GPU with CUFFT and sends the result
// back to Mathematica as a list of Complex numbers scaled by 1/sqrt(n).
//
//   h_A  the n real samples received from Mathematica
//   n    number of samples in h_A
//
// The function puts its own result on stdlink, so every exit path sends
// exactly one expression: the transformed list on success, or the symbol
// $Failed when the input is empty or the host buffer cannot be allocated.
// CUDA and CUFFT failures are handled by cutilSafeCall / cufftSafeCall.
void cuFourier1D (double *h_A, long n)
{
    // Reject empty input up front: 1/sqrt(n) and the FFT plan both need n > 0.
    if (h_A == NULL || n <= 0) {
        MLPutSymbol(stdlink, "$Failed");
        return;
    }

    double norm = 1.0/sqrt((double) n);
    size_t mem_size = sizeof(Complex) * (size_t)n;

    // Allocate host memory for the signal
    Complex* h_signal = (Complex*)malloc(mem_size);
    if (h_signal == NULL) {
        MLPutSymbol(stdlink, "$Failed");
        return;
    }

    // Initialize the memory for the signal: real part from the input,
    // imaginary part zero
    for (long i = 0; i < n; ++i) {
        h_signal[i].x = (float)h_A[i];
        h_signal[i].y = 0.0f;
    }

    // Allocate device memory for signal
    Complex* d_signal;
    cutilSafeCall(cudaMalloc((void**)&d_signal, mem_size));

    // Copy host memory to device
    cutilSafeCall(cudaMemcpy(d_signal, h_signal, mem_size,
                             cudaMemcpyHostToDevice));

    // CUFFT plan
    cufftHandle plan;
    cufftSafeCall(cufftPlan1d(&plan, n, CUFFT_C2C, 1));

    // Transform signal in place.
    // NOTE(review): CUFFT_INVERSE is presumably chosen to match the sign
    // convention of Mathematica's Fourier -- confirm against both docs.
    cufftSafeCall(cufftExecC2C(plan, (cufftComplex *)d_signal,
                               (cufftComplex *)d_signal, CUFFT_INVERSE));

    // Copy device memory to host, reusing the input buffer for the result
    Complex* h_convolved_signal = h_signal;
    cutilSafeCall(cudaMemcpy(h_convolved_signal, d_signal, mem_size,
                             cudaMemcpyDeviceToHost));

    // Release d_signal
    cutilSafeCall(cudaFree(d_signal));

    // Destroy CUFFT context
    cufftSafeCall(cufftDestroy(plan));

    // Return transformed signal to Mathematica as a Complex List by sending
    //   Map[Function[{x}, Apply[Complex, x]],
    //       Partition[Times[norm, {re0, im0, re1, im1, ...}], 2]]
    // which Mathematica evaluates into the scaled complex list.
    MLPutFunction(stdlink, "Map", 2);
    MLPutFunction(stdlink, "Function", 2);
    MLPutFunction(stdlink, "List", 1);
    MLPutSymbol(stdlink, "x");
    MLPutFunction(stdlink, "Apply", 2);
    MLPutSymbol(stdlink, "Complex");
    MLPutSymbol(stdlink, "x");
    MLPutFunction(stdlink, "Partition", 2);
    MLPutFunction(stdlink, "Times", 2);
    MLPutReal(stdlink, norm);
    MLPutReal32List(stdlink, (float*)h_convolved_signal, 2*n);
    MLPutInteger(stdlink, 2);

    // Cleanup memory
    free(h_signal);

    cudaThreadExit();
}

Makefile#################################################################### Build script for project###################################################################

# Add source files hereEXECUTABLE := cuFourier# CUDA source files (compiled with cudacc)CUFILES := cuFourier.cu# Additional libraries needed by the projectUSECUFFT := 1

# MathLink Template filesTMFILES := cuFourier.tm

################################################################### Rules and targetsinclude ../../common/common.mk

In[35]:= link = Install["/Users/kashif/Dropbox/20090630_NDVI_CUDA/mathematica_cuda/bin/darwin/release/cuFourier"]
Out[35]= LinkObject[/Users/kashif/Dropbox/20090630_NDVI_CUDA/mathematica_cuda/bin/darwin/release/cuFourier, 605, 9]

In[36]:= LinkPatterns[link]
Out[36]= {CUFourier1D[a : {__?NumericQ}]}

In[37]:= ListLinePlot[Abs[CUFourier1D[RandomReal[1, 200]]]^2]

Out[37]=

50 100 150 200

0.1

0.2

0.3

0.4

In[38]:= Uninstall!link#Out[38]= "Users"kashif"Dropbox"20090630_NDVI_CUDA"mathematica_cuda"bin"darwin"

release"cuFourier

Image Deconvolution for Life Sciences

• Confocal and Widefield microscopy 3D or 4D images

• Multichannel (3 or more channels)

• Comes in a wide variety of formats

Bio-Formats Java lib.

• Standalone Java library for reading and writing life science image formats

• Get both the pixels and metadata

• Licensed under GPL

• http://www.loci.wisc.edu/ome/formats.html

http://www.loci.wisc.edu/ome/formats.html




Java + Mathematica: J/Link

Needs!"JLink`""InstallJava!"LinkObject!'"usr"local"Wolfram"Mathematica"7.0"SystemFiles"Java"Linux!x86!64"bin"java' !classpath

""usr"local"Wolfram"Mathematica"7.0"SystemFiles"Links"JLink"JLink.jar"!Xmx256m !Djava.system.class.loader"com.wolfram.jlink.JLinkSystemClassLoader!Djava.util.prefs.PreferencesFactory"com.wolfram.jlink.DisabledPreferencesFactorycom.wolfram.jlink.Install !init ""tmp"m000001207601", 4, 4#

ReinstallJava!ClassPath ! "#home#kashif#Dropbox#BioFormats#Java#loci_tools.jar""LinkObject!'"usr"local"Wolfram"Mathematica"7.0"SystemFiles"Java"Linux!x86!64"bin"java' !classpath

""usr"local"Wolfram"Mathematica"7.0"SystemFiles"Links"JLink"JLink.jar"!Xmx256m !Djava.system.class.loader"com.wolfram.jlink.JLinkSystemClassLoader!Djava.util.prefs.PreferencesFactory"com.wolfram.jlink.DisabledPreferencesFactorycom.wolfram.jlink.Install !init ""tmp"m000002207601", 8, 4#

Reading LIF imagesreader ! JavaNew!"loci.formats.ImageReader""« JavaObject!loci.formats.ImageReader" »reader"setId!"#media#cdrom#xyz#1ch#by2#MT1.lif""reader"getSeriesCount!"7

reader"setSeries!0"sizeC ! reader"getSizeC!"1

pixelType ! reader"getPixelType!"1

num ! reader"getImageCount!"90

LoadJavaClass!"loci.formats.FormatTools""JavaClass!loci.formats.FormatTools, !" "bpp ! FormatTools`getBytesPerPixel!pixelType"1

reader"getSizeX!"512

reader"getSizeY!"512

reader"getSizeZ!"90

Reading pixel volumeLoadJavaClass!"loci.common.DataTools""JavaClass!loci.common.DataTools, !" "volume !

Flatten!N!Table! DataTools`makeDataArray!

reader"openBytes!z, 0, 0, reader"getSizeX!", reader"getSizeY!"", bpp, False, True",#z, 0, reader"getSizeZ!" # 1$""";

unflatten!e_, "d__?##IntegerQ!!$ && Positive!!$% &%&$ :"Fold!Partition, e, Take!"d&, "#1, 2, #1&$$ '; #Length!e$ """ Times!d$%

array " unflatten!volume, "reader$getSizeX!$, reader$getSizeY!$,reader$getSizeZ!$&$;

View a sliceImage!array!!165, All, All""#255"

ResultImage!deconvled!!165, All, All"""

Wiener Deconv.:Begin::Function: wienerDeconvolve:Pattern: WienerDeconvolve[nx_Integer, ny_Integer, nz_Integer, epsilon_Real, sigma_Real, inImage:{___Real}]:Arguments: { nx, ny, nz, epsilon, sigma, inImage }:ArgumentTypes: { Integer, Integer, Integer, Real, Real, Manual }:ReturnType: Manual:End:

void wienerDeconvolve(int nx, int ny, int nz, double epsilon, double sigma){ float *inImage; int length;

if(! MLGetReal32List(stdlink, &inImage, &length)) { return; }

amira Projection viewhttp://www.amiravis.com

®

http://www.amiravis.com

http://www.amiravis.com

Export!""home"kashif"Amira522"data"deconv"alphalobe!MaxLike.raw",result, "Real32"#;

Remote Sensing application

Reflectance

Vegetation

Landsat TM Data

Band 3 & Band 4

NDVI = (NIR - R) / (NIR + R)

Reading Landsat ImagesIn[4]:= reader ! JavaNew!"loci.formats.ImageReader""Out[4]= « JavaObject!loci.formats.ImageReader" »In[5]:= reader ! JavaNew!"loci.formats.ChannelSeparator", reader"Out[5]= « JavaObject!loci.formats.ChannelSeparator" »In[35]:= reader"setId!"#Users#sabman#satellite_images#multispectral#bhtmref.tif""In[7]:= reader"getSeriesCount!"Out[7]= 1

In[8]:= sizeC ! reader"getSizeC!"Out[8]= 6

In[9]:= pixelType ! reader"getPixelType!"Out[9]= 1

In[11]:= num ! reader"getImageCount!"Out[11]= 6

In[12]:= pixelType ! reader"getPixelType!"

Loading Landsat data in Mathematica

In[14]:= LoadJavaClass!"loci.formats.FormatTools""Out[14]= JavaClass!loci.formats.FormatTools, !" "In[15]:= bpp ! FormatTools`getBytesPerPixel!pixelType"Out[15]= 1

In[16]:= reader"getSizeX!"Out[16]= 512

In[17]:= isLittle ! reader"isLittleEndian!"Out[17]= True

In[18]:= reader"getSizeY!"Out[18]= 512

In[19]:= LoadJavaClass!"loci.common.DataTools""Out[19]= JavaClass!loci.common.DataTools, !" "

In[31]:= red ! DataTools`makeDataArray!reader"openBytes!2, 0, 0, reader"getSizeX!", reader"getSizeY!"", bpp, False, True";

In[53]:= Image! Partition!100 #Normalize!red", reader"getSizeX!"""

In[56]:= NIR ! DataTools`makeDataArray!reader"openBytes!3, 0, 0, reader"getSizeX!", reader"getSizeY!"", bpp, False, True";

In[57]:= Image! Partition!100 #Normalize!NIR", reader"getSizeX!"""

In[39]:= link ! Install!""Users"sabman"mathematica_cuda"bin"darwin"emurelease"ndvi"#Out[39]= LinkObject!"Users"sabman"mathematica_cuda"bin"darwin"emurelease"ndvi, 41, 10#In[40]:= LinkPatterns!link#Out[40]= $ndvi!a_List, b_List#%In[41]:= NDVI ! ndvi!Partition!NIR, reader"getSizeX!##, Partition!red, reader"getSizeX!###;In[42]:= Image!Partition!NDVI, reader"getSizeX!###

ndvi.tm

:Begin:
:Function: ndvi
:Pattern: ndvi[ a_List, b_List ]
:Arguments: { a, b }
:ArgumentTypes: { Manual }
:ReturnType: Manual
:End:

void ndvi(void){ short int *h_A, *h_B; float *h_C_GPU; short int *d_A, *d_B; float *d_C;

char **heads_A, **heads_B; int *dims_A, *dims_B; int rank_A, rank_B;

if(! MLGetInteger16Array(stdlink, &h_A, &dims_A, &heads_A, &rank_A)) { return; } if(! MLGetInteger16Array(stdlink, &h_B, &dims_B, &heads_B, &rank_B)) { return; }

ndvi.cu

//Initializing data h_C_GPU = (float *)malloc(dims_A[0]*dims_A[1]*sizeof(float));

//Allocating GPU memory cutilSafeCall( cudaMalloc((void **)&d_A, dims_A[0]*dims_A[1]*sizeof(short int)) ); cutilSafeCall( cudaMalloc((void **)&d_B, dims_A[0]*dims_A[1]*sizeof(short int)) ); cutilSafeCall( cudaMalloc((void **)&d_C, dims_A[0]*dims_A[1]*sizeof(float)) );

//Copy data to GPU memory for further processing cutilSafeCall( cudaMemcpy(d_A, h_A, dims_A[0]*dims_A[1]*sizeof(short int), cudaMemcpyHostToDevice) ); cutilSafeCall( cudaMemcpy(d_B, h_B, dims_A[0]*dims_A[1]*sizeof(short int), cudaMemcpyHostToDevice) );

cutilSafeCall( cudaThreadSynchronize() );

dim3 grid(ceil((float)dims_A[0]/(float)16.0f), ceil((float) dims_A[1]/32.0f), 1); dim3 threads(ceil( dims_A[0]/(float)grid.x), ceil( dims_A[1]/(float)grid.y), 1);

ndviGPU<<<grid, threads>>>(d_C, d_A, d_B, dims_A[0], dims_A[1]); cutilCheckMsg("ndviGPU() execution failed\n"); cutilSafeCall( cudaThreadSynchronize() );

//Release d_A and d_B cutilSafeCall( cudaFree(d_B) ); cutilSafeCall( cudaFree(d_A) );

//Read back GPU results into h_C_GPU cutilSafeCall( cudaMemcpy(h_C_GPU, d_C, dims_A[0]*dims_A[1]*sizeof(float), cudaMemcpyDeviceToHost) );

//Release d_C cutilSafeCall( cudaFree(d_C) );

//Return result MLPutReal32List(stdlink, h_C_GPU, dims_A[0]*dims_A[1]);

//Release h_A and h_B MLReleaseInteger16Array(stdlink, h_A, dims_A, heads_A, rank_A); MLReleaseInteger16Array(stdlink, h_B, dims_B, heads_B, rank_B);

cudaThreadExit();

///////////////////////////////////////////////////////////////////////////////// Calculate ndvi of two channels d_A and d_B on GPU and store result in d_C///////////////////////////////////////////////////////////////////////////////

// Computes (A - B) / (A + B) per pixel for two 16-bit channels.
//
//   d_C     output, one float per pixel, indexed y * width + x
//   d_A     first channel, same indexing
//   d_B     second channel, same indexing
//   width   number of pixels along x
//   height  number of pixels along y
//
// Expects a 2D launch with one thread per pixel; threads whose (x, y) falls
// outside width x height do nothing. Indices are compared as signed ints so
// that a non-positive width or height disables every thread instead of
// wrapping around to a huge unsigned bound.
//
// Pixels where A + B == 0 are written as 0.0f rather than dividing by zero,
// so the result never contains NaN or Inf.
__global__ void ndviGPU( float *d_C, short int *d_A, short int *d_B, int width, int height)
{
    const int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    const int yIndex = blockIdx.y * blockDim.y + threadIdx.y;

    if (xIndex < width && yIndex < height)
    {
        const size_t i = (size_t)yIndex * (size_t)width + (size_t)xIndex;

        // short operands promote to int, so the sum and difference cannot overflow
        const int a = d_A[i];
        const int b = d_B[i];
        const int denom = a + b;

        d_C[i] = (denom != 0)
                     ? __fdividef( (float)(a - b), (float)denom )
                     : 0.0f;
    }
}

NDVI Kernel

NDVI output

0 1

In[64]:= ArrayPlot!Partition!NDVI, reader!getSizeX!"", ColorFunction " "Rainbow""

Questions?

http://[email protected]

twitter krasul

http://hpc.nomad-labs.com




using cuda within mathematica

Technology

int addtwo int i

data int

heads int d

int sumlistint

int a32

int nget

int d release memory

mathlink mathlink