MPI-I/O
PATC Courses, Ostrava, March 22-23, 2018
Nicole Audiffren
CINES, Centre Informatique National de l'Enseignement Supérieur,
Montpellier, France
HPC department
SCIENTIFIC APPLICATIONS' COMMON BEHAVIOR
Fine-grained requests to sparse portions of files
Low I/O performance
CURRENT SEQUENTIAL I/O
One file per process (File_P0, File_P1, File_P2):
• Lots of small files to manage
• Difficult to read back with a different number of processes
• Parallelism, performance

All I/O funnelled through one process (P0 gathers from P1, P2, P3 and writes a single file):
• Necessary if there is no common file system
• Big blocks improve performance
• Lack of parallelism
• Limits scalability (bottleneck)
PROS AND CONS OF SEQUENTIAL I/O
Pros:
• Big blocks improve performance
• Works the same way as the original (serial) code
Cons:
• Poor scalability
• Single-node bottleneck
I/O STRATEGIES WHEN USING MULTIPLE FILES
Boito et al. (1) have shown that "when using the multiple-file strategy, a larger stripe size should be used (1 MB or 64 MB), except for read operations of big segments (larger than 512 KB)" [...], up to 12% better.
(1) The impact of applications' I/O strategies on the performance of the Lustre parallel file system, Int. J. High Performance Systems Architecture, Vol. 3, Nos. 2/3, 2011, pp. 122-136
MPI HAS EVOLVED TO ADDRESS THIS ISSUE
1996: MPI-IO Committee
1998: Data sieving, Thakur et al.
> Data is requested from the server in large contiguous portions of the file that cover the small portions needed by all processes.
1999: Collective I/O, Thakur et al.
> Merging the portions needed by the clients in order to create larger and contiguous requests.
WHAT IS PARALLEL I/O ?
Common file
Po P1 Pn-1
• Processes access data from a common file
• Provides high performance and a single file
that can be easily used by other tools
4 LEVELS OF ACCESS PATTERN
Level 0
• Many independent, contiguous requests
Level 1
• Many collective, contiguous requests
Level 2
• Single independent, non-contiguous request per process
Level 3
• Single collective, non-contiguous request
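To make the four levels concrete, the small C program below (an illustration, not part of the course material) reads the same strided pattern twice: first as level 0 (a loop of independent contiguous reads), then as level 3 (one collective, non-contiguous read through a file view). The file name /pfs/datafile is reused from the later examples; NBLOCKS and BLOCKLEN are made-up sizes.

/* Sketch: level 0 vs level 3 access for the same block-cyclic pattern */
#include "mpi.h"
#include <stdlib.h>

#define NBLOCKS  4
#define BLOCKLEN 256

int main(int argc, char **argv)
{
    int rank, nprocs;
    int buf[NBLOCKS * BLOCKLEN];
    MPI_File fh;
    MPI_Status status;
    MPI_Datatype filetype;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_File_open(MPI_COMM_WORLD, "/pfs/datafile", MPI_MODE_RDONLY,
                  MPI_INFO_NULL, &fh);

    /* Level 0: many independent, contiguous requests */
    for (int i = 0; i < NBLOCKS; i++) {
        MPI_Offset off = ((MPI_Offset)i * nprocs + rank) * BLOCKLEN * sizeof(int);
        MPI_File_read_at(fh, off, buf + i * BLOCKLEN, BLOCKLEN, MPI_INT, &status);
    }

    /* Level 3: a single collective, non-contiguous request through a file view */
    MPI_Type_vector(NBLOCKS, BLOCKLEN, BLOCKLEN * nprocs, MPI_INT, &filetype);
    MPI_Type_commit(&filetype);
    MPI_File_set_view(fh, (MPI_Offset)rank * BLOCKLEN * sizeof(int),
                      MPI_INT, filetype, "native", MPI_INFO_NULL);
    MPI_File_read_all(fh, buf, NBLOCKS * BLOCKLEN, MPI_INT, &status);

    MPI_Type_free(&filetype);
    MPI_File_close(&fh);
    MPI_Finalize();
    return 0;
}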
DEFINITIONS
File : MPI supports random or sequential access to a file
Displacement : absolute byte position relative to the beginning of a file. It defines the location where the view begins.
View: defines what file data are accessible by a process.
Etype: elementary datatype = unit of data access and positioning within a file.
An etype can be an MPI predefined datatype or an MPI derived datatype
MPI performs data access in etype units (data items of etype)
Offsets are expressed as a count of etypes
INDEPENDENT I/O
Using individual file pointers
MPI_File_seek
MPI_File_read
MPI_File_write
MPI_MODE_CREATE
MPI_MODE_WRONLY
MPI_MODE_RDWR
Using explicit offsets
MPI_File_read_at
MPI_File_write_at
MPI_MODE_CREATE
MPI_MODE_WRONLY
MPI_MODE_RDWR
Using file views
MPI_File_set_view
Displacement
Etype
Filetype
INDEPENDENT I/O Common file
/* read from a common file using individual file pointers */
/* Level 0 : many independent, contiguous requests */
#include "mpi.h"
#include <stdlib.h>

#define FILESIZE (1024 * 1024)

int main(int argc, char **argv)
{
    int *buf, rank, nprocs, nints, bufsize;
    MPI_File fh;
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    bufsize = FILESIZE/nprocs;            /* bytes handled by each process */
    buf = (int *) malloc(bufsize);
    nints = bufsize/sizeof(int);

    MPI_File_open(MPI_COMM_WORLD, "/pfs/datafile", MPI_MODE_RDONLY,
                  MPI_INFO_NULL, &fh);
    /* seek to this rank's portion, counted from the beginning of the file */
    MPI_File_seek(fh, (MPI_Offset)rank * bufsize, MPI_SEEK_SET);
    MPI_File_read(fh, buf, nints, MPI_INT, &status);
    MPI_File_close(&fh);

    free(buf);
    MPI_Finalize();
    return 0;
}
INDEPENDENT I/O Common file
PROGRAM main
C read from a common file using explicit offsets (file not tiled)
include 'mpif.h'
integer FILESIZE, MAX_BUFSIZE, INTSIZE
parameter (FILESIZE=1048576, MAX_BUFSIZE=1048576, INTSIZE=4)
integer buf(MAX_BUFSIZE), rank, ierr, fh, nprocs, nints
integer status(MPI_STATUS_SIZE), count
integer (kind=MPI_OFFSET_KIND) offset
call MPI_INIT(ierr)
call MPI_COMM_RANK(MPI_COMM_WORLD, rank, ierr)
call MPI_COMM_SIZE(MPI_COMM_WORLD, nprocs, ierr)
call MPI_FILE_OPEN(MPI_COMM_WORLD, '/pfs/datafile', MPI_MODE_RDONLY, MPI_INFO_NULL, fh, ierr)
nints = FILESIZE/(nprocs*INTSIZE)
offset = rank * nints * INTSIZE
call MPI_FILE_READ_AT(fh, offset, buf, nints, MPI_INTEGER, status, ierr)
call MPI_GET_COUNT(status, MPI_INTEGER, count, ierr)
call MPI_FILE_CLOSE(fh, ierr)
call MPI_FINALIZE(ierr)
END
INDEPENDENT I/O
      PROGRAM main
C     write to a common file using a file view
      use mpi
      integer BUFSIZE
      parameter (BUFSIZE=1000)
      integer buf(BUFSIZE), myrank, ierr, thefile, intsize, i
      integer (kind=MPI_OFFSET_KIND) disp
      call MPI_INIT(ierr)
      call MPI_COMM_RANK(MPI_COMM_WORLD, myrank, ierr)
C     Data is rank dependent
      do i = 1, BUFSIZE
         buf(i) = myrank * BUFSIZE + i
      enddo
      call MPI_FILE_OPEN(MPI_COMM_WORLD, 'testfile',
     $     MPI_MODE_WRONLY + MPI_MODE_CREATE,
     $     MPI_INFO_NULL, thefile, ierr)
      call MPI_TYPE_SIZE(MPI_INTEGER, intsize, ierr)
      disp = myrank * BUFSIZE * intsize
      call MPI_FILE_SET_VIEW(thefile, disp, MPI_INTEGER, MPI_INTEGER,
     $     'native', MPI_INFO_NULL, ierr)
      call MPI_FILE_WRITE(thefile, buf, BUFSIZE, MPI_INTEGER,
     $     MPI_STATUS_IGNORE, ierr)
      call MPI_FILE_CLOSE(thefile, ierr)
      call MPI_FINALIZE(ierr)
      END
LEVEL 3 EXAMPLE
Each process creates a derived datatype to
describe the non-contiguous access pattern
Each process defines its file view
Collective call for read or write
DISTRIBUTED ARRAY OVER 16 PROCESSORS
[Figure: an M-rows by N-columns global array block-distributed over a 4 x 4 process grid; each process Pk is labelled with its grid coordinates, from P0 at COORDS=(0,0) to P15 at COORDS=(3,3).]
The array can have any number of dimensions, and each dimension can be distributed as in this example.
DARRAY CONSTRUCTOR
#include "mpi.h"
int main( int argc, char *argv[] )
{
int gsizes[2], distribs[2], dargs[2],
psizes[2], rank, size;
MPI_Datatype filetype;
int local_array_size, num_local_rows,
num_local_cols;
int num_global_rows,
num_global_cols;
MPI_File fh;
float *local_array;
MPI_Status status;
The darray constructor is an easy way to create a derived datatype that describes the location of a process's local array within a linearized multidimensional global array, for common regular distributions: block, cyclic, ...
Input to the darray constructor:
• Array size
• Distribution information
• Rank of the process whose local array is the one to be described
Output of the darray constructor:
A derived datatype describing the layout of the local array of that process within the linearized global array.
DARRAY DATATYPE
/* ... */
MPI_Init( &argc, &argv );
/* This code is particular to a 4 x 4 process decomposition */
MPI_Comm_size( MPI_COMM_WORLD, &size );
if (size != 16) {
printf( "Communicator size must be 16\n" );
MPI_Abort( MPI_COMM_WORLD, 1 );
}
gsizes[0] = num_global_rows;
gsizes[1] = num_global_cols;
distribs[0] = distribs[1] = MPI_DISTRIBUTE_BLOCK;
dargs[0] = dargs[1] = MPI_DISTRIBUTE_DFLT_DARG;
psizes[0] = psizes[1] = 4;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Type_create_darray(16, rank, 2, gsizes, distribs, dargs, psizes, MPI_ORDER_C, MPI_FLOAT, &filetype);
MPI_Type_commit(&filetype);
local_array_size = num_local_rows * num_local_cols;
local_array = (float *) malloc(local_array_size * sizeof(float));
MPI_File_open(MPI_COMM_WORLD, "/pfs/datafile", MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);
MPI_File_set_view(fh, 0, MPI_FLOAT, filetype, "native", MPI_INFO_NULL);
MPI_File_read_all(fh, local_array, local_array_size, MPI_FLOAT, &status);
MPI_File_close(&fh);
MPI_Finalize();
}
(MPI_ORDER_C: storage order in memory of the local array)
DARRAY
Pros:
• Very convenient
Cons:
• Only usable for very specific definitions of data distributions
• Assumes a row-major ordering of processes
• Strong limitation: the array size must be divisible by the number of processes
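When the array size is not divisible by the number of processes, the usual alternative is MPI_TYPE_CREATE_SUBARRAY, which the course uses later for the BT-IO "full" subtype: each process describes its own block explicitly. The C sketch below is not from the slides; the 1000 x 1000 global array, the 4 x 4 process grid (16 processes assumed) and the file name are only illustrative.

/* Sketch: subarray filetype for a 2D block distribution with uneven blocks */
#include "mpi.h"
#include <stdlib.h>

int main(int argc, char *argv[])
{
    int gsizes[2], subsizes[2], starts[2], psizes[2] = {4, 4};
    int rank, coords[2];
    MPI_Datatype filetype;
    MPI_File fh;
    float *local_array;
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* hypothetical 1000 x 1000 global array: 1000 is not divisible by 4,
       so the darray constructor cannot be used, but a subarray can      */
    gsizes[0] = gsizes[1] = 1000;
    coords[0] = rank / psizes[1];          /* row-major rank -> (row, col) */
    coords[1] = rank % psizes[1];
    for (int d = 0; d < 2; d++) {
        int base = gsizes[d] / psizes[d];
        int rest = gsizes[d] % psizes[d];  /* first 'rest' processes get one extra row/col */
        subsizes[d] = base + (coords[d] < rest ? 1 : 0);
        starts[d]   = coords[d] * base + (coords[d] < rest ? coords[d] : rest);
    }
    MPI_Type_create_subarray(2, gsizes, subsizes, starts, MPI_ORDER_C,
                             MPI_FLOAT, &filetype);
    MPI_Type_commit(&filetype);

    local_array = (float *) malloc((size_t)subsizes[0] * subsizes[1] * sizeof(float));
    MPI_File_open(MPI_COMM_WORLD, "/pfs/datafile", MPI_MODE_RDONLY,
                  MPI_INFO_NULL, &fh);
    MPI_File_set_view(fh, 0, MPI_FLOAT, filetype, "native", MPI_INFO_NULL);
    MPI_File_read_all(fh, local_array, subsizes[0] * subsizes[1], MPI_FLOAT, &status);
    MPI_File_close(&fh);

    MPI_Type_free(&filetype);
    free(local_array);
    MPI_Finalize();
    return 0;
}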
MPI-IO
1996: MPI-IO Committee
1998: Data sieving, Thakur et al.
> Data is requested from the server in large contiguous portions of the file that cover the small portions needed by all processes.
1999: Collective I/O, Thakur et al.
> Merging the portions needed by the clients in order to create larger and contiguous requests.
TWO KEY OPTIMIZATIONS IN ROMIO
• Data sieving (for independent I/O)
• Two-phase collective I/O (for collective I/O operations)
Data Sieving and Collective I/O in ROMIO, Rajeev Thakur, William Gropp and Ewing Lusk, Proc. of the 7th Symposium on the Frontiers of Massively Parallel Computation, Feb. 1999, pp. 182-189
ROMIO
Is a portable implementation of MPI-IO
(https://www.mcs.anl.gov/romio)
Works on most machines
Supports multiple file systems: GPFS, SGI XFS, NFS, Lustre
Incorporated in vendor MPI implementations (SGI, HP, Compaq, NEC)
MPI-I/O AND COLLECTIVE BUFFERING
Concept of collective buffering (or two-phase I/O): the MPI processes send their writes to a subset of processes in order to perform a smaller number of bigger reads/writes.
Two stages for a read:
• a subset of MPI tasks (called aggregators) communicates with the I/O servers (OSTs in Lustre) and reads a large chunk of data into a temporary buffer;
• the aggregators ship the data from the buffer to its destination among the remaining MPI tasks using point-to-point MPI calls.
Two stages for a write:
A collective write does the reverse, aggregating the data through MPI into buffers on the aggregator nodes, then writing from the aggregator nodes to the I/O servers.
Main advantage of collective buffering: fewer nodes communicate with the I/O servers.
In fact, Lustre prefers a one-to-one mapping of aggregator nodes to OSTs.
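As a concrete illustration (not from the slides), the aggregator count and buffer size can be requested through MPI hints at open time; romio_cb_write, cb_nodes and cb_buffer_size are standard ROMIO hint names, and the values and file name below are only examples to be tuned per system.

/* Sketch: requesting collective buffering through an MPI_Info object */
#include "mpi.h"

int main(int argc, char **argv)
{
    MPI_Info info;
    MPI_File fh;

    MPI_Init(&argc, &argv);

    MPI_Info_create(&info);
    /* ask ROMIO for two-phase (collectively buffered) writes */
    MPI_Info_set(info, "romio_cb_write", "enable");
    /* number of aggregator nodes, e.g. one per OST when striping over 4 OSTs */
    MPI_Info_set(info, "cb_nodes", "4");
    /* size of the aggregation buffer on each aggregator, in bytes */
    MPI_Info_set(info, "cb_buffer_size", "4194304");

    MPI_File_open(MPI_COMM_WORLD, "btio.out",
                  MPI_MODE_CREATE | MPI_MODE_WRONLY, info, &fh);
    /* ... collective writes (MPI_File_write_all / MPI_File_write_at_all) ... */
    MPI_File_close(&fh);

    MPI_Info_free(&info);
    MPI_Finalize();
    return 0;
}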
NAS BENCHMARKS BT-IO
https://www.nas.nasa.gov/publications/npb.html
BT-IO is built on the benchmark BT (block tridiagonal solver) and offers several ways to write the solutions to a file
(https://www.nas.nasa.gov/assets/pdf/techreports/2003/nas-03-002.pdf).
Each processor is responsible for multiple Cartesian subsets of the entire data set, whose number increases as the square root of the number of processors participating in the computation.
After every 5 time steps, the entire solution field, consisting of five double-precision words per mesh point, must be written to one or more files.
The total number of bytes written is divided by the wall-clock time spent between the beginning of the first time step and the verification of the solution, giving the I/O rate in MB/sec.
NAS BENCHMARKS BT-IO
MPI - The Complete Reference, Volume 2: The MPI Extensions, p. 264
Data on a cubic 3-D grid is divided into N³ subcubes called cells.
The code must run on N² processes, where each process is assigned N disjoint cells.
During the solution of the BT systems, computation is performed on a single slice at a time, and each process works on its own cell within the particular slice.
5 physical variables at each grid point.
Example N = 3: [figure highlighting the N cells assigned to MPI task 5, one per slice]
NAS BENCHMARKS BT-IO (CONTINUED)
One single file: each node writes its own data to the file concurrently (no collective calls): IOTYPE=2
Full MPI-IO collective file operations: IOTYPE=1
The data scattered in memory among processors is collected on a subset of the participating processors and rearranged before being written to the file, in order to increase granularity.
EP-IO, embarrassingly parallel I/O: each participating process writes the data belonging to its part of the domain to a separate file as a contiguous stream of data. It gives the maximum achievable I/O speed.
HANDS-ON BT-IO : SUBTYPE=FORTRAN
do cio=1,ncells
do kio=0, cell_size(3,cio)-1
do jio=0, cell_size(2,cio)-1
iseek=(cell_low(1,cio) +
$ PROBLEM_SIZE*((cell_low(2,cio)+jio) +
$ PROBLEM_SIZE*((cell_low(3,cio)+kio) +
$ PROBLEM_SIZE*idump_sub)))
do ix=0,cell_size(1,cio)-1
write(99, rec=iseek+ix+1)
$ u(1,ix, jio,kio,cio),
$ u(2,ix, jio,kio,cio),
$ u(3,ix, jio,kio,cio),
$ u(4,ix, jio,kio,cio),
$ u(5,ix, jio,kio,cio)
enddo
enddo
enddo
enddo
if (node.eq.root) record_length = 40/fortran_rec_sz
call mpi_bcast(record_length, 1, MPI_INTEGER,
> root, comm_setup, ierr)
open (unit=99, file=filenm,
$ form='unformatted', access='direct',
$ recl=record_length)
One file per process
HANDS-ON BT-IO
wget https://www.nas.nasa.gov/assets/npb/NPB3.3.1.tar.gz
tar xzf NPB3.3.1.tar.gz
cd $HOME/NPB3.3.1/NPB3.3-MPI
module load impi/2017.4.239-iccifort-2017.5.239-GCC-6.3.0-2.27
make bt NPROCS=64 CLASS=B SUBTYPE=fortran

Create a PBS job as below, with 16 MPI processes per node:
$ qsub -q qexp -l select=4:ncpus=24:mpiprocs=16:ompthreads=1 -I
$ module load impi/2017.4.239-iccifort-2017.5.239-GCC-6.3.0-2.27
$ mpirun -n 64 $HOME/NPB3.3.1/NPB3.3-MPI/bin/bt.B.64.fortran_io
HANDS-ON BT-IO : CLASS=B SUBTYPE=FORTRAN
No input file inputbt.data. Using compiled defaults
Size: 102x 102x 102
Iterations: 200    dt: 0.0003000
Number of active processes: 64

BTIO -- statistics:
I/O timing in seconds   :  128.45
I/O timing percentage   :   70.83
Total data written (MB) : 1697.93
I/O data rate (MB/sec)  :   13.22
(on 1 OST and stripe size = 1 MB)

BT Benchmark Completed.
Class           = B
Size            = 102x 102x 102
Iterations      = 200
Time in seconds = 181.34
Total processes = 64
Compiled procs  = 64
Mop/s total     = 3872.17
Mop/s/process   = 60.50
Operation type  = floating point
Verification    = SUCCESSFUL
Version         = 3.3.1
Compile date    = 10 Jan 2018
HANDS-ON BT-IO : PREPARE THE JOB
#!/bin/bash
export SCRATCHDIR=/scratch/work/user/your_login
module load impi/2017.4.239-iccifort-2017.5.239-GCC-6.3.0-2.27
cd $SCRATCHDIR
mpirun -np 64 /home/your_login/NPB3.3.1/NPB3.3-MPI/bin/bt.B.64.fortran_io
echo "-----------------------------STRIPING COUNT-----------"
lfs getstripe btio.out
Try with the default stripe count, save the rate.
Try other stripe count values (1 OST, 4 OSTs, 6 OSTs, 8 OSTs) by adding the following lines:
cd $SCRATCHDIR
mkdir new_dir
lfs setstripe -c 4 new_dir
cd new_dir
What do you get?
HANDS-ON BT-IO : CLASS=B SUBTYPE=FORTRAN
Example of tuning : striping the file across 4 OSTs
WORK=$SCRATCHDIR/bt.B.64.fortran_io-striping=4
rm -rf $WORK
mkdir -p $WORK
lfs setstripe -c 4 $WORK
cd $WORK
I/O data rate (MB/sec) : 60.77
HANDS-ON BT-IO : CLASS=B SUBTYPE=FORTRAN
File size = 1.6 GB, transfer size 107 MB
Run on 64 cores, Haswell, 16 cores/node (Tier-1 Occigen)

Striping over OSTs | Total time / I/O time | Rate (MB/s)
Default (1 OST)    | 181.3 s / 128.4 s     | 13.2
4 OSTs             | 38.7 s / 28.2 s       | 60.17
8 OSTs             | 29.03 s / 19.51 s     | 67.2
HANDS-ON BT-IO : CLASS=C SUBTYPE=FORTRAN
File size = 6.4 GB
Conclusion for the multiple-files strategy:
• No MPI I/O calls (no data rearrangement takes place); plain Fortran file operations are used instead.
• Many seek operations are needed.
• The main gain is obtained by striping over several OSTs.
HANDS-ON BT-IO : CLASS=C SUBTYPE=FORTRAN
File size = 6.4 GB

Sieving | Striping over OSTs | Time  | Rate (MB/s)
No      | 4                  | 195 s | 46.1
No      | 6                  | 195 s | 50
Yes     | 4                  | 155 s | 42.6
Yes     | 6                  | 198 s | 59
Yes     | 12                 | 111 s | 88
Yes     | 20                 | 222 s | 64
HANDS-ON BT-IO : SUBTYPE=SIMPLE IO
iseek=0
if (node .eq. root) then
call MPI_File_delete(filenm, MPI_INFO_NULL, ierr)
endif
call MPI_Barrier(comm_solve, ierr)
call MPI_File_open(comm_solve,
$ filenm,
$ MPI_MODE_RDWR + MPI_MODE_CREATE,
$ MPI_INFO_NULL,
$ fp,
$ ierr)
call MPI_File_set_view(fp,
$ iseek, MPI_DOUBLE_PRECISION,
$ MPI_DOUBLE_PRECISION,
$ 'native', MPI_INFO_NULL, ierr)
if (ierr .ne. MPI_SUCCESS) then
print *, 'Error opening file'
stop
endif
HANDS-ON BT-IO : SUBTYPE=SIMPLE IO
do cio=1,ncells
do kio=0, cell_size(3,cio)-1
do jio=0, cell_size(2,cio)-1
iseek=5*(cell_low(1,cio) +
$ PROBLEM_SIZE*((cell_low(2,cio)+jio) +
$ PROBLEM_SIZE*((cell_low(3,cio)+kio) +
$ PROBLEM_SIZE*idump_sub)))
count=5*cell_size(1,cio)
call MPI_File_write_at(fp, iseek,
$ u(1,0,jio,kio,cio),
$ count, MPI_DOUBLE_PRECISION,
$ mstatus, ierr)
if (ierr .ne. MPI_SUCCESS) then
print *, 'Error writing to file'
stop
endif
enddo
enddo
enddo
HANDS-ON BT-IO : DATA SIEVING
• Basic idea : avoid multiple reads of non-contiguous data in the file
• ROMIO reads large chunks from the file and
extracts in memory what is actually needed
export ROMIO_HINTS=my_romio_hints
cat > my_romio_hints <<EOF
romio_ds_write=enable
#romio_ds_read=enable
ind_wr_buffer_size=4194304
#ind_rd_buffer_size=
EOF
(the default value of ind_wr_buffer_size is 512 KB)
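The same hints can also be passed programmatically through an MPI_Info object at open time instead of through the ROMIO_HINTS file. A minimal sketch (file name and the 4 MB value are only illustrative):

/* Sketch: enabling data sieving for writes through MPI_Info */
#include "mpi.h"

int main(int argc, char **argv)
{
    MPI_Info info;
    MPI_File fh;

    MPI_Init(&argc, &argv);

    MPI_Info_create(&info);
    MPI_Info_set(info, "romio_ds_write", "enable");      /* data sieving for writes */
    MPI_Info_set(info, "ind_wr_buffer_size", "4194304"); /* sieving buffer: 4 MB (default 512 KB) */

    MPI_File_open(MPI_COMM_WORLD, "btio.out",
                  MPI_MODE_CREATE | MPI_MODE_WRONLY, info, &fh);
    /* ... independent non-contiguous writes now go through the sieving buffer ... */
    MPI_File_close(&fh);

    MPI_Info_free(&info);
    MPI_Finalize();
    return 0;
}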
HANDS-ON BT-IO : CLASS=C SUBTYPE=SIMPLE IO
EXAMPLE OF DATA SIEVING
File size = 6.4 GB
romio_ds_write=enable; ind_wr_buffer_size is set
(default value of ind_wr_buffer_size = 512 KB)
HANDS-ON BT-IO : CLASS=C SUBTYPE=SIMPLE IO
EXAMPLE OF DATA SIEVING
The striping count is very useful.
Decreasing ind_wr_buffer_size from the default value of 512 KB to 128 KB gives a better I/O rate, because smaller blocks lead to less contention.
HANDS-ON BT-IO SUBTYPE=FULL
Motivation of the buffer datatypes:
Map the data from their locations in a single MPI process's memory to the file view.
The datatype is composed of 3 MPI datatypes, each representing a cell's buffer.
HANDS-ON BT-IO SUBTYPE=FULL
Main program (1/2)
Btio declarations : ndims,ncells, PROBLEM_SIZE, cellmax
Solution array : U(5,-2:cellmax+1,-2:cellmax+1,ncells)
MPI type declarations
MPI I/O specific declarations
Build the gridpoint type ( gridpt)
Build the buftype : layout in memory of the data owned by the process
Build the filetype : defines the storage order of the data in the file
HANDS-ON BT-IO SUBTYPE=FULL
Main program (2/2)
Open the file with default view and reset to the btio
view
Compute, and write the data every wr_interval steps
Advance the offset by the size of the buftype
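A C sketch of this write path follows (the course code is Fortran). It assumes combined_buftype and combined_filetype are committed as on the following slides, and that u, niter and wr_interval exist as in BT; the file name is illustrative.

/* Sketch only: open, set the BT-IO view, write collectively, advance the offset */
MPI_File     fh;
MPI_Offset   offset = 0;
MPI_Datatype gridpt;
int          buftype_size, gridpt_size;

MPI_Type_contiguous(5, MPI_DOUBLE, &gridpt);   /* one grid point = 5 doubles */
MPI_Type_commit(&gridpt);

MPI_File_open(MPI_COMM_WORLD, "btio.full.out",
              MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
/* replace the default view: etype = gridpt, filetype = combined_filetype */
MPI_File_set_view(fh, 0, gridpt, combined_filetype, "native", MPI_INFO_NULL);

MPI_Type_size(combined_buftype, &buftype_size);
MPI_Type_size(gridpt, &gridpt_size);

for (int step = 1; step <= niter; step++) {
    /* ... compute ... */
    if (step % wr_interval == 0) {
        /* one collective, non-contiguous write of the whole local solution */
        MPI_File_write_at_all(fh, offset, u, 1, combined_buftype, MPI_STATUS_IGNORE);
        /* advance the offset by the size of the buftype, counted in etype units */
        offset += buftype_size / gridpt_size;
    }
}
MPI_File_close(&fh);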
HANDS-ON BT-IO SUBTYPE=FULL
Gridpt type: gridpt
• gridpt is an integer
• gridpt_size is an integer of kind=MPI_ADDRESS_KIND
• MPI_TYPE_CONTIGUOUS -> creates a new type (gridpt) of 5 double-precision values
• MPI_TYPE_COMMIT -> commits the new type gridpt
• MPI_TYPE_SIZE -> returns gridpt_size
HANDS-ON BT-IO SUBTYPE=FULL
Buftype: buftype = layout in memory of the data owned by the process
Aims to describe the location in memory of the data owned by this process.
MPI_TYPE_CREATE_SUBARRAY(ndims+1,
  sizes,               -> 3 dimensions in the global array (whole cell + boundary values); the 4th dimension is set to ncells (ncells can vary)
  subsizes,            -> local array (excludes the boundary-condition data)
  starts,              -> zero-based starting point of the subarray (global indices of the first element of the local array)
  MPI_ORDER_FORTRAN,
  gridpt,              -> gridpt type
  cell_buftypes(cell), -> layout in memory (cell-number dependent)
  ierr)
Each process does this for each cell!
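Translated into C with hypothetical cell dimensions (csize interior points per dimension inside an allocation of asize points per dimension, the interior starting at index ghost; cell, ncells, gridpt and cell_buftypes are the variables assumed above), one per-cell call could look like:

/* Hypothetical sketch: buffer type for one cell */
int sizes[4]    = { asize, asize, asize, ncells };  /* whole local allocation        */
int subsizes[4] = { csize, csize, csize, 1      };  /* interior points of this cell  */
int starts[4]   = { ghost, ghost, ghost, cell   };  /* skip the ghost layers         */

MPI_Type_create_subarray(4, sizes, subsizes, starts, MPI_ORDER_FORTRAN,
                         gridpt, &cell_buftypes[cell]);
MPI_Type_commit(&cell_buftypes[cell]);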
HANDS-ON BT-IO SUBTYPE=FULL
Because cell sizes may vary, we have to create a combined buffer type that is a structure, and commit it.
MPI_TYPE_CREATE_STRUCT(
  count,                  -> ncells
  array_of_blocklengths,  -> 1 (a single block of cell_buftypes(cell))
  array_of_displacements, -> 0 (none)
  array_of_types,         -> cell_buftypes
  newtype,                -> combined_buftype
  ierr)
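In C, with the arguments exactly as listed on the slide (blocklengths of 1 and displacements of 0 for every cell), the call would look like the sketch below; MAXCELLS, ncells and cell_buftypes are the variables assumed in the earlier sketches.

/* Sketch: combine the per-cell buffer types into one struct type */
int          blocklens[MAXCELLS];
MPI_Aint     displs[MAXCELLS];
MPI_Datatype combined_buftype;

for (int c = 0; c < ncells; c++) {
    blocklens[c] = 1;   /* a single block of cell_buftypes[c] */
    displs[c]    = 0;   /* displacement as given on the slide */
}
MPI_Type_create_struct(ncells, blocklens, displs, cell_buftypes,
                       &combined_buftype);
MPI_Type_commit(&combined_buftype);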
HANDS-ON BT-IO SUBTYPE=FULL
Main program (1/2)
Btio declarations : ndims,ncells, PROBLEM_SIZE, cellmax
Solution array : U(5,-2:cellmax+1,-2:cellmax+1,ncells)
MPI type declarations
MPI I/O specific declarations
Build the gridpoint type (gridpt): done!
Build the buftype (layout in memory of the data owned by the process): done!
Build the filetype (defines the storage order of the data in the file): to be done next
HANDS-ON BT-IO SUBTYPE=FULL
Filetype:
Aims to describe where the data is stored in the file.
MPI_TYPE_CREATE_SUBARRAY(ndims,
  sizes,                -> 3 dimensions, each equal to PROBLEM_SIZE
  subsizes,             -> local array (excludes the boundary-condition data)
  starts,               -> zero-based starting point of the subarray (global indices of the first element of the local array)
  MPI_ORDER_FORTRAN,
  gridpt,               -> gridpt type (the etype)
  cell_filetypes(cell), -> new MPI type (layout of the local array within the global array)
  ierr)
HANDS-ON BT-IO SUBTYPE=FULL
As done for the buffer types, we have to create a combined file type that is a structure, and commit it.
MPI_TYPE_CREATE_STRUCT(
  count,                  -> ncells
  array_of_blocklengths,  -> 1 (a single block of cell_filetypes(cell))
  array_of_displacements, -> 0 (none)
  array_of_types,         -> cell_filetypes
  newtype,                -> combined_filetype
  ierr)
HANDS-ON BT-IO CLASS=C SUBTYPE=FULL
      subroutine setup_btio
c---------------------------------------------------------------------
c---------------------------------------------------------------------
      include 'header.h'
      include 'mpinpb.h'

      integer ierr
      integer mstatus(MPI_STATUS_SIZE)
      integer sizes(4), starts(4), subsizes(4)
      integer cell_btype(maxcells), cell_ftype(maxcells)
      integer cell_blength(maxcells)
      integer info
      character*20 cb_nodes, cb_size
      integer c, m
      integer cell_disp(maxcells)

      call mpi_bcast(collbuf_nodes, 1, MPI_INTEGER,
     >               root, comm_setup, ierr)
      call mpi_bcast(collbuf_size, 1, MPI_INTEGER,
     >               root, comm_setup, ierr)

      if (collbuf_nodes .eq. 0) then
         info = MPI_INFO_NULL
      else
         write (cb_nodes,*) collbuf_nodes
         write (cb_size,*) collbuf_size
         call MPI_Info_create(info, ierr)
         call MPI_Info_set(info, 'cb_nodes', cb_nodes, ierr)
         call MPI_Info_set(info, 'cb_buffer_size', cb_size, ierr)
         call MPI_Info_set(info, 'collective_buffering', 'true', ierr)
      endif

      call MPI_Type_contiguous(5, MPI_DOUBLE_PRECISION,
     $                         element, ierr)
      call MPI_Type_commit(element, ierr)
      call MPI_Type_extent(element, eltext, ierr)
HANDS-ON BT-IO CLASS=C SUBTYPE=FULL
This is a case with collective MPI I/O calls writing one shared file of 6.6 GB.
Run on 64 cores with the default stripe count, and save the rate.
Try other stripe count values: 1 OST, 4 OSTs, 6 OSTs, 12 OSTs.
What do you get?
Change the value of cb_buffer_size to 128 KB when striping over 4 OSTs; which rate do you get? (The default value of cb_buffer_size is 4 MB.)
HANDS-ON BT-IO : CLASS=C SUBTYPE=FULL (OCCIGEN)
IOTYPE=1, file size = 6.6 GB, Open MPI
• 4 nodes access the same OST => contention
• 4 nodes access 4 different OSTs
HANDS-ON BT-IO : CLASS=C SUBTYPE=FULL
IOTYPE=1, file size = 6.6 GB, Open MPI
Each task has 107 MB to write.
Results:
• The best rate is obtained for a stripe count of 4 OSTs and a cb_buffer_size reduced from 4 MB to 32 KB.
• At a given stripe count it may pay off to look for the best cb_buffer_size value; reducing it reduces contention.
HANDS-ON BT-IO : CLASS=D SUBTYPE=FULL (OCCIGEN)
IOTYPE=1, file size = 127 GB, Open MPI

HANDS-ON BT-IO : CLASS=D SUBTYPE=FULL (SALOMON)
Intel MPI