2012-01-09 00:51:04 +04:00
/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
* vim : expandtab : shiftwidth = 8 : tabstop = 8 :
*/
2011-06-17 23:20:43 +04:00
/******************************************************************************\
* *
* Copyright ( c ) 2003 , The Regents of the University of California *
* See the file COPYRIGHT for a complete copyright notice and license . *
* *
\ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
2011-11-12 04:40:45 +04:00
# ifdef HAVE_CONFIG_H
2014-07-31 03:17:21 +04:00
# include "config.h"
2011-11-12 04:40:45 +04:00
# endif
# include <stdio.h>
# include <stdlib.h>
2012-01-07 05:29:45 +04:00
# include <unistd.h>
2011-11-12 02:22:17 +04:00
# include <ctype.h> /* tolower() */
2011-11-12 04:40:45 +04:00
# include <errno.h>
2011-06-17 23:20:43 +04:00
# include <math.h>
# include <mpi.h>
# include <string.h>
2011-11-12 02:22:17 +04:00
# include <sys/stat.h> /* struct stat */
2011-06-17 23:20:43 +04:00
# include <time.h>
2014-07-31 03:17:21 +04:00
2011-06-17 23:20:43 +04:00
# ifndef _WIN32
2014-07-31 03:17:21 +04:00
# include <sys / time.h> /* gettimeofday() */
# include <sys / utsname.h> /* uname() */
2011-06-17 23:20:43 +04:00
# endif
2014-07-31 03:17:21 +04:00
2011-11-10 04:34:16 +04:00
# include <assert.h>
2011-06-17 23:20:43 +04:00
2011-11-12 04:40:45 +04:00
# include "ior.h"
# include "aiori.h"
# include "utilities.h"
# include "parse_options.h"
2011-06-17 23:20:43 +04:00
S3 with Multi-Part Upload for N:1 is working.
Testing on our EMC ViPR installation. Therefore, we also have available
some EMC extensions. For example, EMC supports a special "byte-range"
header-option ("Range: bytes=-1-") which allows appending to an object.
This is not needed for N:1 (where every write creates an independent part),
but is vital for N:N (where every write is considered an append, unless
"transfer-size" is the same as "block-size").
We also use a LANL-extended implementation of aws4c 0.5, which provides
some special features, and allows greater efficiency. That is included in
this commit as a tarball. Untar it somewhere else and build it, to produce
a library, which is linked with IOR. (configure with --with-S3).
TBD: EMC also supports a simpler alternative to Multi-Part Upload, which
appears to have several advantages. We'll add that in next, but wanted to
capture this as is, before I break it.
2014-10-27 22:16:20 +03:00
2011-11-12 04:40:45 +04:00
/* globals used by other files, also defined "extern" in ior.h */
S3 with Multi-Part Upload for N:1 is working.
Testing on our EMC ViPR installation. Therefore, we also have available
some EMC extensions. For example, EMC supports a special "byte-range"
header-option ("Range: bytes=-1-") which allows appending to an object.
This is not needed for N:1 (where every write creates an independent part),
but is vital for N:N (where every write is considered an append, unless
"transfer-size" is the same as "block-size").
We also use a LANL-extended implementation of aws4c 0.5, which provides
some special features, and allows greater efficiency. That is included in
this commit as a tarball. Untar it somewhere else and build it, to produce
a library, which is linked with IOR. (configure with --with-S3).
TBD: EMC also supports a simpler alternative to Multi-Part Upload, which
appears to have several advantages. We'll add that in next, but wanted to
capture this as is, before I break it.
2014-10-27 22:16:20 +03:00
int numTasksWorld = 0 ;
int rank = 0 ;
int rankOffset = 0 ;
int tasksPerNode = 0 ; /* tasks per node */
int verbose = VERBOSE_0 ; /* verbose output */
2011-11-12 04:40:45 +04:00
MPI_Comm testComm ;
/* file scope globals */
extern char * * environ ;
int totalErrorCount = 0 ;
2011-11-12 02:22:17 +04:00
double wall_clock_delta = 0 ;
double wall_clock_deviation ;
2011-06-17 23:20:43 +04:00
2017-10-20 00:26:52 +03:00
const ior_aiori_t * backend ;
2011-10-28 03:50:05 +04:00
2011-12-13 09:00:18 +04:00
static void DestroyTests ( IOR_test_t * tests_head ) ;
2011-11-12 04:40:45 +04:00
static void DisplayUsage ( char * * ) ;
static void GetTestFileName ( char * , IOR_param_t * ) ;
static char * PrependDir ( IOR_param_t * , char * ) ;
static char * * ParseFileName ( char * , int * ) ;
2012-09-10 21:50:18 +04:00
static void PrintEarlyHeader ( ) ;
2011-12-11 13:50:19 +04:00
static void PrintHeader ( int argc , char * * argv ) ;
2011-12-13 09:00:18 +04:00
static IOR_test_t * SetupTests ( int , char * * ) ;
2011-12-11 13:50:19 +04:00
static void ShowTestInfo ( IOR_param_t * ) ;
2011-12-13 09:00:18 +04:00
static void ShowSetup ( IOR_param_t * params ) ;
2011-11-12 04:40:45 +04:00
static void ShowTest ( IOR_param_t * ) ;
2012-01-14 01:27:55 +04:00
static void PrintLongSummaryAllTests ( IOR_test_t * tests_head ) ;
2011-12-13 09:00:18 +04:00
static void TestIoSys ( IOR_test_t * ) ;
2015-05-19 18:36:28 +03:00
static void ValidateTests ( IOR_param_t * ) ;
2017-10-20 19:02:24 +03:00
static IOR_offset_t WriteOrRead ( IOR_param_t * test , IOR_results_t * results , void * fd , int access , IOR_io_buffers * ioBuffers ) ;
2011-11-12 04:40:45 +04:00
static void WriteTimes ( IOR_param_t * , double * * , int , int ) ;
2011-06-17 23:20:43 +04:00
/********************************** M A I N ***********************************/
2011-11-12 02:22:17 +04:00
int main ( int argc , char * * argv )
2011-06-17 23:20:43 +04:00
{
2011-11-12 02:22:17 +04:00
int i ;
2011-12-13 09:00:18 +04:00
IOR_test_t * tests_head ;
IOR_test_t * tptr ;
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
/*
* check - h option from commandline without starting MPI ;
* if the help option is requested in a script file ( showHelp = TRUE ) ,
* the help output will be displayed in the MPI job
*/
for ( i = 1 ; i < argc ; i + + ) {
if ( strcmp ( argv [ i ] , " -h " ) = = 0 ) {
DisplayUsage ( argv ) ;
return ( 0 ) ;
}
}
S3 with Multi-Part Upload for N:1 is working.
Testing on our EMC ViPR installation. Therefore, we also have available
some EMC extensions. For example, EMC supports a special "byte-range"
header-option ("Range: bytes=-1-") which allows appending to an object.
This is not needed for N:1 (where every write creates an independent part),
but is vital for N:N (where every write is considered an append, unless
"transfer-size" is the same as "block-size").
We also use a LANL-extended implementation of aws4c 0.5, which provides
some special features, and allows greater efficiency. That is included in
this commit as a tarball. Untar it somewhere else and build it, to produce
a library, which is linked with IOR. (configure with --with-S3).
TBD: EMC also supports a simpler alternative to Multi-Part Upload, which
appears to have several advantages. We'll add that in next, but wanted to
capture this as is, before I break it.
2014-10-27 22:16:20 +03:00
# ifdef USE_S3_AIORI
/* This is supposed to be done before *any* threads are created.
* Could MPI_Init ( ) create threads ( or call multi - threaded
* libraries ) ? We ' ll assume so . */
AWS4C_CHECK ( aws_init ( ) ) ;
# endif
2011-11-12 02:22:17 +04:00
/* start the MPI code */
MPI_CHECK ( MPI_Init ( & argc , & argv ) , " cannot initialize MPI " ) ;
MPI_CHECK ( MPI_Comm_size ( MPI_COMM_WORLD , & numTasksWorld ) ,
" cannot get number of tasks " ) ;
MPI_CHECK ( MPI_Comm_rank ( MPI_COMM_WORLD , & rank ) , " cannot get rank " ) ;
2012-09-10 21:50:18 +04:00
PrintEarlyHeader ( ) ;
2011-11-12 02:22:17 +04:00
/* set error-handling */
/*MPI_CHECK(MPI_Errhandler_set(MPI_COMM_WORLD, MPI_ERRORS_RETURN),
" cannot set errhandler " ) ; */
/* Sanity check, we were compiled with SOME backend, right? */
2017-10-20 00:26:52 +03:00
if ( 0 = = aiori_count ( ) ) {
2014-08-14 02:53:24 +04:00
ERR ( " No IO backends compiled into ior. "
" Run 'configure --with-<backend>', and recompile. " ) ;
2011-11-12 02:22:17 +04:00
}
2015-05-19 18:36:28 +03:00
/* setup tests, and validate parameters */
2011-12-13 09:00:18 +04:00
tests_head = SetupTests ( argc , argv ) ;
verbose = tests_head - > params . verbose ;
tests_head - > params . testComm = MPI_COMM_WORLD ;
2011-11-12 02:22:17 +04:00
2015-05-19 18:36:28 +03:00
/* check for commandline 'help' request */
2011-12-13 09:00:18 +04:00
if ( rank = = 0 & & tests_head - > params . showHelp = = TRUE ) {
2011-11-12 02:22:17 +04:00
DisplayUsage ( argv ) ;
}
2014-08-14 02:53:24 +04:00
PrintHeader ( argc , argv ) ;
2011-12-11 13:50:19 +04:00
2011-11-12 02:22:17 +04:00
/* perform each test */
2014-08-14 02:53:24 +04:00
for ( tptr = tests_head ; tptr ! = NULL ; tptr = tptr - > next ) {
2011-12-13 09:00:18 +04:00
verbose = tptr - > params . verbose ;
2011-11-12 02:22:17 +04:00
if ( rank = = 0 & & verbose > = VERBOSE_0 ) {
2011-12-13 09:00:18 +04:00
ShowTestInfo ( & tptr - > params ) ;
2011-11-12 02:22:17 +04:00
}
if ( rank = = 0 & & verbose > = VERBOSE_3 ) {
2011-12-13 09:00:18 +04:00
ShowTest ( & tptr - > params ) ;
2011-11-12 02:22:17 +04:00
}
S3 with Multi-Part Upload for N:1 is working.
Testing on our EMC ViPR installation. Therefore, we also have available
some EMC extensions. For example, EMC supports a special "byte-range"
header-option ("Range: bytes=-1-") which allows appending to an object.
This is not needed for N:1 (where every write creates an independent part),
but is vital for N:N (where every write is considered an append, unless
"transfer-size" is the same as "block-size").
We also use a LANL-extended implementation of aws4c 0.5, which provides
some special features, and allows greater efficiency. That is included in
this commit as a tarball. Untar it somewhere else and build it, to produce
a library, which is linked with IOR. (configure with --with-S3).
TBD: EMC also supports a simpler alternative to Multi-Part Upload, which
appears to have several advantages. We'll add that in next, but wanted to
capture this as is, before I break it.
2014-10-27 22:16:20 +03:00
2014-08-29 01:39:44 +04:00
// This is useful for trapping a running MPI process. While
// this is sleeping, run the script 'testing/hdfs/gdb.attach'
S3 with Multi-Part Upload for N:1 is working.
Testing on our EMC ViPR installation. Therefore, we also have available
some EMC extensions. For example, EMC supports a special "byte-range"
header-option ("Range: bytes=-1-") which allows appending to an object.
This is not needed for N:1 (where every write creates an independent part),
but is vital for N:N (where every write is considered an append, unless
"transfer-size" is the same as "block-size").
We also use a LANL-extended implementation of aws4c 0.5, which provides
some special features, and allows greater efficiency. That is included in
this commit as a tarball. Untar it somewhere else and build it, to produce
a library, which is linked with IOR. (configure with --with-S3).
TBD: EMC also supports a simpler alternative to Multi-Part Upload, which
appears to have several advantages. We'll add that in next, but wanted to
capture this as is, before I break it.
2014-10-27 22:16:20 +03:00
if ( verbose > = VERBOSE_4 ) {
printf ( " \t rank %d: sleeping \n " , rank ) ;
sleep ( 5 ) ;
printf ( " \t rank %d: awake. \n " , rank ) ;
}
2011-12-13 09:00:18 +04:00
TestIoSys ( tptr ) ;
2017-10-21 12:59:09 +03:00
2017-10-20 19:02:24 +03:00
if ( rank = = 0 & & tptr - > params . stoneWallingWearOut ) {
fprintf ( stdout , " Pairs deadlineForStonewallingaccessed: %lld \n " , ( long long ) tptr - > results - > pairs_accessed ) ;
}
2011-11-12 02:22:17 +04:00
}
2012-01-14 01:49:30 +04:00
if ( verbose < 0 )
/* always print final summary */
verbose = 0 ;
2014-08-14 02:53:24 +04:00
PrintLongSummaryAllTests ( tests_head ) ;
2011-12-13 09:00:18 +04:00
2011-11-12 02:22:17 +04:00
/* display finish time */
if ( rank = = 0 & & verbose > = VERBOSE_0 ) {
2014-08-14 02:53:24 +04:00
fprintf ( stdout , " \n " ) ;
fprintf ( stdout , " Finished: %s " , CurrentTimeString ( ) ) ;
2011-11-12 02:22:17 +04:00
}
2014-08-14 02:53:24 +04:00
DestroyTests ( tests_head ) ;
2011-12-13 09:00:18 +04:00
2011-11-12 02:22:17 +04:00
MPI_CHECK ( MPI_Finalize ( ) , " cannot finalize MPI " ) ;
S3 with Multi-Part Upload for N:1 is working.
Testing on our EMC ViPR installation. Therefore, we also have available
some EMC extensions. For example, EMC supports a special "byte-range"
header-option ("Range: bytes=-1-") which allows appending to an object.
This is not needed for N:1 (where every write creates an independent part),
but is vital for N:N (where every write is considered an append, unless
"transfer-size" is the same as "block-size").
We also use a LANL-extended implementation of aws4c 0.5, which provides
some special features, and allows greater efficiency. That is included in
this commit as a tarball. Untar it somewhere else and build it, to produce
a library, which is linked with IOR. (configure with --with-S3).
TBD: EMC also supports a simpler alternative to Multi-Part Upload, which
appears to have several advantages. We'll add that in next, but wanted to
capture this as is, before I break it.
2014-10-27 22:16:20 +03:00
# ifdef USE_S3_AIORI
/* done once per program, after exiting all threads.
* NOTE : This fn doesn ' t return a value that can be checked for success . */
aws_cleanup ( ) ;
# endif
2011-11-12 02:22:17 +04:00
return ( totalErrorCount ) ;
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
/***************************** F U N C T I O N S ******************************/
2011-11-10 04:13:44 +04:00
/*
* Initialize an IOR_param_t structure to the defaults
*/
2011-11-12 02:22:17 +04:00
void init_IOR_Param_t ( IOR_param_t * p )
2011-11-10 04:13:44 +04:00
{
2017-10-20 00:26:52 +03:00
const char * default_aiori = aiori_default ( ) ;
assert ( NULL ! = default_aiori ) ;
2011-11-10 04:13:44 +04:00
memset ( p , 0 , sizeof ( IOR_param_t ) ) ;
2011-11-12 02:22:17 +04:00
p - > mode = IOR_IRUSR | IOR_IWUSR | IOR_IRGRP | IOR_IWGRP ;
p - > openFlags = IOR_RDWR | IOR_CREAT ;
2014-08-14 02:53:24 +04:00
2017-10-20 00:26:52 +03:00
strncpy ( p - > api , default_aiori , MAX_STR ) ;
2011-11-10 04:13:44 +04:00
strncpy ( p - > platform , " HOST(OSTYPE) " , MAX_STR ) ;
strncpy ( p - > testFileName , " testFile " , MAXPATHLEN ) ;
2014-08-14 02:53:24 +04:00
2011-11-10 04:13:44 +04:00
p - > nodes = 1 ;
p - > tasksPerNode = 1 ;
p - > repetitions = 1 ;
p - > repCounter = - 1 ;
p - > open = WRITE ;
p - > taskPerNodeOffset = 1 ;
p - > segmentCount = 1 ;
p - > blockSize = 1048576 ;
p - > transferSize = 262144 ;
p - > randomSeed = - 1 ;
2015-05-21 21:05:56 +03:00
p - > incompressibleSeed = 573 ;
2011-11-10 04:13:44 +04:00
p - > testComm = MPI_COMM_WORLD ;
p - > setAlignment = 1 ;
p - > lustre_start_ost = - 1 ;
2014-08-29 01:39:44 +04:00
strncpy ( p - > hdfs_user , getenv ( " USER " ) , MAX_STR ) ;
2014-08-14 02:53:24 +04:00
p - > hdfs_name_node = " default " ;
p - > hdfs_name_node_port = 0 ; /* ??? */
p - > hdfs_fs = NULL ;
2014-08-29 01:39:44 +04:00
p - > hdfs_replicas = 0 ; /* invokes the default */
2014-08-14 02:53:24 +04:00
p - > hdfs_block_size = 0 ;
S3 with Multi-Part Upload for N:1 is working.
Testing on our EMC ViPR installation. Therefore, we also have available
some EMC extensions. For example, EMC supports a special "byte-range"
header-option ("Range: bytes=-1-") which allows appending to an object.
This is not needed for N:1 (where every write creates an independent part),
but is vital for N:N (where every write is considered an append, unless
"transfer-size" is the same as "block-size").
We also use a LANL-extended implementation of aws4c 0.5, which provides
some special features, and allows greater efficiency. That is included in
this commit as a tarball. Untar it somewhere else and build it, to produce
a library, which is linked with IOR. (configure with --with-S3).
TBD: EMC also supports a simpler alternative to Multi-Part Upload, which
appears to have several advantages. We'll add that in next, but wanted to
capture this as is, before I break it.
2014-10-27 22:16:20 +03:00
// p->curl = NULL;
p - > URI = NULL ;
p - > curl_flags = 0 ;
p - > io_buf = NULL ;
p - > etags = NULL ;
p - > part_number = 0 ;
2017-09-21 18:12:31 +03:00
p - > beegfs_numTargets = - 1 ;
p - > beegfs_chunkSize = - 1 ;
2011-11-10 04:13:44 +04:00
}
Algorithms 'S3', 'S3_plus', and 'S3_EMC' all available.
These are variants on S3. S3 uses the "pure" S3 interface, e.g. using
Multi-Part-Upload. The "plus" variant enables EMC-extensions in the aws4c
library. This allows the N:N case to use "append", in the case where
"transfer_size" != "block_size" for IOR. In pure S3, the N:N case will
fail, because the EMC-extensions won't be enabled, and appending (which
attempts to use the EMC byte-range tricks to do this) will throw an error.
In the S3_EMC alg, N:1 uses EMC's other byte-range tricks to write different
parts of an N:1 file, and also uses append to write the parts of an N:N
file. Preliminary tests show these EMC extensions look to improve BW by
~20%.
I put all three algs in aiori-S3.c, because it seemed some code was getting
reused. Not sure if that's still going to make sense after the TBD, below.
TBD: Recently realized that the "pure" S3 shouldn't be trying to use
appends for anything. In the N:N case, it should just use MPU, within each
file. Then, there's no need for S3_plus. We just have S3, which does MPU
for all writes where transfer_size != block_size, and uses (standard)
byte-range reads for reading. Then S3_EMC uses "appends" for N:N writes,
and byte-range writes for N:1 writes. This separates the code for the two
algs a little more, but we might still want them in the same file.
2014-10-30 01:04:30 +03:00
/*
 * Select the I/O backend named by `api` and store it in the global
 * `backend`.
 *
 * For API names starting with "S3", the IOR_CURL_S3_EMC_EXT bit of
 * param->curl_flags is set when the name is exactly "S3_EMC" and cleared
 * otherwise.  An unknown name is reported through ERR().
 */
static void AioriBind(char *api, IOR_param_t *param)
{
        backend = aiori_select(api);

        if (backend == NULL) {
                ERR("unrecognized IO API");
                return;
        }

        /* only the S3 family needs the curl flag adjusted */
        if (strncmp(api, "S3", 2) != 0) {
                return;
        }

        if (strcmp(api, "S3_EMC") == 0) {
                param->curl_flags |= IOR_CURL_S3_EMC_EXT;
        } else {
                param->curl_flags &= ~(IOR_CURL_S3_EMC_EXT);
        }
}
2011-06-17 23:20:43 +04:00
2011-11-12 04:40:45 +04:00
static void
2011-11-12 02:22:17 +04:00
DisplayOutliers ( int numTasks ,
2011-06-17 23:20:43 +04:00
double timerVal ,
2011-11-12 02:22:17 +04:00
char * timeString , int access , int outlierThreshold )
2011-06-17 23:20:43 +04:00
{
2011-11-12 02:22:17 +04:00
char accessString [ MAX_STR ] ;
double sum , mean , sqrDiff , var , sd ;
/* for local timerVal, don't compensate for wall clock delta */
timerVal + = wall_clock_delta ;
MPI_CHECK ( MPI_Allreduce
( & timerVal , & sum , 1 , MPI_DOUBLE , MPI_SUM , testComm ) ,
" MPI_Allreduce() " ) ;
mean = sum / numTasks ;
sqrDiff = pow ( ( mean - timerVal ) , 2 ) ;
MPI_CHECK ( MPI_Allreduce
( & sqrDiff , & var , 1 , MPI_DOUBLE , MPI_SUM , testComm ) ,
" MPI_Allreduce() " ) ;
var = var / numTasks ;
sd = sqrt ( var ) ;
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
if ( access = = WRITE ) {
strcpy ( accessString , " write " ) ;
} else { /* READ */
strcpy ( accessString , " read " ) ;
}
if ( fabs ( timerVal - mean ) > ( double ) outlierThreshold ) {
fprintf ( stdout , " WARNING: for task %d, %s %s is %f \n " ,
rank , accessString , timeString , timerVal ) ;
fprintf ( stdout , " (mean=%f, stddev=%f) \n " , mean , sd ) ;
fflush ( stdout ) ;
}
2011-11-12 04:40:45 +04:00
}
2011-06-17 23:20:43 +04:00
/*
* Check for outliers in start / end times and elapsed create / xfer / close times .
*/
2011-11-12 04:40:45 +04:00
static void CheckForOutliers ( IOR_param_t * test , double * * timer , int rep ,
int access )
2011-06-17 23:20:43 +04:00
{
2011-11-12 02:22:17 +04:00
int shift ;
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
if ( access = = WRITE ) {
shift = 0 ;
} else { /* READ */
shift = 6 ;
}
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
DisplayOutliers ( test - > numTasks , timer [ shift + 0 ] [ rep ] ,
" start time " , access , test - > outlierThreshold ) ;
DisplayOutliers ( test - > numTasks ,
timer [ shift + 1 ] [ rep ] - timer [ shift + 0 ] [ rep ] ,
" elapsed create time " , access , test - > outlierThreshold ) ;
DisplayOutliers ( test - > numTasks ,
timer [ shift + 3 ] [ rep ] - timer [ shift + 2 ] [ rep ] ,
" elapsed transfer time " , access ,
test - > outlierThreshold ) ;
DisplayOutliers ( test - > numTasks ,
timer [ shift + 5 ] [ rep ] - timer [ shift + 4 ] [ rep ] ,
" elapsed close time " , access , test - > outlierThreshold ) ;
DisplayOutliers ( test - > numTasks , timer [ shift + 5 ] [ rep ] , " end time " ,
access , test - > outlierThreshold ) ;
2011-11-12 04:40:45 +04:00
}
2011-06-17 23:20:43 +04:00
/*
* Check if actual file size equals expected size ; if not use actual for
* calculating performance rate .
*/
2011-12-13 09:00:18 +04:00
static void CheckFileSize ( IOR_test_t * test , IOR_offset_t dataMoved , int rep )
2011-06-17 23:20:43 +04:00
{
2014-08-14 02:53:24 +04:00
IOR_param_t * params = & test - > params ;
IOR_results_t * results = test - > results ;
2011-12-13 09:00:18 +04:00
MPI_CHECK ( MPI_Allreduce ( & dataMoved , & results - > aggFileSizeFromXfer [ rep ] ,
2011-11-12 02:22:17 +04:00
1 , MPI_LONG_LONG_INT , MPI_SUM , testComm ) ,
" cannot total data moved " ) ;
2011-12-13 09:00:18 +04:00
if ( strcmp ( params - > api , " HDF5 " ) ! = 0 & & strcmp ( params - > api , " NCMPI " ) ! = 0 ) {
2011-11-12 02:22:17 +04:00
if ( verbose > = VERBOSE_0 & & rank = = 0 ) {
2011-12-13 09:00:18 +04:00
if ( ( params - > expectedAggFileSize
! = results - > aggFileSizeFromXfer [ rep ] )
| | ( results - > aggFileSizeFromStat [ rep ]
! = results - > aggFileSizeFromXfer [ rep ] ) ) {
2011-11-12 02:22:17 +04:00
fprintf ( stdout ,
" WARNING: Expected aggregate file size = %lld. \n " ,
2011-12-14 10:04:27 +04:00
( long long ) params - > expectedAggFileSize ) ;
2011-11-12 02:22:17 +04:00
fprintf ( stdout ,
" WARNING: Stat() of aggregate file size = %lld. \n " ,
2011-12-14 10:04:27 +04:00
( long long ) results - > aggFileSizeFromStat [ rep ] ) ;
2011-11-12 02:22:17 +04:00
fprintf ( stdout ,
" WARNING: Using actual aggregate bytes moved = %lld. \n " ,
2011-12-14 10:04:27 +04:00
( long long ) results - > aggFileSizeFromXfer [ rep ] ) ;
2017-10-20 19:02:24 +03:00
if ( params - > deadlineForStonewalling ) {
fprintf ( stdout ,
" WARNING: maybe caused by deadlineForStonewalling \n " ) ;
}
2011-11-12 02:22:17 +04:00
}
}
2011-06-17 23:20:43 +04:00
}
2011-12-13 09:00:18 +04:00
results - > aggFileSizeForBW [ rep ] = results - > aggFileSizeFromXfer [ rep ] ;
2011-11-12 04:40:45 +04:00
}
2011-06-17 23:20:43 +04:00
/*
* Compare buffers after reading / writing each transfer . Displays only first
* difference in buffers and returns total errors counted .
*/
2011-11-12 04:40:45 +04:00
static size_t
2011-11-12 02:22:17 +04:00
CompareBuffers ( void * expectedBuffer ,
void * unknownBuffer ,
size_t size ,
2011-12-13 09:00:18 +04:00
IOR_offset_t transferCount , IOR_param_t * test , int access )
2011-06-17 23:20:43 +04:00
{
2011-11-12 03:11:28 +04:00
char testFileName [ MAXPATHLEN ] ;
char bufferLabel1 [ MAX_STR ] ;
char bufferLabel2 [ MAX_STR ] ;
2011-11-12 02:22:17 +04:00
size_t i , j , length , first , last ;
size_t errorCount = 0 ;
int inError = 0 ;
unsigned long long * goodbuf = ( unsigned long long * ) expectedBuffer ;
unsigned long long * testbuf = ( unsigned long long * ) unknownBuffer ;
2017-09-27 19:45:47 +03:00
if ( access = = WRITECHECK | | access = = READCHECK ) {
2011-11-12 02:22:17 +04:00
strcpy ( bufferLabel1 , " Expected: " ) ;
strcpy ( bufferLabel2 , " Actual: " ) ;
} else {
ERR ( " incorrect argument for CompareBuffers() " ) ;
}
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
length = size / sizeof ( IOR_size_t ) ;
first = - 1 ;
if ( verbose > = VERBOSE_3 ) {
2011-06-17 23:20:43 +04:00
fprintf ( stdout ,
2011-11-12 02:22:17 +04:00
" [%d] At file byte offset %lld, comparing %llu-byte transfer \n " ,
rank , test - > offset , ( long long ) size ) ;
}
for ( i = 0 ; i < length ; i + + ) {
if ( testbuf [ i ] ! = goodbuf [ i ] ) {
errorCount + + ;
if ( verbose > = VERBOSE_2 ) {
fprintf ( stdout ,
" [%d] At transfer buffer #%lld, index #%lld (file byte offset %lld): \n " ,
rank , transferCount - 1 , ( long long ) i ,
test - > offset +
( IOR_size_t ) ( i * sizeof ( IOR_size_t ) ) ) ;
2011-12-14 10:04:27 +04:00
fprintf ( stdout , " [%d] %s0x " , rank , bufferLabel1 ) ;
2011-11-12 02:22:17 +04:00
fprintf ( stdout , " %016llx \n " , goodbuf [ i ] ) ;
2011-12-14 10:04:27 +04:00
fprintf ( stdout , " [%d] %s0x " , rank , bufferLabel2 ) ;
2011-11-12 02:22:17 +04:00
fprintf ( stdout , " %016llx \n " , testbuf [ i ] ) ;
}
if ( ! inError ) {
inError = 1 ;
first = i ;
last = i ;
} else {
last = i ;
}
} else if ( verbose > = VERBOSE_5 & & i % 4 = = 0 ) {
fprintf ( stdout ,
" [%d] PASSED offset = %lld bytes, transfer %lld \n " ,
rank ,
( ( i * sizeof ( unsigned long long ) ) +
test - > offset ) , transferCount ) ;
fprintf ( stdout , " [%d] GOOD %s0x " , rank , bufferLabel1 ) ;
for ( j = 0 ; j < 4 ; j + + )
fprintf ( stdout , " %016llx " , goodbuf [ i + j ] ) ;
fprintf ( stdout , " \n [%d] GOOD %s0x " , rank , bufferLabel2 ) ;
for ( j = 0 ; j < 4 ; j + + )
fprintf ( stdout , " %016llx " , testbuf [ i + j ] ) ;
fprintf ( stdout , " \n " ) ;
}
}
if ( inError ) {
inError = 0 ;
GetTestFileName ( testFileName , test ) ;
fprintf ( stdout ,
" [%d] FAILED comparison of buffer containing %d-byte ints: \n " ,
rank , ( int ) sizeof ( unsigned long long int ) ) ;
fprintf ( stdout , " [%d] File name = %s \n " , rank , testFileName ) ;
fprintf ( stdout , " [%d] In transfer %lld, " , rank ,
transferCount ) ;
2011-06-17 23:20:43 +04:00
fprintf ( stdout ,
2011-11-12 02:22:17 +04:00
" %lld errors between buffer indices %lld and %lld. \n " ,
( long long ) errorCount , ( long long ) first ,
( long long ) last ) ;
fprintf ( stdout , " [%d] File byte offset = %lld: \n " , rank ,
( ( first * sizeof ( unsigned long long ) ) + test - > offset ) ) ;
fprintf ( stdout , " [%d] %s0x " , rank , bufferLabel1 ) ;
for ( j = first ; j < length & & j < first + 4 ; j + + )
fprintf ( stdout , " %016llx " , goodbuf [ j ] ) ;
if ( j = = length )
fprintf ( stdout , " [end of buffer] " ) ;
fprintf ( stdout , " \n [%d] %s0x " , rank , bufferLabel2 ) ;
for ( j = first ; j < length & & j < first + 4 ; j + + )
fprintf ( stdout , " %016llx " , testbuf [ j ] ) ;
if ( j = = length )
fprintf ( stdout , " [end of buffer] " ) ;
fprintf ( stdout , " \n " ) ;
if ( test - > quitOnError = = TRUE )
ERR ( " data check error, aborting execution " ) ;
2011-06-17 23:20:43 +04:00
}
2011-11-12 02:22:17 +04:00
return ( errorCount ) ;
2011-11-12 03:11:28 +04:00
}
2011-11-12 02:22:17 +04:00
2011-11-12 03:11:28 +04:00
/*
* Count all errors across all tasks ; report errors found .
*/
2011-11-12 04:40:45 +04:00
static int CountErrors ( IOR_param_t * test , int access , int errors )
2011-11-12 02:22:17 +04:00
{
int allErrors = 0 ;
if ( test - > checkWrite | | test - > checkRead ) {
MPI_CHECK ( MPI_Reduce ( & errors , & allErrors , 1 , MPI_INT , MPI_SUM ,
0 , testComm ) , " cannot reduce errors " ) ;
MPI_CHECK ( MPI_Bcast ( & allErrors , 1 , MPI_INT , 0 , testComm ) ,
" cannot broadcast allErrors value " ) ;
if ( allErrors ! = 0 ) {
totalErrorCount + = allErrors ;
test - > errorFound = TRUE ;
}
if ( rank = = 0 & & allErrors ! = 0 ) {
if ( allErrors < 0 ) {
WARN ( " overflow in errors counted " ) ;
allErrors = - 1 ;
}
2014-08-14 02:53:24 +04:00
fprintf ( stdout , " WARNING: incorrect data on %s (%d errors found). \n " ,
access = = WRITECHECK ? " write " : " read " , allErrors ) ;
2011-11-12 02:22:17 +04:00
fprintf ( stdout ,
" Used Time Stamp %u (0x%x) for Data Signature \n " ,
test - > timeStampSignatureValue ,
test - > timeStampSignatureValue ) ;
}
}
return ( allErrors ) ;
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
/*
2012-09-12 22:21:41 +04:00
* Count the number of tasks that share a host .
*
* This function employees the gethostname ( ) call , rather than using
* MPI_Get_processor_name ( ) . We are interested in knowing the number
* of tasks that share a file system client ( I / O node , compute node ,
* whatever that may be ) . However on machines like BlueGene / Q ,
* MPI_Get_processor_name ( ) uniquely identifies a cpu in a compute node ,
* not the node where the I / O is function shipped to . gethostname ( )
* is assumed to identify the shared filesystem client in more situations .
*
* NOTE : This also assumes that the task count on all nodes is equal
* to the task count on the host running MPI task 0.
2011-06-17 23:20:43 +04:00
*/
2011-11-12 04:40:45 +04:00
static int CountTasksPerNode ( int numTasks , MPI_Comm comm )
2011-06-17 23:20:43 +04:00
{
2017-11-30 13:56:26 +03:00
/* for debugging and testing */
2017-11-29 11:54:22 +03:00
if ( getenv ( " IOR_FAKE_TASK_PER_NODES " ) ) {
int tasksPerNode = atoi ( getenv ( " IOR_FAKE_TASK_PER_NODES " ) ) ;
int rank ;
MPI_Comm_rank ( comm , & rank ) ;
if ( rank = = 0 ) {
printf ( " Fake tasks per node: using %d \n " , tasksPerNode ) ;
}
return tasksPerNode ;
}
2012-09-12 22:21:41 +04:00
char localhost [ MAX_STR ] ;
char hostname0 [ MAX_STR ] ;
2011-11-12 02:22:17 +04:00
static int firstPass = TRUE ;
2012-09-12 22:21:41 +04:00
unsigned count ;
unsigned flag ;
int rc ;
2011-11-12 02:22:17 +04:00
2012-09-12 22:21:41 +04:00
rc = gethostname ( localhost , MAX_STR ) ;
if ( rc = = - 1 ) {
2014-08-14 02:53:24 +04:00
/* This node won't match task 0's hostname...except in the
2012-09-12 22:21:41 +04:00
case where ALL gethostname ( ) calls fail , in which
case ALL nodes will appear to be on the same node .
We ' ll handle that later . */
localhost [ 0 ] = ' \0 ' ;
if ( rank = = 0 )
perror ( " gethostname() failed " ) ;
2011-11-12 02:22:17 +04:00
}
2011-06-17 23:20:43 +04:00
2012-11-21 07:42:28 +04:00
if ( verbose > = VERBOSE_2 & & firstPass ) {
char tmp [ MAX_STR ] ;
sprintf ( tmp , " task %d on %s " , rank , localhost ) ;
OutputToRoot ( numTasks , comm , tmp ) ;
firstPass = FALSE ;
}
2012-09-12 22:21:41 +04:00
/* send task 0's hostname to all tasks */
if ( rank = = 0 )
strcpy ( hostname0 , localhost ) ;
MPI_CHECK ( MPI_Bcast ( hostname0 , MAX_STR , MPI_CHAR , 0 , comm ) ,
" broadcast of task 0's hostname failed " ) ;
if ( strcmp ( hostname0 , localhost ) = = 0 )
flag = 1 ;
else
flag = 0 ;
/* count the tasks share the same host as task 0 */
MPI_Allreduce ( & flag , & count , 1 , MPI_UNSIGNED , MPI_SUM , comm ) ;
if ( hostname0 [ 0 ] = = ' \0 ' )
count = 1 ;
return ( int ) count ;
2011-11-12 04:40:45 +04:00
}
2011-06-17 23:20:43 +04:00
/*
 * Allocate a page-aligned (required by O_DIRECT) buffer.
 *
 * size: usable bytes requested
 *
 * Over-allocates by one page plus one pointer, picks the first page
 * boundary strictly after the pointer-sized slot at the start of the
 * allocation, and stores the malloc()ed address in the bytes just before
 * the returned pointer so that aligned_buffer_free() can release it.
 * Allocation failure is reported through ERR().
 */
static void *aligned_buffer_alloc(size_t size)
{
        const size_t pageSize = getpagesize();
        const size_t pageMask = pageSize - 1;
        char *raw;
        char *pastSlot;
        char *aligned;

        raw = malloc(size + pageSize + sizeof(void *));
        if (raw == NULL)
                ERR("out of memory");

        /* find the aligned buffer, leaving room for the back-pointer */
        pastSlot = raw + sizeof(char *);
        aligned = pastSlot + (pageSize - ((size_t)pastSlot & pageMask));

        /* record the original malloc()ed address right before "aligned" */
        *(void **)(aligned - sizeof(void *)) = raw;

        return (void *)aligned;
}
2011-06-17 23:20:43 +04:00
2012-01-13 08:34:40 +04:00
/*
 * Free a buffer allocated by aligned_buffer_alloc().
 *
 * buf: pointer whose preceding pointer-sized bytes hold the address that
 *      is handed to free()
 */
static void aligned_buffer_free(void *buf)
{
        char *slot = (char *)buf - sizeof(char *);
        void *original = *(void **)slot;

        free(original);
}
2011-12-13 09:00:18 +04:00
/*
 * Allocate the results structure of a test and its per-repetition arrays.
 *
 * test: test whose `results` pointer is filled in; nothing happens when it
 *       is already non-NULL
 *
 * One array element is allocated per test->params.repetitions.  The
 * structure and every array are zero-initialized so that no field is read
 * before it has a defined value.  Allocation failure is reported through
 * ERR().
 */
void AllocResults(IOR_test_t *test)
{
        IOR_results_t *results;
        int reps;

        if (test->results != NULL)
                return;

        reps = test->params.repetitions;

        results = calloc(1, sizeof(*results));
        test->results = results;
        if (results == NULL)
                ERR("malloc of IOR_results_t failed");

        results->writeTime = calloc(reps, sizeof(double));
        if (results->writeTime == NULL)
                ERR("malloc of writeTime array failed");

        results->readTime = calloc(reps, sizeof(double));
        if (results->readTime == NULL)
                ERR("malloc of readTime array failed");

        results->aggFileSizeFromStat = calloc(reps, sizeof(IOR_offset_t));
        if (results->aggFileSizeFromStat == NULL)
                ERR("malloc of aggFileSizeFromStat failed");

        results->aggFileSizeFromXfer = calloc(reps, sizeof(IOR_offset_t));
        if (results->aggFileSizeFromXfer == NULL)
                ERR("malloc of aggFileSizeFromXfer failed");

        results->aggFileSizeForBW = calloc(reps, sizeof(IOR_offset_t));
        if (results->aggFileSizeForBW == NULL)
                ERR("malloc of aggFileSizeForBW failed");
}
/*
 * Release the result arrays and the results structure of 'test'.
 *
 * test->results is reset to NULL afterwards: AllocResults() treats a
 * non-NULL pointer as "already allocated", so leaving the stale address in
 * place would make a later AllocResults() call keep a dangling pointer, and
 * a second FreeResults() call would free the same memory twice.
 */
void FreeResults(IOR_test_t *test)
{
        if (test->results == NULL)
                return;

        free(test->results->aggFileSizeFromStat);
        free(test->results->aggFileSizeFromXfer);
        free(test->results->aggFileSizeForBW);
        free(test->results->readTime);
        free(test->results->writeTime);
        free(test->results);
        test->results = NULL;
}
2011-06-17 23:20:43 +04:00
/*
* Create new test for list of tests .
*/
2011-12-13 09:00:18 +04:00
IOR_test_t * CreateTest ( IOR_param_t * init_params , int test_num )
2011-06-17 23:20:43 +04:00
{
2011-12-13 09:00:18 +04:00
IOR_test_t * newTest = NULL ;
2011-11-12 02:22:17 +04:00
2011-12-13 09:00:18 +04:00
newTest = ( IOR_test_t * ) malloc ( sizeof ( IOR_test_t ) ) ;
2011-11-12 02:22:17 +04:00
if ( newTest = = NULL )
2011-12-13 09:00:18 +04:00
ERR ( " malloc() of IOR_test_t failed " ) ;
newTest - > params = * init_params ;
GetPlatformName ( newTest - > params . platform ) ;
newTest - > params . nodes = init_params - > numTasks / tasksPerNode ;
newTest - > params . tasksPerNode = tasksPerNode ;
newTest - > params . id = test_num ;
newTest - > next = NULL ;
2014-08-14 02:53:24 +04:00
newTest - > results = NULL ;
2011-12-13 09:00:18 +04:00
return newTest ;
}
/*
 * Dispose of a single test: its results first, then the test node itself.
 */
static void DestroyTest(IOR_test_t *test)
{
        /* the result arrays hang off the node, so they must go first */
        FreeResults(test);

        free(test);
}
/*
 * Destroy every test in the singly linked list starting at tests_head.
 */
static void DestroyTests(IOR_test_t *tests_head)
{
        IOR_test_t *current = tests_head;

        while (current != NULL) {
                /* fetch the successor before the node is freed */
                IOR_test_t *following = current->next;

                DestroyTest(current);
                current = following;
        }
}
2011-06-17 23:20:43 +04:00
/*
* Sleep for ' delay ' seconds .
*/
2011-11-12 04:40:45 +04:00
static void DelaySecs ( int delay )
2011-06-17 23:20:43 +04:00
{
2011-11-12 02:22:17 +04:00
if ( rank = = 0 & & delay > 0 ) {
if ( verbose > = VERBOSE_1 )
fprintf ( stdout , " delaying %d seconds . . . \n " , delay ) ;
sleep ( delay ) ;
}
2011-11-12 04:40:45 +04:00
}
2011-06-17 23:20:43 +04:00
/*
* Display freespace ( df ) .
*/
2011-11-12 04:40:45 +04:00
static void DisplayFreespace ( IOR_param_t * test )
2011-06-17 23:20:43 +04:00
{
2011-11-12 02:22:17 +04:00
char fileName [ MAX_STR ] = { 0 } ;
int i ;
int directoryFound = FALSE ;
/* get outfile name */
GetTestFileName ( fileName , test ) ;
/* get directory for outfile */
i = strlen ( fileName ) ;
while ( i - - > 0 ) {
if ( fileName [ i ] = = ' / ' ) {
fileName [ i ] = ' \0 ' ;
directoryFound = TRUE ;
break ;
}
2011-06-17 23:20:43 +04:00
}
2011-11-12 02:22:17 +04:00
/* if no directory/, use '.' */
if ( directoryFound = = FALSE ) {
strcpy ( fileName , " . " ) ;
}
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
ShowFileSystemSize ( fileName ) ;
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
return ;
2011-11-12 04:40:45 +04:00
}
2011-06-17 23:20:43 +04:00
/*
 * Display usage of script file.
 *
 * Prints "Usage: <argv[0]> [OPTIONS]" followed by the option table, one entry
 * per line, on stdout.  Printing stops at the first empty entry or at the
 * end of the table, whichever comes first; bounding the loop by the table
 * size keeps it inside the array even when no entry is empty.
 */
static void DisplayUsage(char **argv)
{
        static const char *const opts[] = {
                " OPTIONS: ",
                " -a S api -- API for I/O [POSIX|MPIIO|HDF5|HDFS|S3|S3_EMC|NCMPI] ",
                " -A N refNum -- user supplied reference number to include in the summary ",
                " -b N blockSize -- contiguous bytes to write per task (e.g.: 8, 4k, 2m, 1g) ",
                " -B useO_DIRECT -- uses O_DIRECT for POSIX, bypassing I/O buffers ",
                " -c collective -- collective I/O ",
                " -C reorderTasks -- changes task ordering to n+1 ordering for readback ",
                " -d N interTestDelay -- delay between reps in seconds ",
                " -D N deadlineForStonewalling -- seconds before stopping write or read phase ",
                " -O stoneWallingWearOut=1 -- once the stonewalling timout is over, all process finish to access the amount of data ",
                " -O stoneWallingWearOutIterations=N -- stop after processing this number of iterations, needed for reading data back written with stoneWallingWearOut ",
                " -e fsync -- perform fsync upon POSIX write close ",
                " -E useExistingTestFile -- do not remove test file before write access ",
                " -f S scriptFile -- test script name ",
                " -F filePerProc -- file-per-process ",
                " -g intraTestBarriers -- use barriers between open, write/read, and close ",
                " -G N setTimeStampSignature -- set value for time stamp signature/random seed ",
                " -h showHelp -- displays options and help ",
                " -H showHints -- show hints ",
                " -i N repetitions -- number of repetitions of test ",
                " -I individualDataSets -- datasets not shared by all procs [not working] ",
                " -j N outlierThreshold -- warn on outlier N seconds from mean ",
                " -J N setAlignment -- HDF5 alignment in bytes (e.g.: 8, 4k, 2m, 1g) ",
                " -k keepFile -- don't remove the test file(s) on program exit ",
                " -K keepFileWithError -- keep error-filled file(s) after data-checking ",
                " -l datapacket type-- type of packet that will be created [offset|incompressible|timestamp|o|i|t] ",
                " -m multiFile -- use number of reps (-i) for multiple file count ",
                " -M N memoryPerNode -- hog memory on the node (e.g.: 2g, 75%) ",
                " -n noFill -- no fill in HDF5 file creation ",
                " -N N numTasks -- number of tasks that should participate in the test ",
                " -o S testFile -- full name for test ",
                " -O S string of IOR directives (e.g. -O checkRead=1,lustreStripeCount=32) ",
                " -p preallocate -- preallocate file size ",
                " -P useSharedFilePointer -- use shared file pointer [not working] ",
                " -q quitOnError -- during file error-checking, abort on error ",
                " -Q N taskPerNodeOffset for read tests use with -C & -Z options (-C constant N, -Z at least N) ",
                " -r readFile -- read existing file ",
                " -R checkRead -- verify that the output of read matches the expected signature (used with -G) ",
                " -s N segmentCount -- number of segments ",
                " -S useStridedDatatype -- put strided access into datatype [not working] ",
                " -t N transferSize -- size of transfer in bytes (e.g.: 8, 4k, 2m, 1g) ",
                " -T N maxTimeDuration -- max time in minutes for each test ",
                " -u uniqueDir -- use unique directory name for each file-per-process ",
                " -U S hintsFileName -- full name for hints file ",
                " -v verbose -- output information (repeating flag increases level) ",
                " -V useFileView -- use MPI_File_set_view ",
                " -w writeFile -- write file ",
                " -W checkWrite -- check read after write ",
                " -x singleXferAttempt -- do not retry transfer if incomplete ",
                " -X N reorderTasksRandomSeed -- random seed for -Z option ",
                " -Y fsyncPerWrite -- perform fsync after each POSIX write ",
                " -z randomOffset -- access is to random, not sequential, offsets within a file ",
                " -Z reorderTasksRandom -- changes task ordering to random ordering for readback ",
                " ",
                " NOTE: S is a string, N is an integer number. ",
                " ",
                " "
        };
        const size_t optCount = sizeof(opts) / sizeof(opts[0]);
        size_t i;

        fprintf(stdout, " Usage: %s [OPTIONS] \n \n ", *argv);

        /* an empty entry still ends the listing, but never leave the table */
        for (i = 0; i < optCount && strlen(opts[i]) > 0; i++)
                fprintf(stdout, " %s \n ", opts[i]);

        return;
}
2011-06-17 23:20:43 +04:00
/*
* Distribute IOR_HINTs to all tasks ' environments .
*/
2011-11-12 02:22:17 +04:00
void DistributeHints ( void )
2011-06-17 23:20:43 +04:00
{
2011-11-12 02:22:17 +04:00
char hint [ MAX_HINTS ] [ MAX_STR ] , fullHint [ MAX_STR ] , hintVariable [ MAX_STR ] ;
int hintCount = 0 , i ;
if ( rank = = 0 ) {
for ( i = 0 ; environ [ i ] ! = NULL ; i + + ) {
if ( strncmp ( environ [ i ] , " IOR_HINT " , strlen ( " IOR_HINT " ) )
= = 0 ) {
hintCount + + ;
if ( hintCount = = MAX_HINTS ) {
WARN ( " exceeded max hints; reset MAX_HINTS and recompile " ) ;
hintCount = MAX_HINTS ;
break ;
}
/* assume no IOR_HINT is greater than MAX_STR in length */
strncpy ( hint [ hintCount - 1 ] , environ [ i ] ,
MAX_STR - 1 ) ;
}
2011-06-17 23:20:43 +04:00
}
}
2011-11-12 02:22:17 +04:00
MPI_CHECK ( MPI_Bcast ( & hintCount , sizeof ( hintCount ) , MPI_BYTE ,
2011-06-17 23:20:43 +04:00
0 , MPI_COMM_WORLD ) , " cannot broadcast hints " ) ;
2011-11-12 02:22:17 +04:00
for ( i = 0 ; i < hintCount ; i + + ) {
MPI_CHECK ( MPI_Bcast ( & hint [ i ] , MAX_STR , MPI_BYTE ,
0 , MPI_COMM_WORLD ) ,
" cannot broadcast hints " ) ;
strcpy ( fullHint , hint [ i ] ) ;
strcpy ( hintVariable , strtok ( fullHint , " = " ) ) ;
if ( getenv ( hintVariable ) = = NULL ) {
/* doesn't exist in this task's environment; better set it */
if ( putenv ( hint [ i ] ) ! = 0 )
WARN ( " cannot set environment variable " ) ;
}
2011-06-17 23:20:43 +04:00
}
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
/*
 * Describes FillBuffer(), defined further below:
 * Fill buffer, which is transfer size bytes long, with known 8-byte long long
 * int values.  In even-numbered 8-byte long long ints, store MPI task in high
 * bits and timestamp signature in low bits.  In odd-numbered 8-byte long long
 * ints, store transfer offset.  If storeFileOffset option is used, the file
 * (not transfer) offset is stored instead.
 */
2015-05-21 21:05:56 +03:00
/*
 * Fill 'buffer' with pseudo-random 8-byte words drawn from rand_r(), using
 * and advancing test->incompressibleSeed.  Each word combines two
 * consecutive rand_r() results: the first shifted into the upper 32 bits,
 * the second OR-ed into the lower part.  transferSize / 8 words are written.
 */
static void
FillIncompressibleBuffer(void *buffer, IOR_param_t *test)
{
        unsigned long long *word = (unsigned long long *)buffer;
        size_t k = 0;

        while (k < test->transferSize / sizeof(unsigned long long)) {
                /* the draw order (upper half first) determines the sequence */
                unsigned long long upper =
                    ((unsigned long long)rand_r(&test->incompressibleSeed) << 32);
                unsigned long long lower =
                    (unsigned long long)rand_r(&test->incompressibleSeed);

                word[k] = upper | lower;
                k++;
        }
}
unsigned int reseed_incompressible_prng = TRUE ;
2011-11-12 04:40:45 +04:00
static void
2011-11-12 02:22:17 +04:00
FillBuffer ( void * buffer ,
IOR_param_t * test , unsigned long long offset , int fillrank )
2011-06-17 23:20:43 +04:00
{
2011-11-12 02:22:17 +04:00
size_t i ;
unsigned long long hi , lo ;
unsigned long long * buf = ( unsigned long long * ) buffer ;
2017-09-27 19:45:47 +03:00
2015-05-21 21:05:56 +03:00
if ( test - > dataPacketType = = incompressible ) { /* Make for some non compressable buffers with randomish data */
2011-11-12 02:22:17 +04:00
2015-05-21 21:05:56 +03:00
/* In order for write checks to work, we have to restart the psuedo random sequence */
if ( reseed_incompressible_prng = = TRUE ) {
2017-09-27 19:45:47 +03:00
test - > incompressibleSeed = test - > setTimeStampSignature + rank ; /* We copied seed into timestampSignature at initialization, also add the rank to add randomness between processes */
2015-05-21 21:05:56 +03:00
reseed_incompressible_prng = FALSE ;
}
FillIncompressibleBuffer ( buffer , test ) ;
}
2017-09-27 19:45:47 +03:00
2015-05-21 21:05:56 +03:00
else {
hi = ( ( unsigned long long ) fillrank ) < < 32 ;
lo = ( unsigned long long ) test - > timeStampSignatureValue ;
for ( i = 0 ; i < test - > transferSize / sizeof ( unsigned long long ) ; i + + ) {
if ( ( i % 2 ) = = 0 ) {
/* evens contain MPI rank and time in seconds */
buf [ i ] = hi | lo ;
} else {
/* odds contain offset */
buf [ i ] = offset + ( i * sizeof ( unsigned long long ) ) ;
}
2011-11-12 02:22:17 +04:00
}
2011-06-17 23:20:43 +04:00
}
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
/*
* Return string describing machine name and type .
*/
2011-11-12 02:22:17 +04:00
void GetPlatformName ( char * platformName )
2011-06-17 23:20:43 +04:00
{
2011-11-12 02:22:17 +04:00
char nodeName [ MAX_STR ] , * p , * start , sysName [ MAX_STR ] ;
struct utsname name ;
if ( uname ( & name ) ! = 0 ) {
2011-12-15 01:40:25 +04:00
EWARN ( " cannot get platform name " ) ;
2011-11-12 02:22:17 +04:00
sprintf ( sysName , " %s " , " Unknown " ) ;
sprintf ( nodeName , " %s " , " Unknown " ) ;
2011-06-17 23:20:43 +04:00
} else {
2011-11-12 02:22:17 +04:00
sprintf ( sysName , " %s " , name . sysname ) ;
sprintf ( nodeName , " %s " , name . nodename ) ;
2011-06-17 23:20:43 +04:00
}
2011-11-12 02:22:17 +04:00
start = nodeName ;
if ( strlen ( nodeName ) = = 0 ) {
p = start ;
} else {
/* point to one character back from '\0' */
p = start + strlen ( nodeName ) - 1 ;
}
/*
* to cut off trailing node number , search backwards
* for the first non - numeric character
*/
while ( p ! = start ) {
if ( * p < ' 0 ' | | * p > ' 9 ' ) {
* ( p + 1 ) = ' \0 ' ;
break ;
} else {
p - - ;
}
}
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
sprintf ( platformName , " %s(%s) " , nodeName , sysName ) ;
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
/*
* Return test file name to access .
* for single shared file , fileNames [ 0 ] is returned in testFileName
*/
2011-11-12 04:40:45 +04:00
static void GetTestFileName ( char * testFileName , IOR_param_t * test )
2011-06-17 23:20:43 +04:00
{
2014-08-29 01:39:44 +04:00
char * * fileNames ;
char initialTestFileName [ MAXPATHLEN ] ;
char testFileNameRoot [ MAX_STR ] ;
char tmpString [ MAX_STR ] ;
2011-11-12 02:22:17 +04:00
int count ;
/* parse filename for multiple file systems */
strcpy ( initialTestFileName , test - > testFileName ) ;
fileNames = ParseFileName ( initialTestFileName , & count ) ;
if ( count > 1 & & test - > uniqueDir = = TRUE )
ERR ( " cannot use multiple file names with unique directories " ) ;
if ( test - > filePerProc ) {
strcpy ( testFileNameRoot ,
fileNames [ ( ( rank +
rankOffset ) % test - > numTasks ) % count ] ) ;
} else {
strcpy ( testFileNameRoot , fileNames [ 0 ] ) ;
}
/* give unique name if using multiple files */
if ( test - > filePerProc ) {
/*
* prepend rank subdirectory before filename
* e . g . , / dir / file = > / dir / < rank > / file
*/
if ( test - > uniqueDir = = TRUE ) {
strcpy ( testFileNameRoot ,
PrependDir ( test , testFileNameRoot ) ) ;
}
sprintf ( testFileName , " %s.%08d " , testFileNameRoot ,
( rank + rankOffset ) % test - > numTasks ) ;
} else {
strcpy ( testFileName , testFileNameRoot ) ;
2011-06-17 23:20:43 +04:00
}
2011-11-12 02:22:17 +04:00
/* add suffix for multiple files */
if ( test - > repCounter > - 1 ) {
sprintf ( tmpString , " .%d " , test - > repCounter ) ;
strcat ( testFileName , tmpString ) ;
}
2014-04-03 22:49:16 +04:00
free ( fileNames ) ;
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
/*
* Get time stamp . Use MPI_Timer ( ) unless _NO_MPI_TIMER is defined ,
* in which case use gettimeofday ( ) .
*/
2011-11-12 04:40:45 +04:00
static double GetTimeStamp ( void )
2011-06-17 23:20:43 +04:00
{
2011-11-12 02:22:17 +04:00
double timeVal ;
2011-06-17 23:20:43 +04:00
# ifdef _NO_MPI_TIMER
2011-11-12 02:22:17 +04:00
struct timeval timer ;
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
if ( gettimeofday ( & timer , ( struct timezone * ) NULL ) ! = 0 )
ERR ( " cannot use gettimeofday() " ) ;
timeVal = ( double ) timer . tv_sec + ( ( double ) timer . tv_usec / 1000000 ) ;
# else /* not _NO_MPI_TIMER */
timeVal = MPI_Wtime ( ) ; /* no MPI_CHECK(), just check return value */
if ( timeVal < 0 )
ERR ( " cannot use MPI_Wtime() " ) ;
# endif /* _NO_MPI_TIMER */
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
/* wall_clock_delta is difference from root node's time */
timeVal - = wall_clock_delta ;
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
return ( timeVal ) ;
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
/*
2014-04-03 22:49:16 +04:00
* Convert IOR_offset_t value to human readable string . This routine uses a
* statically - allocated buffer internally and so is not re - entrant .
2011-06-17 23:20:43 +04:00
*/
2011-11-12 04:40:45 +04:00
static char * HumanReadable ( IOR_offset_t value , int base )
2011-06-17 23:20:43 +04:00
{
2014-04-03 22:49:16 +04:00
static char valueStr [ MAX_STR ] ;
2011-11-12 02:22:17 +04:00
int m = 0 , g = 0 ;
char m_str [ 8 ] , g_str [ 8 ] ;
if ( base = = BASE_TWO ) {
m = MEBIBYTE ;
g = GIBIBYTE ;
strcpy ( m_str , " MiB " ) ;
strcpy ( g_str , " GiB " ) ;
} else if ( base = = BASE_TEN ) {
m = MEGABYTE ;
g = GIGABYTE ;
strcpy ( m_str , " MB " ) ;
strcpy ( g_str , " GB " ) ;
2011-06-17 23:20:43 +04:00
}
2011-11-12 02:22:17 +04:00
if ( value > = g ) {
if ( value % ( IOR_offset_t ) g ) {
2014-04-03 22:49:16 +04:00
snprintf ( valueStr , MAX_STR - 1 , " %.2f %s " ,
2011-11-12 02:22:17 +04:00
( double ) ( ( double ) value / g ) , g_str ) ;
} else {
2014-04-03 22:49:16 +04:00
snprintf ( valueStr , MAX_STR - 1 , " %d %s " , ( int ) ( value / g ) , g_str ) ;
2011-11-12 02:22:17 +04:00
}
} else if ( value > = m ) {
if ( value % ( IOR_offset_t ) m ) {
2014-04-03 22:49:16 +04:00
snprintf ( valueStr , MAX_STR - 1 , " %.2f %s " ,
2011-11-12 02:22:17 +04:00
( double ) ( ( double ) value / m ) , m_str ) ;
} else {
2014-04-03 22:49:16 +04:00
snprintf ( valueStr , MAX_STR - 1 , " %d %s " , ( int ) ( value / m ) , m_str ) ;
2011-11-12 02:22:17 +04:00
}
} else if ( value > = 0 ) {
2014-04-03 22:49:16 +04:00
snprintf ( valueStr , MAX_STR - 1 , " %d bytes " , ( int ) value ) ;
2011-06-17 23:20:43 +04:00
} else {
2014-04-03 22:49:16 +04:00
snprintf ( valueStr , MAX_STR - 1 , " - " ) ;
2011-06-17 23:20:43 +04:00
}
2011-11-12 02:22:17 +04:00
return valueStr ;
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
/*
* Parse file name .
*/
2011-11-12 04:40:45 +04:00
static char * * ParseFileName ( char * name , int * count )
2011-06-17 23:20:43 +04:00
{
2011-11-12 02:22:17 +04:00
char * * fileNames , * tmp , * token ;
char delimiterString [ 3 ] = { FILENAME_DELIMITER , ' \n ' , ' \0 ' } ;
int i = 0 ;
* count = 0 ;
tmp = name ;
/* pass one */
/* if something there, count the first item */
if ( * tmp ! = ' \0 ' ) {
( * count ) + + ;
}
/* count the rest of the filenames */
while ( * tmp ! = ' \0 ' ) {
if ( * tmp = = FILENAME_DELIMITER ) {
( * count ) + + ;
}
tmp + + ;
}
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
fileNames = ( char * * ) malloc ( ( * count ) * sizeof ( char * * ) ) ;
if ( fileNames = = NULL )
ERR ( " out of memory " ) ;
/* pass two */
token = strtok ( name , delimiterString ) ;
while ( token ! = NULL ) {
fileNames [ i ] = token ;
token = strtok ( NULL , delimiterString ) ;
i + + ;
}
return ( fileNames ) ;
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
/*
 * Pretty Print a Double.  The first parameter is a flag determining if left
 * justification should be used.  The third parameter is a null-terminated
 * string that should be appended to the number field.
 *
 * Negative numbers are printed as a dash.  Otherwise the field width is 10
 * and the precision depends on the magnitude: 6 digits below 1, 2 digits
 * below 3600, none from 3600 upwards.
 */
static void PPDouble(int leftjustify, double number, char *append)
{
        char format[16];
        const int width = 10;
        int precision;

        if (number < 0) {
                fprintf(stdout, " - %s ", append);
                return;
        }

        precision = (number < 1) ? 6 : ((number < 3600) ? 2 : 0);

        /* build the conversion at run time so the flag can be optional */
        if (leftjustify)
                sprintf(format, " %%%s%d.%df%%s ", " - ", width, precision);
        else
                sprintf(format, " %%%s%d.%df%%s ", " ", width, precision);

        printf(format, number, append);
}
2011-06-17 23:20:43 +04:00
/*
* From absolute directory , insert rank as subdirectory . Allows each task
* to write to its own directory . E . g . , / dir / file = > / dir / < rank > / file .
*/
2011-11-12 04:40:45 +04:00
static char * PrependDir ( IOR_param_t * test , char * rootDir )
2011-06-17 23:20:43 +04:00
{
2011-11-12 02:22:17 +04:00
char * dir ;
char fname [ MAX_STR + 1 ] ;
char * p ;
int i ;
dir = ( char * ) malloc ( MAX_STR + 1 ) ;
if ( dir = = NULL )
ERR ( " out of memory " ) ;
/* get dir name */
strcpy ( dir , rootDir ) ;
i = strlen ( dir ) - 1 ;
while ( i > 0 ) {
if ( dir [ i ] = = ' \0 ' | | dir [ i ] = = ' / ' ) {
dir [ i ] = ' / ' ;
dir [ i + 1 ] = ' \0 ' ;
break ;
}
i - - ;
}
/* get file name */
strcpy ( fname , rootDir ) ;
p = fname ;
while ( i > 0 ) {
if ( fname [ i ] = = ' \0 ' | | fname [ i ] = = ' / ' ) {
p = fname + ( i + 1 ) ;
break ;
}
i - - ;
}
/* create directory with rank as subdirectory */
sprintf ( dir , " %s%d " , dir , ( rank + rankOffset ) % test - > numTasks ) ;
/* dir doesn't exist, so create */
if ( access ( dir , F_OK ) ! = 0 ) {
if ( mkdir ( dir , S_IRWXU ) < 0 ) {
ERR ( " cannot create directory " ) ;
}
/* check if correct permissions */
} else if ( access ( dir , R_OK ) ! = 0 | | access ( dir , W_OK ) ! = 0 | |
access ( dir , X_OK ) ! = 0 ) {
ERR ( " invalid directory permissions " ) ;
}
/* concatenate dir and file names */
strcat ( dir , " / " ) ;
strcat ( dir , p ) ;
return dir ;
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
/******************************************************************************/
/*
* Reduce test results , and show if verbose set .
*/
2011-12-13 09:00:18 +04:00
static void ReduceIterResults ( IOR_test_t * test , double * * timer , int rep ,
2011-11-12 04:40:45 +04:00
int access )
2011-06-17 23:20:43 +04:00
{
2011-12-11 08:45:19 +04:00
double reduced [ 12 ] = { 0 } ;
2014-08-14 02:53:24 +04:00
double diff [ 6 ] ;
double * diff_subset ;
double totalTime ;
double bw ;
2011-11-12 02:22:17 +04:00
enum { RIGHT , LEFT } ;
int i ;
MPI_Op op ;
2014-08-14 02:53:24 +04:00
assert ( access = = WRITE | | access = = READ ) ;
2011-12-11 08:45:19 +04:00
2011-11-12 02:22:17 +04:00
/* Find the minimum start time of the even numbered timers, and the
maximum finish time for the odd numbered timers */
for ( i = 0 ; i < 12 ; i + + ) {
op = i % 2 ? MPI_MAX : MPI_MIN ;
MPI_CHECK ( MPI_Reduce ( & timer [ i ] [ rep ] , & reduced [ i ] , 1 , MPI_DOUBLE ,
op , 0 , testComm ) , " MPI_Reduce() " ) ;
2011-06-17 23:20:43 +04:00
}
2011-12-11 08:45:19 +04:00
if ( rank ! = 0 ) {
2014-08-14 02:53:24 +04:00
/* Only rank 0 tallies and prints the results. */
return ;
}
/* Calculate elapsed times and throughput numbers */
for ( i = 0 ; i < 6 ; i + + ) {
diff [ i ] = reduced [ 2 * i + 1 ] - reduced [ 2 * i ] ;
}
if ( access = = WRITE ) {
totalTime = reduced [ 5 ] - reduced [ 0 ] ;
test - > results - > writeTime [ rep ] = totalTime ;
diff_subset = & diff [ 0 ] ;
} else { /* READ */
totalTime = reduced [ 11 ] - reduced [ 6 ] ;
test - > results - > readTime [ rep ] = totalTime ;
diff_subset = & diff [ 3 ] ;
}
2011-11-12 02:22:17 +04:00
2011-12-11 08:45:19 +04:00
if ( verbose < VERBOSE_0 ) {
2014-08-14 02:53:24 +04:00
return ;
}
fprintf ( stdout , " %-10s " , access = = WRITE ? " write " : " read " ) ;
bw = ( double ) test - > results - > aggFileSizeForBW [ rep ] / totalTime ;
PPDouble ( LEFT , bw / MEBIBYTE , " " ) ;
PPDouble ( LEFT , ( double ) test - > params . blockSize / KIBIBYTE , " " ) ;
PPDouble ( LEFT , ( double ) test - > params . transferSize / KIBIBYTE , " " ) ;
PPDouble ( LEFT , diff_subset [ 0 ] , " " ) ;
PPDouble ( LEFT , diff_subset [ 1 ] , " " ) ;
PPDouble ( LEFT , diff_subset [ 2 ] , " " ) ;
PPDouble ( LEFT , totalTime , " " ) ;
fprintf ( stdout , " %-4d \n " , rep ) ;
fflush ( stdout ) ;
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
2012-01-09 06:55:46 +04:00
static void PrintRemoveTiming ( double start , double finish , int rep )
{
2012-01-14 01:49:30 +04:00
if ( rank ! = 0 | | verbose < VERBOSE_0 )
2014-08-14 02:53:24 +04:00
return ;
2012-01-09 06:55:46 +04:00
printf ( " remove - - - - - - " ) ;
PPDouble ( 1 , finish - start , " " ) ;
printf ( " %-4d \n " , rep ) ;
}
2011-06-17 23:20:43 +04:00
/*
 * Check for file(s), then remove all files if file-per-proc, else single file.
Algorithms 'S3', 'S3_plus', and 'S3_EMC' are all available.
These are variants on S3. S3 uses the "pure" S3 interface, e.g. using
Multi-Part-Upload. The "plus" variant enables EMC-extensions in the aws4c
library. This allows the N:N case to use "append", in the case where
"transfer_size" != "block_size" for IOR. In pure S3, the N:N case will
fail, because the EMC-extensions won't be enabled, and appending (which
attempts to use the EMC byte-range tricks to do this) will throw an error.
In the S3_EMC alg, N:1 uses EMC's other byte-range tricks to write different
parts of an N:1 file, and also uses append to write the parts of an N:N
file. Preliminary tests show these EMC extensions look to improve BW by
~20%.
I put all three algs in aiori-S3.c, because it seemed some code was getting
reused. Not sure if that's still going to make sense after the TBD, below.
TBD: Recently realized that the "pure" S3 shouldn't be trying to use
appends for anything. In the N:N case, it should just use MPU, within each
file. Then, there's no need for S3_plus. We just have S3, which does MPU
for all writes where transfer_size != block_size, and uses (standard)
byte-range reads for reading. Then S3_EMC uses "appends" for N:N writes,
and byte-range writes for N:1 writes. This separates the code for the two
algs a little more, but we might still want them in the same file.
2014-10-30 01:04:30 +03:00
*
2011-06-17 23:20:43 +04:00
*/
2011-11-12 04:40:45 +04:00
/*
 * Remove the test file(s) if present.
 *
 * Shared file: rank 0 alone deletes testFileName when access() reports that
 * it exists.  File-per-process: every task deletes its own file; with
 * reorderTasksRandom set, rankOffset is temporarily forced to 0 so that
 * testFileName names this task's own file, and both rankOffset and
 * testFileName are restored afterwards.
 */
static void RemoveFile(char *testFileName, int filePerProc, IOR_param_t *test)
{
        int savedRankOffset;

        if (!filePerProc) {
                // BUG: "access()" assumes a POSIX filesystem.  Maybe use
                //      backend->get_file_size(), instead, (and catch
                //      errors), or extend the aiori struct to include
                //      something to safely check for existence of the
                //      "file".
                //
                if ((rank == 0) && (access(testFileName, F_OK) == 0)) {
                        backend->delete(testFileName, test);
                }
                return;
        }

        /* in random tasks, delete own file */
        if (test->reorderTasksRandom == TRUE) {
                savedRankOffset = rankOffset;
                rankOffset = 0;
                GetTestFileName(testFileName, test);
        }

        if (access(testFileName, F_OK) == 0) {
                backend->delete(testFileName, test);
        }

        /* put the offset back and regenerate the name that goes with it */
        if (test->reorderTasksRandom == TRUE) {
                rankOffset = savedRankOffset;
                GetTestFileName(testFileName, test);
        }
}
2011-06-17 23:20:43 +04:00
2012-01-13 08:34:40 +04:00
/*
* Determine any spread ( range ) between node times .
*/
static double TimeDeviation ( void )
{
double timestamp ;
double min = 0 ;
double max = 0 ;
double roottimestamp ;
MPI_CHECK ( MPI_Barrier ( MPI_COMM_WORLD ) , " barrier error " ) ;
timestamp = GetTimeStamp ( ) ;
MPI_CHECK ( MPI_Reduce ( & timestamp , & min , 1 , MPI_DOUBLE ,
MPI_MIN , 0 , MPI_COMM_WORLD ) ,
" cannot reduce tasks' times " ) ;
MPI_CHECK ( MPI_Reduce ( & timestamp , & max , 1 , MPI_DOUBLE ,
MPI_MAX , 0 , MPI_COMM_WORLD ) ,
" cannot reduce tasks' times " ) ;
/* delta between individual nodes' time and root node's time */
roottimestamp = timestamp ;
MPI_CHECK ( MPI_Bcast ( & roottimestamp , 1 , MPI_DOUBLE , 0 , MPI_COMM_WORLD ) ,
" cannot broadcast root's time " ) ;
wall_clock_delta = timestamp - roottimestamp ;
return max - min ;
}
2011-06-17 23:20:43 +04:00
/*
* Setup tests by parsing commandline and creating test script .
2015-05-19 18:36:28 +03:00
* Perform a sanity - check on the configured parameters .
2011-06-17 23:20:43 +04:00
*/
2011-12-13 09:00:18 +04:00
static IOR_test_t * SetupTests ( int argc , char * * argv )
2011-06-17 23:20:43 +04:00
{
2011-12-13 09:00:18 +04:00
IOR_test_t * tests , * testsHead ;
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
/* count the tasks per node */
tasksPerNode = CountTasksPerNode ( numTasksWorld , MPI_COMM_WORLD ) ;
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
testsHead = tests = ParseCommandLine ( argc , argv ) ;
/*
* Since there is no guarantee that anyone other than
* task 0 has the environment settings for the hints , pass
* the hint = value pair to everyone else in MPI_COMM_WORLD
*/
DistributeHints ( ) ;
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
/* check validity of tests and create test queue */
while ( tests ! = NULL ) {
2015-05-19 18:36:28 +03:00
ValidateTests ( & tests - > params ) ;
2011-12-13 09:00:18 +04:00
tests = tests - > next ;
2011-11-12 02:22:17 +04:00
}
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
/* check for skew between tasks' start times */
wall_clock_deviation = TimeDeviation ( ) ;
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
/* seed random number generator */
SeedRandGen ( MPI_COMM_WORLD ) ;
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
return ( testsHead ) ;
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
2011-11-12 03:11:28 +04:00
/*
* Setup transfer buffers , creating and filling as needed .
*/
2017-09-27 19:45:47 +03:00
static void XferBuffersSetup ( IOR_io_buffers * ioBuffers , IOR_param_t * test ,
2015-05-27 19:24:52 +03:00
int pretendRank )
2011-06-17 23:20:43 +04:00
{
2015-05-27 19:24:52 +03:00
ioBuffers - > buffer = aligned_buffer_alloc ( test - > transferSize ) ;
FillBuffer ( ioBuffers - > buffer , test , 0 , pretendRank ) ;
if ( test - > checkWrite | | test - > checkRead ) {
ioBuffers - > checkBuffer = aligned_buffer_alloc ( test - > transferSize ) ;
2012-01-13 08:34:40 +04:00
}
2017-11-29 12:17:02 +03:00
if ( test - > checkRead | | test - > checkWrite ) {
2015-05-27 19:24:52 +03:00
ioBuffers - > readCheckBuffer = aligned_buffer_alloc ( test - > transferSize ) ;
2012-01-13 08:34:40 +04:00
}
return ;
}
/*
* Free transfer buffers .
*/
2015-05-27 19:24:52 +03:00
static void XferBuffersFree ( IOR_io_buffers * ioBuffers , IOR_param_t * test )
2012-01-13 08:34:40 +04:00
{
2015-05-27 19:24:52 +03:00
aligned_buffer_free ( ioBuffers - > buffer ) ;
if ( test - > checkWrite | | test - > checkRead ) {
aligned_buffer_free ( ioBuffers - > checkBuffer ) ;
2012-01-13 08:34:40 +04:00
}
2015-05-27 19:24:52 +03:00
if ( test - > checkRead ) {
aligned_buffer_free ( ioBuffers - > readCheckBuffer ) ;
2011-06-17 23:20:43 +04:00
}
2012-01-13 08:34:40 +04:00
2011-11-12 02:22:17 +04:00
return ;
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
2012-09-10 21:50:18 +04:00
/*
* Message to print immediately after MPI_Init so we know that
* ior has started .
*/
static void PrintEarlyHeader ( )
{
2014-08-14 02:53:24 +04:00
if ( rank ! = 0 )
return ;
2012-09-10 21:50:18 +04:00
2014-08-14 02:53:24 +04:00
printf ( " IOR- " META_VERSION " : MPI Coordinated Test of Parallel I/O \n " ) ;
printf ( " \n " ) ;
fflush ( stdout ) ;
2012-09-10 21:50:18 +04:00
}
2011-12-11 13:50:19 +04:00
static void PrintHeader ( int argc , char * * argv )
2011-06-17 23:20:43 +04:00
{
2011-11-12 02:22:17 +04:00
struct utsname unamebuf ;
2014-08-14 02:53:24 +04:00
int i ;
2011-11-12 02:22:17 +04:00
2014-08-14 02:53:24 +04:00
if ( rank ! = 0 )
return ;
2011-11-12 02:22:17 +04:00
2011-12-13 13:07:55 +04:00
fprintf ( stdout , " Began: %s " , CurrentTimeString ( ) ) ;
2011-11-12 02:22:17 +04:00
fprintf ( stdout , " Command line used: " ) ;
for ( i = 0 ; i < argc ; i + + ) {
fprintf ( stdout , " %s " , argv [ i ] ) ;
}
fprintf ( stdout , " \n " ) ;
if ( uname ( & unamebuf ) ! = 0 ) {
2011-12-15 01:40:25 +04:00
EWARN ( " uname failed " ) ;
2011-11-12 02:22:17 +04:00
fprintf ( stdout , " Machine: Unknown " ) ;
} else {
fprintf ( stdout , " Machine: %s %s " , unamebuf . sysname ,
unamebuf . nodename ) ;
if ( verbose > = VERBOSE_2 ) {
fprintf ( stdout , " %s %s %s " , unamebuf . release ,
unamebuf . version , unamebuf . machine ) ;
}
}
2014-08-14 02:53:24 +04:00
fprintf ( stdout , " \n " ) ;
2011-06-17 23:20:43 +04:00
# ifdef _NO_MPI_TIMER
if ( verbose > = VERBOSE_2 )
2011-11-12 02:22:17 +04:00
fprintf ( stdout , " Using unsynchronized POSIX timer \n " ) ;
# else /* not _NO_MPI_TIMER */
if ( MPI_WTIME_IS_GLOBAL ) {
if ( verbose > = VERBOSE_2 )
fprintf ( stdout , " Using synchronized MPI timer \n " ) ;
} else {
if ( verbose > = VERBOSE_2 )
fprintf ( stdout , " Using unsynchronized MPI timer \n " ) ;
}
# endif /* _NO_MPI_TIMER */
if ( verbose > = VERBOSE_1 ) {
fprintf ( stdout , " Start time skew across all tasks: %.02f sec \n " ,
wall_clock_deviation ) ;
}
if ( verbose > = VERBOSE_3 ) { /* show env */
fprintf ( stdout , " STARTING ENVIRON LOOP \n " ) ;
for ( i = 0 ; environ [ i ] ! = NULL ; i + + ) {
fprintf ( stdout , " %s \n " , environ [ i ] ) ;
}
fprintf ( stdout , " ENDING ENVIRON LOOP \n " ) ;
}
2014-08-14 02:53:24 +04:00
fflush ( stdout ) ;
2011-12-11 13:50:19 +04:00
}
/*
* Print header information for test output .
*/
2011-12-13 12:41:19 +04:00
static void ShowTestInfo ( IOR_param_t * params )
2011-12-11 13:50:19 +04:00
{
2014-08-14 02:53:24 +04:00
fprintf ( stdout , " \n " ) ;
2011-12-13 12:41:19 +04:00
fprintf ( stdout , " Test %d started: %s " , params - > id , CurrentTimeString ( ) ) ;
2011-12-11 13:50:19 +04:00
if ( verbose > = VERBOSE_1 ) {
/* if pvfs2:, then skip */
2011-12-13 12:41:19 +04:00
if ( Regex ( params - > testFileName , " ^[a-z][a-z].*: " ) = = 0 ) {
DisplayFreespace ( params ) ;
2011-12-11 13:50:19 +04:00
}
}
2011-11-12 02:22:17 +04:00
fflush ( stdout ) ;
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
/*
* Print a summary of one test's configuration to stdout.
*
* params: the test parameters to describe; they are only read here.
*
* Always printed: api version, test file name, access mode, offset
* ordering, task reordering, client count, repetitions, transfer size,
* block size and the expected aggregate file size. Printed conditionally:
* the debug banner, the collective/independent suffix and the segment
* pattern (both need VERBOSE_1), the memoryPerTask / memoryPerNode lines
* (only when non-zero), the Lustre striping lines (only when compiled with
* HAVE_LUSTRE_LUSTRE_USER_H and striping was requested) and the
* stonewalling line (only when a deadline is set).
*
* The caller is expected to restrict this to the printing rank; no rank
* check is made here.
*/
2011-12-13 09:00:18 +04:00
static void ShowSetup ( IOR_param_t * params )
2011-06-17 23:20:43 +04:00
{
2011-12-13 09:00:18 +04:00
/* debug banner, shown only when params->debug differs from the literal below */
if ( strcmp ( params - > debug , " " ) ! = 0 ) {
2011-12-14 10:04:27 +04:00
printf ( " \n *** DEBUG MODE *** \n " ) ;
printf ( " *** %s *** \n \n " , params - > debug ) ;
}
printf ( " Summary: \n " ) ;
printf ( " \t api = %s \n " , params - > apiVersion ) ;
printf ( " \t test filename = %s \n " , params - > testFileName ) ;
printf ( " \t access = " ) ;
2014-08-29 01:39:44 +04:00
/* both alternatives are fixed literals without conversion specifiers */
printf ( params - > filePerProc ? " file-per-process " : " single-shared-file " ) ;
2011-12-13 09:00:18 +04:00
/* the collective/independent suffix is skipped for the POSIX api */
if ( verbose > = VERBOSE_1 & & strcmp ( params - > api , " POSIX " ) ! = 0 ) {
2011-12-14 10:04:27 +04:00
printf ( params - > collective = = FALSE ? " , independent " : " , collective " ) ;
2011-11-12 02:22:17 +04:00
}
2011-12-14 10:04:27 +04:00
printf ( " \n " ) ;
2011-11-12 02:22:17 +04:00
if ( verbose > = VERBOSE_1 ) {
2011-12-13 09:00:18 +04:00
if ( params - > segmentCount > 1 ) {
2011-11-12 02:22:17 +04:00
fprintf ( stdout ,
" \t pattern = strided (%d segments) \n " ,
2011-12-13 09:00:18 +04:00
( int ) params - > segmentCount ) ;
2011-11-12 02:22:17 +04:00
} else {
fprintf ( stdout ,
" \t pattern = segmented (1 segment) \n " ) ;
}
}
2011-12-14 10:04:27 +04:00
printf ( " \t ordering in a file = " ) ;
2011-12-13 09:00:18 +04:00
if ( params - > randomOffset = = FALSE ) {
2011-12-14 10:04:27 +04:00
printf ( " sequential offsets \n " ) ;
2011-06-17 23:20:43 +04:00
} else {
2011-12-14 10:04:27 +04:00
printf ( " random offsets \n " ) ;
2011-11-12 02:22:17 +04:00
}
2011-12-14 10:04:27 +04:00
printf ( " \t ordering inter file= " ) ;
2011-12-13 09:00:18 +04:00
/*
* The three tests below are independent: if both reorderTasks and
* reorderTasksRandom are TRUE, both description lines are printed.
*/
if ( params - > reorderTasks = = FALSE & & params - > reorderTasksRandom = = FALSE ) {
2011-12-14 10:04:27 +04:00
printf ( " no tasks offsets \n " ) ;
2011-11-12 02:22:17 +04:00
}
2011-12-13 09:00:18 +04:00
if ( params - > reorderTasks = = TRUE ) {
2011-12-14 10:04:27 +04:00
printf ( " constant task offsets = %d \n " ,
2011-12-13 09:00:18 +04:00
params - > taskPerNodeOffset ) ;
2011-11-12 02:22:17 +04:00
}
2011-12-13 09:00:18 +04:00
if ( params - > reorderTasksRandom = = TRUE ) {
2011-12-14 10:04:27 +04:00
printf ( " random task offsets >= %d, seed=%d \n " ,
2011-12-13 09:00:18 +04:00
params - > taskPerNodeOffset , params - > reorderTasksRandomSeed ) ;
2011-11-12 02:22:17 +04:00
}
2011-12-14 10:04:27 +04:00
printf ( " \t clients = %d (%d per node) \n " ,
2011-12-13 09:00:18 +04:00
params - > numTasks , params - > tasksPerNode ) ;
2012-01-09 06:41:30 +04:00
/* memory-hog settings are listed only when they are in effect (non-zero) */
if ( params - > memoryPerTask ! = 0 )
printf ( " \t memoryPerTask = %s \n " ,
HumanReadable ( params - > memoryPerTask , BASE_TWO ) ) ;
if ( params - > memoryPerNode ! = 0 )
printf ( " \t memoryPerNode = %s \n " ,
HumanReadable ( params - > memoryPerNode , BASE_TWO ) ) ;
printf ( " \t repetitions = %d \n " , params - > repetitions ) ;
printf ( " \t xfersize = %s \n " ,
2011-12-13 09:00:18 +04:00
HumanReadable ( params - > transferSize , BASE_TWO ) ) ;
2011-12-14 10:04:27 +04:00
printf ( " \t blocksize = %s \n " ,
2011-12-13 09:00:18 +04:00
HumanReadable ( params - > blockSize , BASE_TWO ) ) ;
2011-12-14 10:04:27 +04:00
printf ( " \t aggregate filesize = %s \n " ,
2011-12-13 09:00:18 +04:00
HumanReadable ( params - > expectedAggFileSize , BASE_TWO ) ) ;
2011-11-10 02:14:14 +04:00
# ifdef HAVE_LUSTRE_LUSTRE_USER_H
2012-01-14 04:57:21 +04:00
/* a stripe size or stripe count of 0 is reported as "Use default" */
if ( params - > lustre_set_striping ) {
printf ( " \t Lustre stripe size = %s \n " ,
( ( params - > lustre_stripe_size = = 0 ) ? " Use default " :
HumanReadable ( params - > lustre_stripe_size , BASE_TWO ) ) ) ;
if ( params - > lustre_stripe_count = = 0 ) {
printf ( " \t stripe count = %s \n " , " Use default " ) ;
} else {
printf ( " \t stripe count = %d \n " ,
params - > lustre_stripe_count ) ;
}
2011-11-12 02:22:17 +04:00
}
2012-01-14 04:57:21 +04:00
# endif /* HAVE_LUSTRE_LUSTRE_USER_H */
2011-12-13 09:00:18 +04:00
if ( params - > deadlineForStonewalling > 0 ) {
2017-10-20 19:02:24 +03:00
printf ( " \t Using stonewalling = %d second(s)%s \n " ,
params - > deadlineForStonewalling , params - > stoneWallingWearOut ? " with phase out " : " " ) ;
2011-11-12 02:22:17 +04:00
}
fflush ( stdout ) ;
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
/*
* Show test description .
*/
2011-11-12 04:40:45 +04:00
static void ShowTest ( IOR_param_t * test )
2011-06-17 23:20:43 +04:00
{
2015-05-21 21:05:56 +03:00
const char * data_packets [ ] = { " g " , " t " , " o " , " i " } ;
2017-09-27 19:45:47 +03:00
2011-11-12 02:22:17 +04:00
fprintf ( stdout , " TEST: \t %s=%d \n " , " id " , test - > id ) ;
2012-01-14 02:05:13 +04:00
fprintf ( stdout , " \t %s=%d \n " , " refnum " , test - > referenceNumber ) ;
2011-11-12 02:22:17 +04:00
fprintf ( stdout , " \t %s=%s \n " , " api " , test - > api ) ;
fprintf ( stdout , " \t %s=%s \n " , " platform " , test - > platform ) ;
fprintf ( stdout , " \t %s=%s \n " , " testFileName " , test - > testFileName ) ;
fprintf ( stdout , " \t %s=%s \n " , " hintsFileName " , test - > hintsFileName ) ;
fprintf ( stdout , " \t %s=%d \n " , " deadlineForStonewall " ,
test - > deadlineForStonewalling ) ;
2017-10-20 19:02:24 +03:00
fprintf ( stdout , " \t %s=%d \n " , " stoneWallingWearOut " , test - > stoneWallingWearOut ) ;
2011-11-12 02:22:17 +04:00
fprintf ( stdout , " \t %s=%d \n " , " maxTimeDuration " , test - > maxTimeDuration ) ;
fprintf ( stdout , " \t %s=%d \n " , " outlierThreshold " ,
test - > outlierThreshold ) ;
fprintf ( stdout , " \t %s=%s \n " , " options " , test - > options ) ;
fprintf ( stdout , " \t %s=%d \n " , " nodes " , test - > nodes ) ;
2012-01-07 05:29:45 +04:00
fprintf ( stdout , " \t %s=%lu \n " , " memoryPerTask " , ( unsigned long ) test - > memoryPerTask ) ;
2012-01-09 06:41:30 +04:00
fprintf ( stdout , " \t %s=%lu \n " , " memoryPerNode " , ( unsigned long ) test - > memoryPerNode ) ;
2011-11-12 02:22:17 +04:00
fprintf ( stdout , " \t %s=%d \n " , " tasksPerNode " , tasksPerNode ) ;
fprintf ( stdout , " \t %s=%d \n " , " repetitions " , test - > repetitions ) ;
fprintf ( stdout , " \t %s=%d \n " , " multiFile " , test - > multiFile ) ;
fprintf ( stdout , " \t %s=%d \n " , " interTestDelay " , test - > interTestDelay ) ;
fprintf ( stdout , " \t %s=%d \n " , " fsync " , test - > fsync ) ;
fprintf ( stdout , " \t %s=%d \n " , " fsYncperwrite " , test - > fsyncPerWrite ) ;
fprintf ( stdout , " \t %s=%d \n " , " useExistingTestFile " ,
test - > useExistingTestFile ) ;
fprintf ( stdout , " \t %s=%d \n " , " showHints " , test - > showHints ) ;
fprintf ( stdout , " \t %s=%d \n " , " uniqueDir " , test - > uniqueDir ) ;
fprintf ( stdout , " \t %s=%d \n " , " showHelp " , test - > showHelp ) ;
fprintf ( stdout , " \t %s=%d \n " , " individualDataSets " ,
test - > individualDataSets ) ;
fprintf ( stdout , " \t %s=%d \n " , " singleXferAttempt " ,
test - > singleXferAttempt ) ;
fprintf ( stdout , " \t %s=%d \n " , " readFile " , test - > readFile ) ;
fprintf ( stdout , " \t %s=%d \n " , " writeFile " , test - > writeFile ) ;
fprintf ( stdout , " \t %s=%d \n " , " filePerProc " , test - > filePerProc ) ;
fprintf ( stdout , " \t %s=%d \n " , " reorderTasks " , test - > reorderTasks ) ;
fprintf ( stdout , " \t %s=%d \n " , " reorderTasksRandom " ,
test - > reorderTasksRandom ) ;
fprintf ( stdout , " \t %s=%d \n " , " reorderTasksRandomSeed " ,
test - > reorderTasksRandomSeed ) ;
fprintf ( stdout , " \t %s=%d \n " , " randomOffset " , test - > randomOffset ) ;
fprintf ( stdout , " \t %s=%d \n " , " checkWrite " , test - > checkWrite ) ;
fprintf ( stdout , " \t %s=%d \n " , " checkRead " , test - > checkRead ) ;
fprintf ( stdout , " \t %s=%d \n " , " preallocate " , test - > preallocate ) ;
fprintf ( stdout , " \t %s=%d \n " , " useFileView " , test - > useFileView ) ;
fprintf ( stdout , " \t %s=%lld \n " , " setAlignment " , test - > setAlignment ) ;
fprintf ( stdout , " \t %s=%d \n " , " storeFileOffset " , test - > storeFileOffset ) ;
fprintf ( stdout , " \t %s=%d \n " , " useSharedFilePointer " ,
test - > useSharedFilePointer ) ;
fprintf ( stdout , " \t %s=%d \n " , " useO_DIRECT " , test - > useO_DIRECT ) ;
fprintf ( stdout , " \t %s=%d \n " , " useStridedDatatype " ,
test - > useStridedDatatype ) ;
fprintf ( stdout , " \t %s=%d \n " , " keepFile " , test - > keepFile ) ;
fprintf ( stdout , " \t %s=%d \n " , " keepFileWithError " ,
test - > keepFileWithError ) ;
fprintf ( stdout , " \t %s=%d \n " , " quitOnError " , test - > quitOnError ) ;
fprintf ( stdout , " \t %s=%d \n " , " verbose " , verbose ) ;
2015-05-21 21:05:56 +03:00
fprintf ( stdout , " \t %s=%s \n " , " data packet type " , data_packets [ test - > dataPacketType ] ) ;
fprintf ( stdout , " \t %s=%d \n " , " setTimeStampSignature/incompressibleSeed " ,
test - > setTimeStampSignature ) ; /* Seed value was copied into setTimeStampSignature as well */
2011-11-12 02:22:17 +04:00
fprintf ( stdout , " \t %s=%d \n " , " collective " , test - > collective ) ;
fprintf ( stdout , " \t %s=%lld " , " segmentCount " , test - > segmentCount ) ;
2013-09-26 17:48:50 +04:00
# ifdef HAVE_GPFS_FCNTL_H
fprintf ( stdout , " \t %s=%d \n " , " gpfsHintAccess " , test - > gpfs_hint_access ) ;
fprintf ( stdout , " \t %s=%d \n " , " gpfsReleaseToken " , test - > gpfs_release_token ) ;
# endif
2011-11-12 02:22:17 +04:00
if ( strcmp ( test - > api , " HDF5 " ) = = 0 ) {
fprintf ( stdout , " (datasets) " ) ;
}
fprintf ( stdout , " \n " ) ;
fprintf ( stdout , " \t %s=%lld \n " , " transferSize " , test - > transferSize ) ;
fprintf ( stdout , " \t %s=%lld \n " , " blockSize " , test - > blockSize ) ;
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
2011-12-11 08:45:19 +04:00
/*
 * Arithmetic mean of the first len entries of values.
 *
 * values: array holding at least len doubles (not read when len <= 0).
 * len:    number of entries to average.
 *
 * Returns the mean, or 0.0 when len <= 0 so that an empty series does not
 * produce a 0/0 NaN in the caller's output.
 */
static double mean_of_array_of_doubles(double *values, int len)
{
        double tot = 0.0;
        int i;

        if (len <= 0)
                return 0.0;

        for (i = 0; i < len; i++) {
                tot += values[i];
        }
        return tot / len;
}
2011-11-12 02:22:17 +04:00
2011-12-11 08:45:19 +04:00
/*
* Summary statistics over a series of per-repetition values, as filled in
* by bw_values ( ) .
*/
struct results {
2014-08-14 02:53:24 +04:00
double min ; /* smallest entry of val[] */
double max ; /* largest entry of val[] */
double mean ; /* sum divided by the number of entries */
double var ; /* mean squared deviation from mean (divides by the entry count) */
double sd ; /* square root of var */
double sum ; /* sum of all entries of val[] */
double * val ; /* the entries themselves; bw_values ( ) points this just past the struct, inside the same allocation */
2011-12-11 08:45:19 +04:00
} ;
2011-11-12 02:22:17 +04:00
2011-12-11 08:45:19 +04:00
/*
 * Compute per-repetition values agg_file_size[i] / vals[i] and their
 * summary statistics.
 *
 * reps:          number of repetitions (entries in both arrays).
 * agg_file_size: aggregate size for each repetition.
 * vals:          divisor for each repetition (the callers pass times).
 *
 * Returns a single malloc'ed block holding the struct followed by the
 * reps values (r->val points into that block); the caller releases it with
 * one free().  var divides by reps, and sd is its square root.
 *
 * For reps <= 0 no input is read and every statistic is 0.0, instead of
 * reading uninitialized min/max/sum and dividing by zero.
 */
static struct results *bw_values(int reps, IOR_offset_t *agg_file_size, double *vals)
{
        struct results *r;
        size_t count = (reps > 0) ? (size_t)reps : 0;
        int i;

        r = (struct results *)malloc(sizeof(struct results)
                                     + (count * sizeof(double)));
        if (r == NULL)
                ERR(" malloc failed ");

        /* the value array lives directly behind the struct */
        r->val = (double *)&r[1];
        r->min = 0.0;
        r->max = 0.0;
        r->sum = 0.0;
        r->mean = 0.0;
        r->var = 0.0;
        r->sd = 0.0;
        if (count == 0)
                return r;

        for (i = 0; i < reps; i++) {
                r->val[i] = (double)agg_file_size[i] / vals[i];
                if (i == 0) {
                        r->min = r->val[i];
                        r->max = r->val[i];
                }
                r->min = MIN(r->min, r->val[i]);
                r->max = MAX(r->max, r->val[i]);
                r->sum += r->val[i];
        }
        r->mean = r->sum / reps;

        for (i = 0; i < reps; i++) {
                r->var += pow((r->mean - r->val[i]), 2);
        }
        r->var = r->var / reps;
        r->sd = sqrt(r->var);

        return r;
}
2011-11-12 02:22:17 +04:00
2011-12-11 08:45:19 +04:00
/*
2012-01-14 01:27:55 +04:00
* Summarize results
*
* operation is typically " write " or " read "
2011-12-11 08:45:19 +04:00
*/
2012-01-14 01:27:55 +04:00
static void PrintLongSummaryOneOperation ( IOR_test_t * test , double * times , char * operation )
2011-12-11 08:45:19 +04:00
{
2014-08-14 02:53:24 +04:00
IOR_param_t * params = & test - > params ;
IOR_results_t * results = test - > results ;
struct results * bw ;
int reps ;
2017-09-27 19:45:47 +03:00
2014-08-14 02:53:24 +04:00
if ( rank ! = 0 | | verbose < VERBOSE_0 )
return ;
2011-06-17 23:20:43 +04:00
2014-08-14 02:53:24 +04:00
reps = params - > repetitions ;
2011-12-13 09:00:18 +04:00
2014-08-14 02:53:24 +04:00
bw = bw_values ( reps , results - > aggFileSizeForBW , times ) ;
2011-12-13 23:26:41 +04:00
2011-12-13 23:36:47 +04:00
fprintf ( stdout , " %-9s " , operation ) ;
2011-12-13 23:26:41 +04:00
fprintf ( stdout , " %10.2f " , bw - > max / MEBIBYTE ) ;
2011-12-13 23:36:47 +04:00
fprintf ( stdout , " %10.2f " , bw - > min / MEBIBYTE ) ;
fprintf ( stdout , " %10.2f " , bw - > mean / MEBIBYTE ) ;
2011-12-13 23:26:41 +04:00
fprintf ( stdout , " %10.2f " , bw - > sd / MEBIBYTE ) ;
2011-12-13 23:36:47 +04:00
fprintf ( stdout , " %10.5f " ,
2011-12-13 23:26:41 +04:00
mean_of_array_of_doubles ( times , reps ) ) ;
2012-01-14 02:05:13 +04:00
fprintf ( stdout , " %d " , params - > id ) ;
2011-12-13 23:26:41 +04:00
fprintf ( stdout , " %d " , params - > numTasks ) ;
fprintf ( stdout , " %d " , params - > tasksPerNode ) ;
fprintf ( stdout , " %d " , params - > repetitions ) ;
fprintf ( stdout , " %d " , params - > filePerProc ) ;
fprintf ( stdout , " %d " , params - > reorderTasks ) ;
fprintf ( stdout , " %d " , params - > taskPerNodeOffset ) ;
fprintf ( stdout , " %d " , params - > reorderTasksRandom ) ;
fprintf ( stdout , " %d " , params - > reorderTasksRandomSeed ) ;
fprintf ( stdout , " %lld " , params - > segmentCount ) ;
fprintf ( stdout , " %lld " , params - > blockSize ) ;
fprintf ( stdout , " %lld " , params - > transferSize ) ;
fprintf ( stdout , " %lld " , results - > aggFileSizeForBW [ 0 ] ) ;
2012-01-14 02:05:13 +04:00
fprintf ( stdout , " %s " , params - > api ) ;
fprintf ( stdout , " %d " , params - > referenceNumber ) ;
2011-12-13 23:26:41 +04:00
fprintf ( stdout , " \n " ) ;
2014-08-14 02:53:24 +04:00
fflush ( stdout ) ;
2011-12-11 08:45:19 +04:00
2014-08-14 02:53:24 +04:00
free ( bw ) ;
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
2012-01-14 01:27:55 +04:00
/*
 * Print the long-summary rows of one test: a "write" row if the test
 * wrote, followed by a "read" row if it read.
 */
static void PrintLongSummaryOneTest(IOR_test_t *test)
{
        IOR_results_t *res = test->results;

        if (test->params.writeFile) {
                PrintLongSummaryOneOperation(test, res->writeTime, " write ");
        }
        if (test->params.readFile) {
                PrintLongSummaryOneOperation(test, res->readTime, " read ");
        }
}
2011-12-13 09:00:18 +04:00
2012-01-14 01:27:55 +04:00
static void PrintLongSummaryHeader ( )
{
2014-08-14 02:53:24 +04:00
if ( rank ! = 0 | | verbose < VERBOSE_0 )
return ;
2012-01-14 01:49:30 +04:00
2014-08-14 02:53:24 +04:00
fprintf ( stdout , " \n " ) ;
fprintf ( stdout , " %-9s %10s %10s %10s %10s %10s " ,
2011-12-13 23:36:47 +04:00
" Operation " , " Max(MiB) " , " Min(MiB) " , " Mean(MiB) " , " StdDev " ,
2014-08-14 02:53:24 +04:00
" Mean(s) " ) ;
2012-01-14 02:05:13 +04:00
fprintf ( stdout , " Test# #Tasks tPN reps fPP reord reordoff reordrand seed "
" segcnt blksiz xsize aggsize API RefNum \n " ) ;
2012-01-14 01:27:55 +04:00
}
/*
 * Print the long summary for every test in the list starting at
 * tests_head: a title, the column header, then the rows of each test in
 * list order.  Only rank 0 prints, and only when verbose >= VERBOSE_0.
 */
static void PrintLongSummaryAllTests(IOR_test_t *tests_head)
{
        IOR_test_t *cur;

        if (!(rank == 0 && verbose >= VERBOSE_0))
                return;

        fprintf(stdout, " \n ");
        fprintf(stdout, " Summary of all tests: ");
        PrintLongSummaryHeader();

        cur = tests_head;
        while (cur != NULL) {
                PrintLongSummaryOneTest(cur);
                cur = cur->next;
        }
}
/*
 * Print the best write and read bandwidth seen over all repetitions of a
 * test, in MiB/sec and MB/sec.  Only rank 0 prints, and only when
 * verbose >= VERBOSE_0.
 *
 * The per-repetition bandwidth is aggFileSizeForBW[i] / time[i].  The
 * running maxima start at 0.0: seeding them with writeTime[0] /
 * readTime[0] (seconds) mixed units with the bandwidths they are compared
 * against and read element 0 even when there are no repetitions.  A
 * bandwidth is computed only for the operations the test performed.
 */
static void PrintShortSummary(IOR_test_t *test)
{
        IOR_param_t *params = &test->params;
        IOR_results_t *results = test->results;
        double max_write = 0.0;
        double max_read = 0.0;
        double bw;
        int reps;
        int i;

        if (rank != 0 || verbose < VERBOSE_0)
                return;

        reps = params->repetitions;

        for (i = 0; i < reps; i++) {
                if (params->writeFile) {
                        bw = (double)results->aggFileSizeForBW[i] / results->writeTime[i];
                        max_write = MAX(bw, max_write);
                }
                if (params->readFile) {
                        bw = (double)results->aggFileSizeForBW[i] / results->readTime[i];
                        max_read = MAX(bw, max_read);
                }
        }

        fprintf(stdout, " \n ");
        if (params->writeFile) {
                fprintf(stdout, " Max Write: %.2f MiB/sec (%.2f MB/sec) \n ",
                        max_write / MEBIBYTE, max_write / MEGABYTE);
        }
        if (params->readFile) {
                fprintf(stdout, " Max Read: %.2f MiB/sec (%.2f MB/sec) \n ",
                        max_read / MEBIBYTE, max_read / MEGABYTE);
        }
}
2012-01-07 05:29:45 +04:00
/*
 * malloc a buffer, touching every page in an attempt to defeat lazy
 * allocation.
 *
 * size: number of bytes to allocate.
 *
 * Returns the buffer (release it with free()), or NULL when size is 0 or
 * malloc fails.  One byte at the start of the buffer and at every further
 * page-size offset below size is set to 1.
 *
 * The page size comes from sysconf(_SC_PAGESIZE); if that call fails, a
 * fallback stride is used rather than letting -1 turn into a huge size_t.
 * The buffer is walked by offset, so no pointer beyond the end of the
 * allocation is ever formed.
 */
static void *malloc_and_touch(size_t size)
{
        enum { FALLBACK_PAGE_SIZE = 4096 };
        long reported;
        size_t page_size;
        size_t off;
        char *buf;

        if (size == 0)
                return NULL;

        reported = sysconf(_SC_PAGESIZE);
        page_size = (reported > 0) ? (size_t)reported : (size_t)FALLBACK_PAGE_SIZE;

        buf = (char *)malloc(size);
        if (buf == NULL)
                return NULL;

        for (off = 0; off < size; off += page_size) {
                buf[off] = (char)1;
                /* stop before off + page_size could wrap around */
                if (size - off <= page_size)
                        break;
        }

        return (void *)buf;
}
2011-12-14 10:48:14 +04:00
/*
 * Gather every task's rankOffset on rank 0 and print a histogram of file
 * hits on one "#File Hits Dist:" line.
 *
 * params: test parameters; numTasks is the number of tasks and files.
 *
 * All ranks of MPI_COMM_WORLD must call this because of the gather.  On
 * rank 0, filecont[f] counts the tasks whose (index + offset) % numTasks
 * equals f, and filehits[k] counts the files hit exactly k times.  Entries
 * of filehits are printed until the printed counts add up to numTasks.
 *
 * The work arrays exist on rank 0 only.  The pointers start as NULL so
 * other ranks hand MPI_Gather a defined (ignored) receive buffer, and
 * allocation failures are reported instead of being dereferenced.
 */
static void file_hits_histogram(IOR_param_t *params)
{
        int *rankoffs = NULL;
        int *filecont = NULL;
        int *filehits = NULL;
        int ifile;
        int jfile;

        if (rank == 0) {
                rankoffs = (int *)malloc(params->numTasks * sizeof(int));
                /* calloc: both counter arrays must start at zero */
                filecont = (int *)calloc(params->numTasks, sizeof(int));
                filehits = (int *)calloc(params->numTasks, sizeof(int));
                if (params->numTasks > 0
                    && (rankoffs == NULL || filecont == NULL
                        || filehits == NULL))
                        ERR(" malloc failed ");
        }

        MPI_CHECK(MPI_Gather(&rankOffset, 1, MPI_INT, rankoffs,
                             1, MPI_INT, 0, MPI_COMM_WORLD),
                  " MPI_Gather error ");

        if (rank != 0)
                return;

        /* how many tasks land on each file */
        for (ifile = 0; ifile < params->numTasks; ifile++) {
                filecont[(ifile + rankoffs[ifile]) % params->numTasks]++;
        }

        /* how many files were hit exactly ifile times */
        for (ifile = 0; ifile < params->numTasks; ifile++) {
                for (jfile = 0; jfile < params->numTasks; jfile++) {
                        if (ifile == filecont[jfile])
                                filehits[ifile]++;
                }
        }

        fprintf(stdout, " #File Hits Dist: ");
        jfile = 0;
        ifile = 0;
        while (jfile < params->numTasks && ifile < params->numTasks) {
                fprintf(stdout, " %d ", filehits[ifile]);
                jfile += filehits[ifile];
                ifile++;
        }
        fprintf(stdout, " \n ");

        free(rankoffs);
        free(filecont);
        free(filehits);
}
2012-01-09 00:30:05 +04:00
/*
 * Tell whether a test has used up its time budget.
 *
 * params:    test parameters; maxTimeDuration is multiplied by 60 before
 *            being added to startTime, and 0 disables the limit.
 * startTime: time stamp taken when the test started, on the same clock as
 *            GetTimeStamp().
 *
 * Returns non-zero once GetTimeStamp() has reached the deadline, and 0
 * otherwise (always 0 when no limit is set).
 */
int test_time_elapsed(IOR_param_t *params, double startTime)
{
        if (params->maxTimeDuration != 0) {
                double deadline = startTime + (params->maxTimeDuration * 60);

                return GetTimeStamp() >= deadline;
        }

        return 0;
}
2012-01-09 06:41:30 +04:00
/*
 * Hog some memory as a rough simulation of a real application's memory
 * use.
 *
 * params: memoryPerTask, if non-zero, is the number of bytes this task
 *         takes; otherwise a non-zero memoryPerNode is split evenly over
 *         tasksPerNode.
 *
 * Returns the touched buffer from malloc_and_touch(), or NULL when neither
 * setting is in use.  A failed allocation is reported through ERR().
 *
 * The byte counts are printed with %zu: `size` is a size_t, and passing it
 * to %ld is a format/argument mismatch wherever size_t is not long.
 */
static void *HogMemory(IOR_param_t *params)
{
        size_t size;
        void *buf;

        if (params->memoryPerTask != 0) {
                size = params->memoryPerTask;
        } else if (params->memoryPerNode != 0) {
                if (verbose >= VERBOSE_3)
                        fprintf(stderr, " This node hogging %zu bytes of memory \n ",
                                (size_t)params->memoryPerNode);
                size = params->memoryPerNode / params->tasksPerNode;
        } else {
                return NULL;
        }

        if (verbose >= VERBOSE_3)
                fprintf(stderr, " This task hogging %zu bytes of memory \n ", size);

        buf = malloc_and_touch(size);
        if (buf == NULL)
                ERR(" malloc of simulated application buffer failed ");

        return buf;
}
2011-06-17 23:20:43 +04:00
/*
* Using the test parameters , run iteration ( s ) of single test .
*/
2011-12-13 09:00:18 +04:00
static void TestIoSys ( IOR_test_t * test )
2011-06-17 23:20:43 +04:00
{
2014-08-14 02:53:24 +04:00
IOR_param_t * params = & test - > params ;
IOR_results_t * results = test - > results ;
2011-11-12 02:22:17 +04:00
char testFileName [ MAX_STR ] ;
double * timer [ 12 ] ;
double startTime ;
2015-05-27 19:24:52 +03:00
int pretendRank ;
2012-01-09 00:30:05 +04:00
int i , rep ;
2011-11-12 02:22:17 +04:00
void * fd ;
MPI_Group orig_group , new_group ;
int range [ 3 ] ;
IOR_offset_t dataMoved ; /* for data rate calculation */
2012-01-07 05:29:45 +04:00
void * hog_buf ;
2015-05-27 19:24:52 +03:00
IOR_io_buffers ioBuffers ;
2011-11-12 02:22:17 +04:00
/* set up communicator for test */
2011-12-13 09:00:18 +04:00
if ( params - > numTasks > numTasksWorld ) {
2011-11-12 02:22:17 +04:00
if ( rank = = 0 ) {
fprintf ( stdout ,
" WARNING: More tasks requested (%d) than available (%d), " ,
2011-12-13 09:00:18 +04:00
params - > numTasks , numTasksWorld ) ;
2011-11-12 02:22:17 +04:00
fprintf ( stdout , " running on %d tasks. \n " ,
numTasksWorld ) ;
2011-06-17 23:20:43 +04:00
}
2011-12-13 09:00:18 +04:00
params - > numTasks = numTasksWorld ;
2011-11-12 02:22:17 +04:00
}
MPI_CHECK ( MPI_Comm_group ( MPI_COMM_WORLD , & orig_group ) ,
" MPI_Comm_group() error " ) ;
2014-08-14 02:53:24 +04:00
range [ 0 ] = 0 ; /* first rank */
2011-12-13 09:00:18 +04:00
range [ 1 ] = params - > numTasks - 1 ; /* last rank */
2014-08-14 02:53:24 +04:00
range [ 2 ] = 1 ; /* stride */
2011-11-12 02:22:17 +04:00
MPI_CHECK ( MPI_Group_range_incl ( orig_group , 1 , & range , & new_group ) ,
" MPI_Group_range_incl() error " ) ;
MPI_CHECK ( MPI_Comm_create ( MPI_COMM_WORLD , new_group , & testComm ) ,
" MPI_Comm_create() error " ) ;
2013-10-07 18:12:20 +04:00
MPI_CHECK ( MPI_Group_free ( & orig_group ) , " MPI_Group_Free() error " ) ;
MPI_CHECK ( MPI_Group_free ( & new_group ) , " MPI_Group_Free() error " ) ;
2011-12-13 09:00:18 +04:00
params - > testComm = testComm ;
2011-11-12 02:22:17 +04:00
if ( testComm = = MPI_COMM_NULL ) {
/* tasks not in the group do not participate in this test */
MPI_CHECK ( MPI_Barrier ( MPI_COMM_WORLD ) , " barrier error " ) ;
return ;
}
if ( rank = = 0 & & verbose > = VERBOSE_1 ) {
2011-12-13 09:00:18 +04:00
fprintf ( stdout , " Participating tasks: %d \n " , params - > numTasks ) ;
2011-11-12 02:22:17 +04:00
fflush ( stdout ) ;
}
2011-12-13 09:00:18 +04:00
if ( rank = = 0 & & params - > reorderTasks = = TRUE & & verbose > = VERBOSE_1 ) {
2011-06-17 23:20:43 +04:00
fprintf ( stdout ,
2011-11-12 02:22:17 +04:00
" Using reorderTasks '-C' (expecting block, not cyclic, task assignment) \n " ) ;
2011-06-17 23:20:43 +04:00
fflush ( stdout ) ;
}
2011-12-13 09:00:18 +04:00
params - > tasksPerNode = CountTasksPerNode ( params - > numTasks , testComm ) ;
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
/* setup timers */
for ( i = 0 ; i < 12 ; i + + ) {
2011-12-13 09:00:18 +04:00
timer [ i ] = ( double * ) malloc ( params - > repetitions * sizeof ( double ) ) ;
2011-11-12 02:22:17 +04:00
if ( timer [ i ] = = NULL )
2011-12-11 08:45:19 +04:00
ERR ( " malloc failed " ) ;
2011-11-12 02:22:17 +04:00
}
2011-12-11 08:45:19 +04:00
2011-11-12 02:22:17 +04:00
/* bind I/O calls to specific API */
Algorithms 'S3', 'S3_plus', and 'S3_EMC' all available.
These are variants on S3. S3 uses the "pure" S3 interface, e.g. using
Multi-Part-Upload. The "plus" variant enables EMC-extensions in the aws4c
library. This allows the N:N case to use "append", in the case where
"transfer_size" != "block_size" for IOR. In pure S3, the N:N case will
fail, because the EMC-extensions won't be enabled, and appending (which
attempts to use the EMC byte-range tricks to do this) will throw an error.
In the S3_EMC alg, N:1 uses EMC's other byte-range tricks to write different
parts of an N:1 file, and also uses append to write the parts of an N:N
file. Preliminary tests suggest that these EMC extensions improve bandwidth
by ~20%.
I put all three algs in aiori-S3.c, because it seemed some code was getting
reused. Not sure if that's still going to make sense after the TBD, below.
TBD: Recently realized that the "pure" S3 shouldn't be trying to use
appends for anything. In the N:N case, it should just use MPU, within each
file. Then, there's no need for S3_plus. We just have S3, which does MPU
for all writes where transfer_size != block_size, and uses (standard)
byte-range reads for reading. Then S3_EMC uses "appends" for N:N writes,
and byte-range writes for N:1 writes. This separates the code for the two
algorithms a little more, but we might still want them in the same file.
2014-10-30 01:04:30 +03:00
AioriBind ( params - > api , params ) ;
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
/* show test setup */
if ( rank = = 0 & & verbose > = VERBOSE_0 )
2011-12-13 09:00:18 +04:00
ShowSetup ( params ) ;
2011-06-17 23:20:43 +04:00
2012-01-09 06:41:30 +04:00
hog_buf = HogMemory ( params ) ;
2012-01-07 05:29:45 +04:00
2015-05-27 19:24:52 +03:00
pretendRank = ( rank + rankOffset ) % params - > numTasks ;
/* IO Buffer Setup */
2017-09-27 19:45:47 +03:00
if ( params - > setTimeStampSignature ) { // initialize the buffer properly
params - > timeStampSignatureValue = ( unsigned int ) params - > setTimeStampSignature ;
}
2015-05-27 19:24:52 +03:00
XferBuffersSetup ( & ioBuffers , params , pretendRank ) ;
2017-09-27 19:45:47 +03:00
reseed_incompressible_prng = TRUE ; // reset pseudo random generator, necessary to guarantee the next call to FillBuffer produces the same value as it is right now
2015-05-27 19:24:52 +03:00
/* Initial time stamp */
2011-11-12 02:22:17 +04:00
startTime = GetTimeStamp ( ) ;
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
/* loop over test iterations */
2011-12-13 09:00:18 +04:00
for ( rep = 0 ; rep < params - > repetitions ; rep + + ) {
2011-11-12 02:22:17 +04:00
/* Get iteration start time in seconds in task 0 and broadcast to
all tasks */
if ( rank = = 0 ) {
2017-09-27 19:45:47 +03:00
if ( ! params - > setTimeStampSignature ) {
2011-11-12 02:22:17 +04:00
time_t currentTime ;
if ( ( currentTime = time ( NULL ) ) = = - 1 ) {
ERR ( " cannot get current time " ) ;
}
2011-12-13 09:00:18 +04:00
params - > timeStampSignatureValue =
2014-08-14 02:53:24 +04:00
( unsigned int ) currentTime ;
2017-09-27 19:45:47 +03:00
if ( verbose > = VERBOSE_2 ) {
fprintf ( stdout ,
" Using Time Stamp %u (0x%x) for Data Signature \n " ,
params - > timeStampSignatureValue ,
params - > timeStampSignatureValue ) ;
}
2011-11-12 02:22:17 +04:00
}
2014-08-14 02:53:24 +04:00
if ( rep = = 0 & & verbose > = VERBOSE_0 ) {
fprintf ( stdout , " \n " ) ;
fprintf ( stdout , " access bw(MiB/s) block(KiB) xfer(KiB) open(s) wr/rd(s) close(s) total(s) iter \n " ) ;
fprintf ( stdout , " ------ --------- ---------- --------- -------- -------- -------- -------- ---- \n " ) ;
}
2011-11-12 02:22:17 +04:00
}
MPI_CHECK ( MPI_Bcast
2011-12-13 09:00:18 +04:00
( & params - > timeStampSignatureValue , 1 , MPI_UNSIGNED , 0 ,
2011-11-12 02:22:17 +04:00
testComm ) , " cannot broadcast start time value " ) ;
/* use repetition count for number of multiple files */
2011-12-13 09:00:18 +04:00
if ( params - > multiFile )
params - > repCounter = rep ;
2011-11-12 02:22:17 +04:00
/*
* write the file ( s ) , getting timing between I / O calls
*/
2012-01-09 00:30:05 +04:00
if ( params - > writeFile & & ! test_time_elapsed ( params , startTime ) ) {
2011-12-13 09:00:18 +04:00
GetTestFileName ( testFileName , params ) ;
2011-11-12 02:22:17 +04:00
if ( verbose > = VERBOSE_3 ) {
fprintf ( stdout , " task %d writing %s \n " , rank ,
testFileName ) ;
}
2011-12-13 09:00:18 +04:00
DelaySecs ( params - > interTestDelay ) ;
if ( params - > useExistingTestFile = = FALSE ) {
RemoveFile ( testFileName , params - > filePerProc ,
params ) ;
2011-11-12 02:22:17 +04:00
}
MPI_CHECK ( MPI_Barrier ( testComm ) , " barrier error " ) ;
2011-12-13 09:00:18 +04:00
params - > open = WRITE ;
2011-11-12 02:22:17 +04:00
timer [ 0 ] [ rep ] = GetTimeStamp ( ) ;
2011-12-13 09:00:18 +04:00
fd = backend - > create ( testFileName , params ) ;
2011-11-12 02:22:17 +04:00
timer [ 1 ] [ rep ] = GetTimeStamp ( ) ;
2011-12-13 09:00:18 +04:00
if ( params - > intraTestBarriers )
2011-11-12 02:22:17 +04:00
MPI_CHECK ( MPI_Barrier ( testComm ) ,
" barrier error " ) ;
if ( rank = = 0 & & verbose > = VERBOSE_1 ) {
2011-12-13 13:07:55 +04:00
fprintf ( stderr ,
" Commencing write performance test: %s " ,
2014-08-14 02:53:24 +04:00
CurrentTimeString ( ) ) ;
2011-11-12 02:22:17 +04:00
}
timer [ 2 ] [ rep ] = GetTimeStamp ( ) ;
2017-10-20 19:02:24 +03:00
dataMoved = WriteOrRead ( params , results , fd , WRITE , & ioBuffers ) ;
2014-08-29 01:39:44 +04:00
if ( params - > verbose > = VERBOSE_4 ) {
printf ( " * data moved = %llu \n " , dataMoved ) ;
fflush ( stdout ) ;
}
2011-11-12 02:22:17 +04:00
timer [ 3 ] [ rep ] = GetTimeStamp ( ) ;
2011-12-13 09:00:18 +04:00
if ( params - > intraTestBarriers )
2011-11-12 02:22:17 +04:00
MPI_CHECK ( MPI_Barrier ( testComm ) ,
" barrier error " ) ;
timer [ 4 ] [ rep ] = GetTimeStamp ( ) ;
2011-12-13 09:00:18 +04:00
backend - > close ( fd , params ) ;
2011-11-12 02:22:17 +04:00
timer [ 5 ] [ rep ] = GetTimeStamp ( ) ;
MPI_CHECK ( MPI_Barrier ( testComm ) , " barrier error " ) ;
2014-08-14 02:53:24 +04:00
/* get the size of the file just written */
results - > aggFileSizeFromStat [ rep ] =
backend - > get_file_size ( params , testComm , testFileName ) ;
2011-11-12 02:22:17 +04:00
2014-08-14 02:53:24 +04:00
/* check if stat() of file doesn't equal expected file size,
use actual amount of byte moved */
CheckFileSize ( test , dataMoved , rep ) ;
2011-11-12 02:22:17 +04:00
if ( verbose > = VERBOSE_3 )
2011-12-13 09:00:18 +04:00
WriteTimes ( params , timer , rep , WRITE ) ;
2011-11-12 02:22:17 +04:00
ReduceIterResults ( test , timer , rep , WRITE ) ;
2011-12-13 09:00:18 +04:00
if ( params - > outlierThreshold ) {
CheckForOutliers ( params , timer , rep , WRITE ) ;
2011-11-12 02:22:17 +04:00
}
}
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
/*
* perform a check of data , reading back data and comparing
* against what was expected to be written
*/
2012-01-09 00:30:05 +04:00
if ( params - > checkWrite & & ! test_time_elapsed ( params , startTime ) ) {
2011-11-12 02:22:17 +04:00
MPI_CHECK ( MPI_Barrier ( testComm ) , " barrier error " ) ;
if ( rank = = 0 & & verbose > = VERBOSE_1 ) {
fprintf ( stdout ,
" Verifying contents of the file(s) just written. \n " ) ;
fprintf ( stdout , " %s \n " , CurrentTimeString ( ) ) ;
}
2011-12-13 09:00:18 +04:00
if ( params - > reorderTasks ) {
2011-11-12 02:22:17 +04:00
/* move two nodes away from writing node */
rankOffset =
2014-08-14 02:53:24 +04:00
( 2 * params - > tasksPerNode ) % params - > numTasks ;
2011-11-12 02:22:17 +04:00
}
2015-05-21 21:05:56 +03:00
2017-11-29 12:17:02 +03:00
// update the check buffer
FillBuffer ( ioBuffers . readCheckBuffer , params , 0 , ( rank + rankOffset ) % params - > numTasks ) ;
2015-05-21 21:05:56 +03:00
reseed_incompressible_prng = TRUE ; /* Re-Seed the PRNG to get same sequence back, if random */
2011-12-13 09:00:18 +04:00
GetTestFileName ( testFileName , params ) ;
params - > open = WRITECHECK ;
fd = backend - > open ( testFileName , params ) ;
2017-10-20 19:02:24 +03:00
dataMoved = WriteOrRead ( params , results , fd , WRITECHECK , & ioBuffers ) ;
2011-12-13 09:00:18 +04:00
backend - > close ( fd , params ) ;
2011-11-12 02:22:17 +04:00
rankOffset = 0 ;
}
/*
* read the file ( s ) , getting timing between I / O calls
*/
2017-09-27 19:45:47 +03:00
if ( ( params - > readFile | | params - > checkRead ) & & ! test_time_elapsed ( params , startTime ) ) {
int operation_flag = READ ;
if ( params - > checkRead ) {
// actually read and then compare the buffer
operation_flag = READCHECK ;
}
2011-11-12 02:22:17 +04:00
/* Get rankOffset [file offset] for this process to read, based on -C,-Z,-Q,-X options */
/* Constant process offset reading */
2011-12-13 09:00:18 +04:00
if ( params - > reorderTasks ) {
2011-11-12 02:22:17 +04:00
/* move taskPerNodeOffset nodes[1==default] away from writing node */
rankOffset =
2014-08-14 02:53:24 +04:00
( params - > taskPerNodeOffset *
params - > tasksPerNode ) % params - > numTasks ;
2011-11-12 02:22:17 +04:00
}
/* random process offset reading */
2011-12-13 09:00:18 +04:00
if ( params - > reorderTasksRandom ) {
2011-11-12 02:22:17 +04:00
/* this should not intefere with randomOffset within a file because GetOffsetArrayRandom */
/* seeds every random() call */
2014-08-14 02:53:24 +04:00
int nodeoffset ;
2011-11-12 02:22:17 +04:00
unsigned int iseed0 ;
2011-12-13 09:00:18 +04:00
nodeoffset = params - > taskPerNodeOffset ;
2011-12-14 10:48:14 +04:00
nodeoffset = ( nodeoffset < params - > nodes ) ? nodeoffset : params - > nodes - 1 ;
if ( params - > reorderTasksRandomSeed < 0 )
2014-08-14 02:53:24 +04:00
iseed0 = - 1 * params - > reorderTasksRandomSeed + rep ;
else
iseed0 = params - > reorderTasksRandomSeed ;
2011-11-12 02:22:17 +04:00
srand ( rank + iseed0 ) ;
{
2011-12-13 09:00:18 +04:00
rankOffset = rand ( ) % params - > numTasks ;
2011-11-12 02:22:17 +04:00
}
while ( rankOffset <
2011-12-13 09:00:18 +04:00
( nodeoffset * params - > tasksPerNode ) ) {
rankOffset = rand ( ) % params - > numTasks ;
2011-11-12 02:22:17 +04:00
}
/* Get more detailed stats if requested by verbose level */
if ( verbose > = VERBOSE_2 ) {
2014-08-14 02:53:24 +04:00
file_hits_histogram ( params ) ;
2011-11-12 02:22:17 +04:00
}
}
2017-09-27 19:45:47 +03:00
if ( operation_flag = = READCHECK ) {
2017-10-25 16:57:50 +03:00
FillBuffer ( ioBuffers . readCheckBuffer , params , 0 , ( rank + rankOffset ) % params - > numTasks ) ;
2017-09-27 19:45:47 +03:00
}
2011-11-12 02:22:17 +04:00
/* Using globally passed rankOffset, following function generates testFileName to read */
2011-12-13 09:00:18 +04:00
GetTestFileName ( testFileName , params ) ;
2011-11-12 02:22:17 +04:00
if ( verbose > = VERBOSE_3 ) {
fprintf ( stdout , " task %d reading %s \n " , rank ,
testFileName ) ;
}
2011-12-13 09:00:18 +04:00
DelaySecs ( params - > interTestDelay ) ;
2011-11-12 02:22:17 +04:00
MPI_CHECK ( MPI_Barrier ( testComm ) , " barrier error " ) ;
2011-12-13 09:00:18 +04:00
params - > open = READ ;
2011-11-12 02:22:17 +04:00
timer [ 6 ] [ rep ] = GetTimeStamp ( ) ;
2011-12-13 09:00:18 +04:00
fd = backend - > open ( testFileName , params ) ;
2011-11-12 02:22:17 +04:00
timer [ 7 ] [ rep ] = GetTimeStamp ( ) ;
2011-12-13 09:00:18 +04:00
if ( params - > intraTestBarriers )
2011-11-12 02:22:17 +04:00
MPI_CHECK ( MPI_Barrier ( testComm ) ,
" barrier error " ) ;
if ( rank = = 0 & & verbose > = VERBOSE_1 ) {
2011-12-13 13:07:55 +04:00
fprintf ( stderr ,
" Commencing read performance test: %s " ,
2014-08-14 02:53:24 +04:00
CurrentTimeString ( ) ) ;
2011-11-12 02:22:17 +04:00
}
timer [ 8 ] [ rep ] = GetTimeStamp ( ) ;
2017-10-20 19:02:24 +03:00
dataMoved = WriteOrRead ( params , results , fd , operation_flag , & ioBuffers ) ;
2011-11-12 02:22:17 +04:00
timer [ 9 ] [ rep ] = GetTimeStamp ( ) ;
2011-12-13 09:00:18 +04:00
if ( params - > intraTestBarriers )
2011-11-12 02:22:17 +04:00
MPI_CHECK ( MPI_Barrier ( testComm ) ,
" barrier error " ) ;
timer [ 10 ] [ rep ] = GetTimeStamp ( ) ;
2011-12-13 09:00:18 +04:00
backend - > close ( fd , params ) ;
2011-11-12 02:22:17 +04:00
timer [ 11 ] [ rep ] = GetTimeStamp ( ) ;
/* get the size of the file just read */
2011-12-13 09:00:18 +04:00
results - > aggFileSizeFromStat [ rep ] =
2014-08-14 02:53:24 +04:00
backend - > get_file_size ( params , testComm ,
testFileName ) ;
2011-11-12 02:22:17 +04:00
/* check if stat() of file doesn't equal expected file size,
use actual amount of byte moved */
CheckFileSize ( test , dataMoved , rep ) ;
if ( verbose > = VERBOSE_3 )
2011-12-13 09:00:18 +04:00
WriteTimes ( params , timer , rep , READ ) ;
2011-11-12 02:22:17 +04:00
ReduceIterResults ( test , timer , rep , READ ) ;
2011-12-13 09:00:18 +04:00
if ( params - > outlierThreshold ) {
CheckForOutliers ( params , timer , rep , READ ) ;
2011-11-12 02:22:17 +04:00
}
}
2011-12-13 09:00:18 +04:00
if ( ! params - > keepFile
2012-01-09 06:55:46 +04:00
& & ! ( params - > errorFound & & params - > keepFileWithError ) ) {
double start , finish ;
start = GetTimeStamp ( ) ;
MPI_CHECK ( MPI_Barrier ( testComm ) , " barrier error " ) ;
2011-12-13 09:00:18 +04:00
RemoveFile ( testFileName , params - > filePerProc , params ) ;
2012-01-09 06:55:46 +04:00
MPI_CHECK ( MPI_Barrier ( testComm ) , " barrier error " ) ;
finish = GetTimeStamp ( ) ;
PrintRemoveTiming ( start , finish , rep ) ;
} else {
MPI_CHECK ( MPI_Barrier ( testComm ) , " barrier error " ) ;
2011-11-12 02:22:17 +04:00
}
2011-12-13 09:00:18 +04:00
params - > errorFound = FALSE ;
2011-11-12 02:22:17 +04:00
rankOffset = 0 ;
}
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
MPI_CHECK ( MPI_Comm_free ( & testComm ) , " MPI_Comm_free() error " ) ;
2011-12-13 09:00:18 +04:00
2012-01-14 01:27:55 +04:00
if ( params - > summary_every_test ) {
PrintLongSummaryHeader ( ) ;
PrintLongSummaryOneTest ( test ) ;
} else {
PrintShortSummary ( test ) ;
}
2011-12-13 09:00:18 +04:00
2015-05-27 19:24:52 +03:00
XferBuffersFree ( & ioBuffers , params ) ;
2014-08-14 02:53:24 +04:00
if ( hog_buf ! = NULL )
free ( hog_buf ) ;
2011-11-12 02:22:17 +04:00
for ( i = 0 ; i < 12 ; i + + ) {
free ( timer [ i ] ) ;
2011-06-17 23:20:43 +04:00
}
2012-01-07 05:29:45 +04:00
2011-11-12 02:22:17 +04:00
/* Sync with the tasks that did not participate in this test */
MPI_CHECK ( MPI_Barrier ( MPI_COMM_WORLD ) , " barrier error " ) ;
2012-01-07 05:29:45 +04:00
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
/*
* Determine if valid tests from parameters .
*/
2015-05-19 18:36:28 +03:00
static void ValidateTests ( IOR_param_t * test )
2011-06-17 23:20:43 +04:00
{
2011-11-12 02:22:17 +04:00
IOR_param_t defaults ;
init_IOR_Param_t ( & defaults ) ;
2015-05-19 18:36:28 +03:00
2011-11-12 02:22:17 +04:00
/* get the version of the tests */
Algorithms 'S3', 'S3_plus', and 'S3_EMC' all available.
These are variants on S3. S3 uses the "pure" S3 interface, e.g. using
Multi-Part-Upload. The "plus" variant enables EMC-extensions in the aws4c
library. This allows the N:N case to use "append", in the case where
"transfer_size" != "block_size" for IOR. In pure S3, the N:N case will
fail, because the EMC-extensions won't be enabled, and appending (which
attempts to use the EMC byte-range tricks to do this) will throw an error.
In the S3_EMC alg, N:1 uses EMCs other byte-range tricks to write different
parts of an N:1 file, and also uses append to write the parts of an N:N
file. Preliminary tests show these EMC extensions look to improve BW by
~20%.
I put all three algs in aiori-S3.c, because it seemed some code was getting
reused. Not sure if that's still going to make sense after the TBD, below.
TBD: Recently realized that the "pure' S3 shouldn't be trying to use
appends for anything. In the N:N case, it should just use MPU, within each
file. Then, there's no need for S3_plus. We just have S3, which does MPU
for all writes where transfer_size != block_size, and uses (standard)
byte-range reads for reading. Then S3_EMC uses "appends for N:N writes,
and byte-range writes for N:1 writes. This separates the code for the two
algs a little more, but we might still want them in the same file.
2014-10-30 01:04:30 +03:00
AioriBind ( test - > api , test ) ;
2011-11-12 02:22:17 +04:00
backend - > set_version ( test ) ;
if ( test - > repetitions < = 0 )
WARN_RESET ( " too few test repetitions " ,
test , & defaults , repetitions ) ;
if ( test - > numTasks < = 0 )
ERR ( " too few tasks for testing " ) ;
if ( test - > interTestDelay < 0 )
WARN_RESET ( " inter-test delay must be nonnegative value " ,
test , & defaults , interTestDelay ) ;
if ( test - > readFile ! = TRUE & & test - > writeFile ! = TRUE
& & test - > checkRead ! = TRUE & & test - > checkWrite ! = TRUE )
ERR ( " test must write, read, or check file " ) ;
if ( ( test - > deadlineForStonewalling > 0 )
& & ( test - > checkWrite = = TRUE | | test - > checkRead = = TRUE ) )
ERR ( " can not perform write or read check with stonewalling " ) ;
if ( test - > segmentCount < 0 )
ERR ( " segment count must be positive value " ) ;
if ( ( test - > blockSize % sizeof ( IOR_size_t ) ) ! = 0 )
ERR ( " block size must be a multiple of access size " ) ;
if ( test - > blockSize < 0 )
ERR ( " block size must be non-negative integer " ) ;
if ( ( test - > transferSize % sizeof ( IOR_size_t ) ) ! = 0 )
ERR ( " transfer size must be a multiple of access size " ) ;
if ( test - > setAlignment < 0 )
ERR ( " alignment must be non-negative integer " ) ;
if ( test - > transferSize < 0 )
ERR ( " transfer size must be non-negative integer " ) ;
if ( test - > transferSize = = 0 ) {
ERR ( " test will not complete with zero transfer size " ) ;
2011-06-17 23:20:43 +04:00
} else {
2011-11-12 02:22:17 +04:00
if ( ( test - > blockSize % test - > transferSize ) ! = 0 )
ERR ( " block size must be a multiple of transfer size " ) ;
}
if ( test - > blockSize < test - > transferSize )
ERR ( " block size must not be smaller than transfer size " ) ;
2015-05-19 18:36:28 +03:00
/* specific APIs */
2011-11-12 02:22:17 +04:00
if ( ( strcmp ( test - > api , " MPIIO " ) = = 0 )
& & ( test - > blockSize < sizeof ( IOR_size_t )
| | test - > transferSize < sizeof ( IOR_size_t ) ) )
ERR ( " block/transfer size may not be smaller than IOR_size_t for MPIIO " ) ;
if ( ( strcmp ( test - > api , " HDF5 " ) = = 0 )
& & ( test - > blockSize < sizeof ( IOR_size_t )
| | test - > transferSize < sizeof ( IOR_size_t ) ) )
ERR ( " block/transfer size may not be smaller than IOR_size_t for HDF5 " ) ;
if ( ( strcmp ( test - > api , " NCMPI " ) = = 0 )
& & ( test - > blockSize < sizeof ( IOR_size_t )
| | test - > transferSize < sizeof ( IOR_size_t ) ) )
ERR ( " block/transfer size may not be smaller than IOR_size_t for NCMPI " ) ;
if ( ( test - > useFileView = = TRUE )
& & ( sizeof ( MPI_Aint ) < 8 ) /* used for 64-bit datatypes */
& & ( ( test - > numTasks * test - > blockSize ) >
( 2 * ( IOR_offset_t ) GIBIBYTE ) ) )
ERR ( " segment size must be < 2GiB " ) ;
if ( ( strcmp ( test - > api , " POSIX " ) ! = 0 ) & & test - > singleXferAttempt )
WARN_RESET ( " retry only available in POSIX " ,
test , & defaults , singleXferAttempt ) ;
if ( ( strcmp ( test - > api , " POSIX " ) ! = 0 ) & & test - > fsync )
WARN_RESET ( " fsync() only available in POSIX " ,
test , & defaults , fsync ) ;
if ( ( strcmp ( test - > api , " MPIIO " ) ! = 0 ) & & test - > preallocate )
WARN_RESET ( " preallocation only available in MPIIO " ,
test , & defaults , preallocate ) ;
if ( ( strcmp ( test - > api , " MPIIO " ) ! = 0 ) & & test - > useFileView )
WARN_RESET ( " file view only available in MPIIO " ,
test , & defaults , useFileView ) ;
if ( ( strcmp ( test - > api , " MPIIO " ) ! = 0 ) & & test - > useSharedFilePointer )
WARN_RESET ( " shared file pointer only available in MPIIO " ,
test , & defaults , useSharedFilePointer ) ;
if ( ( strcmp ( test - > api , " MPIIO " ) = = 0 ) & & test - > useSharedFilePointer )
WARN_RESET ( " shared file pointer not implemented " ,
test , & defaults , useSharedFilePointer ) ;
if ( ( strcmp ( test - > api , " MPIIO " ) ! = 0 ) & & test - > useStridedDatatype )
WARN_RESET ( " strided datatype only available in MPIIO " ,
test , & defaults , useStridedDatatype ) ;
if ( ( strcmp ( test - > api , " MPIIO " ) = = 0 ) & & test - > useStridedDatatype )
WARN_RESET ( " strided datatype not implemented " ,
test , & defaults , useStridedDatatype ) ;
if ( ( strcmp ( test - > api , " MPIIO " ) = = 0 )
& & test - > useStridedDatatype & & ( test - > blockSize < sizeof ( IOR_size_t )
| | test - > transferSize <
sizeof ( IOR_size_t ) ) )
ERR ( " need larger file size for strided datatype in MPIIO " ) ;
if ( ( strcmp ( test - > api , " POSIX " ) = = 0 ) & & test - > showHints )
WARN_RESET ( " hints not available in POSIX " ,
test , & defaults , showHints ) ;
if ( ( strcmp ( test - > api , " POSIX " ) = = 0 ) & & test - > collective )
WARN_RESET ( " collective not available in POSIX " ,
test , & defaults , collective ) ;
2015-05-19 18:36:28 +03:00
/* parameter consitency */
2011-11-12 02:22:17 +04:00
if ( test - > reorderTasks = = TRUE & & test - > reorderTasksRandom = = TRUE )
ERR ( " Both Constant and Random task re-ordering specified. Choose one and resubmit " ) ;
if ( test - > randomOffset & & test - > reorderTasksRandom
& & test - > filePerProc = = FALSE )
ERR ( " random offset and random reorder tasks specified with single-shared-file. Choose one and resubmit " ) ;
if ( test - > randomOffset & & test - > reorderTasks
& & test - > filePerProc = = FALSE )
ERR ( " random offset and constant reorder tasks specified with single-shared-file. Choose one and resubmit " ) ;
if ( test - > randomOffset & & test - > checkRead )
ERR ( " random offset not available with read check option (use write check) " ) ;
if ( test - > randomOffset & & test - > storeFileOffset )
ERR ( " random offset not available with store file offset option) " ) ;
2015-05-19 18:36:28 +03:00
2011-11-12 02:22:17 +04:00
if ( ( strcmp ( test - > api , " MPIIO " ) = = 0 ) & & test - > randomOffset
& & test - > collective )
ERR ( " random offset not available with collective MPIIO " ) ;
if ( ( strcmp ( test - > api , " MPIIO " ) = = 0 ) & & test - > randomOffset
& & test - > useFileView )
ERR ( " random offset not available with MPIIO fileviews " ) ;
if ( ( strcmp ( test - > api , " HDF5 " ) = = 0 ) & & test - > randomOffset )
ERR ( " random offset not available with HDF5 " ) ;
if ( ( strcmp ( test - > api , " NCMPI " ) = = 0 ) & & test - > randomOffset )
ERR ( " random offset not available with NCMPI " ) ;
if ( ( strcmp ( test - > api , " HDF5 " ) ! = 0 ) & & test - > individualDataSets )
WARN_RESET ( " individual datasets only available in HDF5 " ,
test , & defaults , individualDataSets ) ;
if ( ( strcmp ( test - > api , " HDF5 " ) = = 0 ) & & test - > individualDataSets )
WARN_RESET ( " individual data sets not implemented " ,
test , & defaults , individualDataSets ) ;
if ( ( strcmp ( test - > api , " NCMPI " ) = = 0 ) & & test - > filePerProc )
ERR ( " file-per-proc not available in current NCMPI " ) ;
if ( test - > noFill ) {
if ( strcmp ( test - > api , " HDF5 " ) ! = 0 ) {
ERR ( " 'no fill' option only available in HDF5 " ) ;
} else {
/* check if hdf5 available */
# if defined (H5_VERS_MAJOR) && defined (H5_VERS_MINOR)
/* no-fill option not available until hdf5-1.6.x */
# if (H5_VERS_MAJOR > 0 && H5_VERS_MINOR > 5)
;
# else
char errorString [ MAX_STR ] ;
sprintf ( errorString ,
" 'no fill' option not available in %s " ,
test - > apiVersion ) ;
ERR ( errorString ) ;
# endif
# else
WARN ( " unable to determine HDF5 version for 'no fill' usage " ) ;
# endif
}
}
if ( test - > useExistingTestFile & & test - > lustre_set_striping )
ERR ( " Lustre stripe options are incompatible with useExistingTestFile " ) ;
2015-05-19 18:36:28 +03:00
/* N:1 and N:N */
IOR_offset_t NtoN = test - > filePerProc ;
IOR_offset_t Nto1 = ! NtoN ;
IOR_offset_t s = test - > segmentCount ;
IOR_offset_t t = test - > transferSize ;
IOR_offset_t b = test - > blockSize ;
if ( Nto1 & & ( s ! = 1 ) & & ( b ! = t ) ) {
ERR ( " N:1 (strided) requires xfer-size == block-size " ) ;
}
2011-11-12 03:11:28 +04:00
}
2011-11-12 02:22:17 +04:00
2011-11-12 04:40:45 +04:00
/*
 * Build the list of file offsets that one task accesses when walking the
 * file in order.
 *
 * The list holds (blockSize / transferSize) * segmentCount entries followed
 * by a terminating -1.  Within a segment the entries advance by
 * transferSize.  With filePerProc set, segment i starts at i * blockSize;
 * otherwise it starts at (i * numTasks + pretendRank) * blockSize.
 *
 * test:        parameters supplying blockSize, transferSize, segmentCount,
 *              numTasks and filePerProc.
 * pretendRank: rank whose slot in a shared file is addressed.
 *
 * Returns a malloc()ed array; the caller releases it with free().
 */
static IOR_offset_t *GetOffsetArraySequential(IOR_param_t *test,
                                              int pretendRank)
{
        const IOR_offset_t xfersPerBlock =
            test->blockSize / test->transferSize;
        const IOR_offset_t total = xfersPerBlock * test->segmentCount;
        IOR_offset_t *offsetArray;
        IOR_offset_t *slot;
        IOR_offset_t seg;
        IOR_offset_t xfer;

        /* one extra element for the terminator */
        offsetArray =
            (IOR_offset_t *)malloc((total + 1) * sizeof(IOR_offset_t));
        if (offsetArray == NULL)
                ERR("malloc() failed");
        offsetArray[total] = -1;

        slot = offsetArray;
        for (seg = 0; seg < test->segmentCount; seg++) {
                IOR_offset_t segmentStart;

                /* where this task's block of the segment begins */
                if (test->filePerProc) {
                        segmentStart = seg * test->blockSize;
                } else {
                        segmentStart =
                            (seg * test->numTasks * test->blockSize)
                            + (pretendRank * test->blockSize);
                }

                for (xfer = 0; xfer < xfersPerBlock; xfer++) {
                        *slot = xfer * test->transferSize + segmentStart;
                        slot++;
                }
        }

        return offsetArray;
}
2011-11-12 02:22:17 +04:00
2011-11-12 04:40:45 +04:00
/*
 * Build a shuffled list of file offsets for one task, terminated by -1.
 *
 * For WRITE and READ a new seed is drawn from random() and stored in
 * test->randomSeed; for any other access value the stored seed is reused,
 * so the same offsets are regenerated.
 *
 * With filePerProc set, the list is every multiple of transferSize below
 * blockSize * segmentCount.  For a shared file the size is additionally
 * multiplied by numTasks and a transfer belongs to this task when
 * random() % numTasks equals pretendRank; the same seed is replayed for the
 * counting pass and the filling pass so both select the same transfers.
 *
 * The order and number of random()/srandom() calls determine the result
 * and must not be changed.
 *
 * Returns a malloc()ed array; the caller releases it with free().
 */
static IOR_offset_t *GetOffsetArrayRandom(IOR_param_t *test, int pretendRank,
                                          int access)
{
        const int sharedFile = (test->filePerProc == FALSE);
        int seed;
        IOR_offset_t pos;
        IOR_offset_t idx;
        IOR_offset_t count = 0;
        IOR_offset_t filled = 0;
        IOR_offset_t fileSize;
        IOR_offset_t *offsetArray;

        /* set up seed for random() */
        if (access == WRITE || access == READ) {
                seed = random();
                test->randomSeed = seed;
        } else {
                seed = test->randomSeed;
        }
        srandom(seed);

        fileSize = test->blockSize * test->segmentCount;
        if (sharedFile) {
                fileSize *= test->numTasks;
        }

        /* pass 1: count the transfers that fall to this task */
        for (pos = 0; pos < fileSize; pos += test->transferSize) {
                /* random() is consumed only in shared-file mode */
                if (!sharedFile
                    || (random() % test->numTasks) == pretendRank) {
                        count++;
                }
        }

        /* one extra element for the terminator */
        offsetArray =
            (IOR_offset_t *)malloc((count + 1) * sizeof(IOR_offset_t));
        if (offsetArray == NULL)
                ERR("malloc() failed");
        offsetArray[count] = -1;

        if (sharedFile) {
                /* pass 2: replay the same sequence to pick the offsets */
                srandom(seed);
                for (pos = 0; pos < fileSize; pos += test->transferSize) {
                        if ((random() % test->numTasks) == pretendRank) {
                                offsetArray[filled] = pos;
                                filled++;
                        }
                }
        } else {
                for (idx = 0; idx < count; idx++) {
                        offsetArray[idx] = idx * test->transferSize;
                }
        }

        /* reorder: swap every entry with a randomly chosen one */
        for (idx = 0; idx < count; idx++) {
                IOR_offset_t pick = random() % count;
                IOR_offset_t held = offsetArray[pick];

                offsetArray[pick] = offsetArray[idx];
                offsetArray[idx] = held;
        }

        SeedRandGen(test->testComm);    /* synchronize seeds across tasks */

        return offsetArray;
}
2011-06-17 23:20:43 +04:00
2017-10-20 19:02:24 +03:00
/*
 * Perform one transfer of test->transferSize bytes at offsetArray[pairCnt].
 *
 * pairCnt:       index into offsetArray of the offset to access.
 * offsetArray:   offsets produced by one of the GetOffsetArray* functions.
 * pretendRank:   rank value handed to FillBuffer() when a pattern is built.
 * transferCount: running count passed to CompareBuffers(); incremented
 *                here for WRITECHECK only.
 * errors:        accumulates the values returned by CompareBuffers().
 * test:          parameters; test->offset is set to the chosen offset.
 * fd:            handle forwarded unchanged to backend->xfer().
 * ioBuffers:     supplies buffer, checkBuffer and readCheckBuffer.
 * access:        WRITE, READ, WRITECHECK or READCHECK.
 *
 * Returns the amount reported by backend->xfer(), or 0 when access is none
 * of the four modes (previously an uninitialised value was returned).
 */
static IOR_offset_t WriteOrReadSingle(IOR_offset_t pairCnt, IOR_offset_t *offsetArray, int pretendRank,
  IOR_offset_t *transferCount, int *errors, IOR_param_t *test, int *fd, IOR_io_buffers *ioBuffers, int access)
{
        IOR_offset_t amtXferred = 0;
        IOR_offset_t transfer;

        void *buffer = ioBuffers->buffer;
        void *checkBuffer = ioBuffers->checkBuffer;
        void *readCheckBuffer = ioBuffers->readCheckBuffer;

        test->offset = offsetArray[pairCnt];
        transfer = test->transferSize;

        if (access == WRITE) {
                /* fills each transfer with a unique pattern
                 * containing the offset into the file */
                if (test->storeFileOffset == TRUE) {
                        FillBuffer(buffer, test, test->offset, pretendRank);
                }
                amtXferred =
                    backend->xfer(access, fd, buffer, transfer, test);
                if (amtXferred != transfer)
                        ERR("cannot write to file");
        } else if (access == READ) {
                amtXferred =
                    backend->xfer(access, fd, buffer, transfer, test);
                if (amtXferred != transfer)
                        ERR("cannot read from file");
        } else if (access == WRITECHECK) {
                /* start from a known byte value before the data is read
                 * back into checkBuffer */
                memset(checkBuffer, 'a', transfer);

                /* rebuild the expected pattern for this offset */
                if (test->storeFileOffset == TRUE) {
                        FillBuffer(readCheckBuffer, test, test->offset, pretendRank);
                }
                amtXferred = backend->xfer(access, fd, checkBuffer, transfer, test);
                if (amtXferred != transfer)
                        ERR("cannot read from file write check");
                (*transferCount)++;
                *errors += CompareBuffers(readCheckBuffer, checkBuffer, transfer,
                                          *transferCount, test,
                                          WRITECHECK);
        } else if (access == READCHECK) {
                amtXferred = backend->xfer(access, fd, buffer, transfer, test);
                if (amtXferred != transfer) {
                        ERR("cannot read from file");
                }
                /* rebuild the expected pattern for this offset */
                if (test->storeFileOffset == TRUE) {
                        FillBuffer(readCheckBuffer, test, test->offset, pretendRank);
                }
                /* NOTE(review): unlike WRITECHECK, *transferCount is not
                 * advanced on this path -- confirm that is intended */
                *errors += CompareBuffers(readCheckBuffer, buffer, transfer, *transferCount, test, READCHECK);
        }
        return amtXferred;
}
2011-06-17 23:20:43 +04:00
/*
* Write or Read data to file ( s ) . This loops through the strides , writing
* out the data to each block in transfer sizes , until the remainder left is 0.
*/
2017-10-20 19:02:24 +03:00
static IOR_offset_t WriteOrRead ( IOR_param_t * test , IOR_results_t * results , void * fd , int access , IOR_io_buffers * ioBuffers )
2011-06-17 23:20:43 +04:00
{
2011-11-12 02:22:17 +04:00
int errors = 0 ;
2012-01-13 08:34:40 +04:00
IOR_offset_t amtXferred ;
IOR_offset_t transferCount = 0 ;
IOR_offset_t pairCnt = 0 ;
IOR_offset_t * offsetArray ;
2011-11-12 02:22:17 +04:00
int pretendRank ;
IOR_offset_t dataMoved = 0 ; /* for data rate calculation */
double startForStonewall ;
int hitStonewall ;
/* initialize values */
pretendRank = ( rank + rankOffset ) % test - > numTasks ;
if ( test - > randomOffset ) {
offsetArray = GetOffsetArrayRandom ( test , pretendRank , access ) ;
} else {
offsetArray = GetOffsetArraySequential ( test , pretendRank ) ;
2011-06-17 23:20:43 +04:00
}
2011-11-12 02:22:17 +04:00
/* check for stonewall */
startForStonewall = GetTimeStamp ( ) ;
2011-06-17 23:20:43 +04:00
hitStonewall = ( ( test - > deadlineForStonewalling ! = 0 )
& & ( ( GetTimeStamp ( ) - startForStonewall )
> test - > deadlineForStonewalling ) ) ;
2011-11-12 02:22:17 +04:00
/* loop over offsets to access */
2017-10-20 19:13:29 +03:00
while ( ( offsetArray [ pairCnt ] ! = - 1 ) & & ! hitStonewall ) {
2017-10-20 19:02:24 +03:00
dataMoved + = WriteOrReadSingle ( pairCnt , offsetArray , pretendRank , & transferCount , & errors , test , fd , ioBuffers , access ) ;
2011-11-12 02:22:17 +04:00
pairCnt + + ;
hitStonewall = ( ( test - > deadlineForStonewalling ! = 0 )
& & ( ( GetTimeStamp ( ) - startForStonewall )
2017-10-20 19:13:29 +03:00
> test - > deadlineForStonewalling ) ) | | ( test - > stoneWallingWearOutIterations ! = 0 & & pairCnt = = test - > stoneWallingWearOutIterations ) ;
2011-11-12 02:22:17 +04:00
}
2017-10-20 19:02:24 +03:00
if ( test - > stoneWallingWearOut ) {
MPI_CHECK ( MPI_Allreduce ( & pairCnt , & results - > pairs_accessed ,
1 , MPI_LONG_LONG_INT , MPI_MAX , testComm ) , " cannot reduce pairs moved " ) ;
if ( verbose > = VERBOSE_1 ) {
printf ( " %d: stonewalling pairs accessed globally: %lld this rank: %lld \n " , rank , ( long long ) results - > pairs_accessed , ( long long ) pairCnt ) ;
}
if ( pairCnt ! = results - > pairs_accessed ) {
// some work needs still to be done !
for ( ; pairCnt < results - > pairs_accessed ; pairCnt + + ) {
dataMoved + = WriteOrReadSingle ( pairCnt , offsetArray , pretendRank , & transferCount , & errors , test , fd , ioBuffers , access ) ;
}
}
} else {
results - > pairs_accessed = pairCnt ;
2011-11-12 02:22:17 +04:00
}
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
totalErrorCount + = CountErrors ( test , access , errors ) ;
2011-06-17 23:20:43 +04:00
2012-01-13 08:34:40 +04:00
free ( offsetArray ) ;
2011-06-17 23:20:43 +04:00
2011-11-12 02:22:17 +04:00
if ( access = = WRITE & & test - > fsync = = TRUE ) {
backend - > fsync ( fd , test ) ; /*fsync after all accesses */
}
return ( dataMoved ) ;
2011-11-12 03:11:28 +04:00
}
2011-06-17 23:20:43 +04:00
/*
* Write times taken during each iteration of the test .
*/
2011-11-12 04:40:45 +04:00
static void
2011-11-12 02:22:17 +04:00
WriteTimes ( IOR_param_t * test , double * * timer , int iteration , int writeOrRead )
2011-06-17 23:20:43 +04:00
{
2011-11-12 03:11:28 +04:00
char accessType [ MAX_STR ] ;
char timerName [ MAX_STR ] ;
2011-11-12 02:22:17 +04:00
int i , start , stop ;
if ( writeOrRead = = WRITE ) {
start = 0 ;
stop = 6 ;
strcpy ( accessType , " WRITE " ) ;
} else if ( writeOrRead = = READ ) {
start = 6 ;
stop = 12 ;
strcpy ( accessType , " READ " ) ;
} else {
ERR ( " incorrect WRITE/READ option " ) ;
}
2011-11-10 04:13:44 +04:00
2011-11-12 02:22:17 +04:00
for ( i = start ; i < stop ; i + + ) {
switch ( i ) {
case 0 :
strcpy ( timerName , " write open start " ) ;
break ;
case 1 :
strcpy ( timerName , " write open stop " ) ;
break ;
case 2 :
strcpy ( timerName , " write start " ) ;
break ;
case 3 :
strcpy ( timerName , " write stop " ) ;
break ;
case 4 :
strcpy ( timerName , " write close start " ) ;
break ;
case 5 :
strcpy ( timerName , " write close stop " ) ;
break ;
case 6 :
strcpy ( timerName , " read open start " ) ;
break ;
case 7 :
strcpy ( timerName , " read open stop " ) ;
break ;
case 8 :
strcpy ( timerName , " read start " ) ;
break ;
case 9 :
strcpy ( timerName , " read stop " ) ;
break ;
case 10 :
strcpy ( timerName , " read close start " ) ;
break ;
case 11 :
strcpy ( timerName , " read close stop " ) ;
break ;
default :
strcpy ( timerName , " invalid timer " ) ;
break ;
}
fprintf ( stdout , " Test %d: Iter=%d, Task=%d, Time=%f, %s \n " ,
test - > id , iteration , ( int ) rank , timer [ i ] [ iteration ] ,
timerName ) ;
}
2011-11-12 03:11:28 +04:00
}